1// Copyright 2007, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
// This file is intended to be included in another C++ file where the character
// types are defined. This allows us to write mostly generic code, but not have
// template bloat because everything is inlined when anybody calls any of our
// functions.
34
35#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
36#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
37
38#include <stdlib.h>
39
40#include "base/logging.h"
41#include "googleurl/src/url_canon.h"
42
43namespace url_canon {
44
45// Character type handling -----------------------------------------------------
46
// Flag bits describing the character classes an 8-bit character can belong
// to. Each entry of kSharedCharTypeTable is a combination of these bits.
enum SharedCharTypes {
  // The character may appear unescaped in a query. Characters lacking this
  // bit will be escaped, see url_canon_query.cc
  CHAR_QUERY = 1 << 0,

  // The character is valid in the username/password field.
  CHAR_USERINFO = 1 << 1,

  // The character is valid in an IPv4 address (digits plus dot and 'x' for
  // hex).
  CHAR_IPV4 = 1 << 2,

  // The character is an ASCII hex digit (as in %-escaped sequences).
  CHAR_HEX = 1 << 3,

  // The character is an ASCII decimal digit.
  CHAR_DEC = 1 << 4,

  // The character is an ASCII octal digit.
  CHAR_OCT = 1 << 5,
};
69
// Holds the SharedCharTypes flags for every 8-bit character, indexed by the
// character's value. Canonicalization functions with simple requirements
// share this one table (others keep their own specialized tables) so that
// fewer lookup tables have to be pulled into the CPU cache.
//
// The entries are unsigned chars because that has a small but measurable
// performance benefit over using a 32-bit number.
extern const unsigned char kSharedCharTypeTable[256];
78
79// More readable wrappers around the character type lookup table.
80inline bool IsCharOfType(unsigned char c, SharedCharTypes type) {
81  return !!(kSharedCharTypeTable[c] & type);
82}
83inline bool IsQueryChar(unsigned char c) {
84  return IsCharOfType(c, CHAR_QUERY);
85}
86inline bool IsIPv4Char(unsigned char c) {
87  return IsCharOfType(c, CHAR_IPV4);
88}
89inline bool IsHexChar(unsigned char c) {
90  return IsCharOfType(c, CHAR_HEX);
91}
92
93// Appends the given string to the output, escaping characters that do not
94// match the given |type| in SharedCharTypes.
95void AppendStringOfType(const char* source, int length,
96                        SharedCharTypes type,
97                        CanonOutput* output);
98void AppendStringOfType(const char16* source, int length,
99                        SharedCharTypes type,
100                        CanonOutput* output);
101
// Indexed by a numerical value in the range 0x0 to 0xf; each entry is the
// ASCII digit that will be used to represent that value.
extern const char kHexCharLookup[16];
105
// Supports fast conversion from an ASCII hex character to its numerical
// value. The 8-bit range is split into 8 regions of 0x20 characters each, and
// the three kinds of hex character (numbers, uppercase letters, lowercase
// letters) each land in a different region. For every region the table stores
// the amount to subtract from a character in that region to obtain the
// corresponding numerical value.
//
// See HexCharToValue for the lookup.
extern const char kCharToHexLookup[0x100 / 0x20];
115
116// Assumes the input is a valid hex digit! Call IsHexChar before using this.
117inline unsigned char HexCharToValue(unsigned char c) {
118  return c - kCharToHexLookup[c / 0x20];
119}
120
// Tests whether a dot, or something equivalent to a dot, starts at
// |spec[offset]|. The result is the number of characters the dot occupies:
// 1 for a literal '.', 3 for the escaped forms "%2e" / "%2E", and 0 when no
// dot is present. |end| bounds how far the escaped form may extend.
template<typename CHAR>
inline int IsDot(const CHAR* spec, int offset, int end) {
  const CHAR current = spec[offset];
  if (current == '.')
    return 1;

  // Anything else can only be a dot if a full three-character escape fits.
  if (current != '%' || offset + 3 > end)
    return 0;

  if (spec[offset + 1] != '2')
    return 0;

  const CHAR low_digit = spec[offset + 2];
  const bool is_escaped_dot = (low_digit == 'e' || low_digit == 'E');
  return is_escaped_dot ? 3 : 0;  // 3 == length of "%2e"
}
136
// Returns the canonicalized version of the input character according to scheme
// rules. This is implemented alongside the scheme canonicalizer, and is
// required for relative URL resolving to test for scheme equality.
//
// The input is a char16 while the result is a plain char.
// NOTE(review): presumably 8-bit callers rely on implicit widening of their
// characters to char16 -- confirm against the callers.
//
// Returns 0 if the input character is not a valid scheme character.
char CanonicalSchemeChar(char16 ch);
143
144// Write a single character, escaped, to the output. This always escapes: it
145// does no checking that thee character requires escaping.
146// Escaping makes sense only 8 bit chars, so code works in all cases of
147// input parameters (8/16bit).
148template<typename UINCHAR, typename OUTCHAR>
149inline void AppendEscapedChar(UINCHAR ch,
150                              CanonOutputT<OUTCHAR>* output) {
151  output->push_back('%');
152  output->push_back(kHexCharLookup[ch >> 4]);
153  output->push_back(kHexCharLookup[ch & 0xf]);
154}
155
// The character we'll substitute for undecodable or invalid characters. The
// ReadUTFChar functions declared in this file are documented to store this
// value as the decoded code point when they are given invalid input.
extern const char16 kUnicodeReplacementCharacter;
158
159// UTF-8 functions ------------------------------------------------------------
160
// Decodes the UTF-8 character that starts at index |*begin| of |str| and
// stores the result in |*code_point_out|. Returns true when the character is
// valid. For an invalid character the function returns false and stores
// kUnicodeReplacementCharacter in |*code_point_out| instead.
//
// On return |*begin| refers to the last character consumed, so a loop that
// increments it afterwards lands on the start of the next character. (A
// single-byte ASCII character therefore leaves |*begin| unchanged.)
//
// Implementation is in url_canon_icu.cc.
bool ReadUTFChar(const char* str,
                 int* begin,
                 int length,
                 unsigned* code_point_out);
173
174// Generic To-UTF-8 converter. This will call the given append method for each
175// character that should be appended, with the given output method. Wrappers
176// are provided below for escaped and non-escaped versions of this.
177//
178// The char_value must have already been checked that it's a valid Unicode
179// character.
180template<class Output, void Appender(unsigned char, Output*)>
181inline void DoAppendUTF8(unsigned char_value, Output* output) {
182  if (char_value <= 0x7f) {
183    Appender(static_cast<unsigned char>(char_value), output);
184  } else if (char_value <= 0x7ff) {
185    // 110xxxxx 10xxxxxx
186    Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)),
187             output);
188    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
189             output);
190  } else if (char_value <= 0xffff) {
191    // 1110xxxx 10xxxxxx 10xxxxxx
192    Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)),
193             output);
194    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
195             output);
196    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
197             output);
198  } else if (char_value <= 0x10FFFF) {  // Max unicode code point.
199    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
200    Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
201             output);
202    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)),
203             output);
204    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
205             output);
206    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
207             output);
208  } else {
209    // Invalid UTF-8 character (>20 bits).
210    NOTREACHED();
211  }
212}
213
214// Helper used by AppendUTF8Value below. We use an unsigned parameter so there
215// are no funny sign problems with the input, but then have to convert it to
216// a regular char for appending.
217inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) {
218  output->push_back(static_cast<char>(ch));
219}
220
// Writes the given character to the output as UTF-8. This does NO checking
// of the validity of the unicode characters; the caller should ensure that
// the value it is appending is valid to append.
inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) {
  // Instantiate the generic encoder with the plain (non-escaping) byte writer.
  DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output);
}
227
// Writes the given character to the output as UTF-8, escaping ALL
// characters (even when they are ASCII). This does NO checking of the
// validity of the unicode characters; the caller should ensure that the value
// it is appending is valid to append.
inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) {
  // Instantiate the generic encoder with the escaping byte writer, so every
  // UTF-8 byte is emitted through AppendEscapedChar.
  DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output);
}
235
236// UTF-16 functions -----------------------------------------------------------
237
238// Reads one character in UTF-16 starting at |*begin| in |str| and places
239// the decoded value into |*code_point|. If the character is valid, we will
240// return true. If invalid, we'll return false and put the
241// kUnicodeReplacementCharacter into |*code_point|.
242//
243// |*begin| will be updated to point to the last character consumed so it
244// can be incremented in a loop and will be ready for the next character.
245// (for a single-16-bit-word character, it will not be changed).
246//
247// Implementation is in url_canon_icu.cc.
248bool ReadUTFChar(const char16* str, int* begin, int length,
249                 unsigned* code_point);
250
251// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method.
252inline void AppendUTF16Value(unsigned code_point,
253                             CanonOutputT<char16>* output) {
254  if (code_point > 0xffff) {
255    output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0));
256    output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00));
257  } else {
258    output->push_back(static_cast<char16>(code_point));
259  }
260}
261
262// Escaping functions ---------------------------------------------------------
263
264// Writes the given character to the output as UTF-8, escaped. Call this
265// function only when the input is wide. Returns true on success. Failure
266// means there was some problem with the encoding, we'll still try to
267// update the |*begin| pointer and add a placeholder character to the
268// output so processing can continue.
269//
270// We will append the character starting at ch[begin] with the buffer ch
271// being |length|. |*begin| will be updated to point to the last character
272// consumed (we may consume more than one for UTF-16) so that if called in
273// a loop, incrementing the pointer will move to the next character.
274//
275// Every single output character will be escaped. This means that if you
276// give it an ASCII character as input, it will be escaped. Some code uses
277// this when it knows that a character is invalid according to its rules
278// for validity. If you don't want escaping for ASCII characters, you will
279// have to filter them out prior to calling this function.
280//
281// Assumes that ch[begin] is within range in the array, but does not assume
282// that any following characters are.
283inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length,
284                                  CanonOutput* output) {
285  // UTF-16 input. Readchar16 will handle invalid characters for us and give
286  // us the kUnicodeReplacementCharacter, so we don't have to do special
287  // checking after failure, just pass through the failure to the caller.
288  unsigned char_value;
289  bool success = ReadUTFChar(str, begin, length, &char_value);
290  AppendUTF8EscapedValue(char_value, output);
291  return success;
292}
293
294// Handles UTF-8 input. See the wide version above for usage.
295inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length,
296                                  CanonOutput* output) {
297  // ReadUTF8Char will handle invalid characters for us and give us the
298  // kUnicodeReplacementCharacter, so we don't have to do special checking
299  // after failure, just pass through the failure to the caller.
300  unsigned ch;
301  bool success = ReadUTFChar(str, begin, length, &ch);
302  AppendUTF8EscapedValue(ch, output);
303  return success;
304}
305
306// Given a '%' character at |*begin| in the string |spec|, this will decode
307// the escaped value and put it into |*unescaped_value| on success (returns
308// true). On failure, this will return false, and will not write into
309// |*unescaped_value|.
310//
311// |*begin| will be updated to point to the last character of the escape
312// sequence so that when called with the index of a for loop, the next time
313// through it will point to the next character to be considered. On failure,
314// |*begin| will be unchanged.
315inline bool Is8BitChar(char c) {
316  return true;  // this case is specialized to avoid a warning
317}
318inline bool Is8BitChar(char16 c) {
319  return c <= 255;
320}
321
322template<typename CHAR>
323inline bool DecodeEscaped(const CHAR* spec, int* begin, int end,
324                          unsigned char* unescaped_value) {
325  if (*begin + 3 > end ||
326      !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) {
327    // Invalid escape sequence because there's not enough room, or the
328    // digits are not ASCII.
329    return false;
330  }
331
332  unsigned char first = static_cast<unsigned char>(spec[*begin + 1]);
333  unsigned char second = static_cast<unsigned char>(spec[*begin + 2]);
334  if (!IsHexChar(first) || !IsHexChar(second)) {
335    // Invalid hex digits, fail.
336    return false;
337  }
338
339  // Valid escape sequence.
340  *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second);
341  *begin += 2;
342  return true;
343}
344
345// Appends the given substring to the output, escaping "some" characters that
346// it feels may not be safe. It assumes the input values are all contained in
347// 8-bit although it allows any type.
348//
349// This is used in error cases to append invalid output so that it looks
350// approximately correct. Non-error cases should not call this function since
351// the escaping rules are not guaranteed!
352void AppendInvalidNarrowString(const char* spec, int begin, int end,
353                               CanonOutput* output);
354void AppendInvalidNarrowString(const char16* spec, int begin, int end,
355                               CanonOutput* output);
356
357// Misc canonicalization helpers ----------------------------------------------
358
359// Converts between UTF-8 and UTF-16, returning true on successful conversion.
360// The output will be appended to the given canonicalizer output (so make sure
361// it's empty if you want to replace).
362//
363// On invalid input, this will still write as much output as possible,
364// replacing the invalid characters with the "invalid character". It will
365// return false in the failure case, and the caller should not continue as
366// normal.
367bool ConvertUTF16ToUTF8(const char16* input, int input_len,
368                        CanonOutput* output);
369bool ConvertUTF8ToUTF16(const char* input, int input_len,
370                        CanonOutputT<char16>* output);
371
372// Converts from UTF-16 to 8-bit using the character set converter. If the
373// converter is NULL, this will use UTF-8.
374void ConvertUTF16ToQueryEncoding(const char16* input,
375                                 const url_parse::Component& query,
376                                 CharsetConverter* converter,
377                                 CanonOutput* output);
378
379// Applies the replacements to the given component source. The component source
380// should be pre-initialized to the "old" base. That is, all pointers will
381// point to the spec of the old URL, and all of the Parsed components will
382// be indices into that string.
383//
384// The pointers and components in the |source| for all non-NULL strings in the
385// |repl| (replacements) will be updated to reference those strings.
386// Canonicalizing with the new |source| and |parsed| can then combine URL
387// components from many different strings.
388void SetupOverrideComponents(const char* base,
389                             const Replacements<char>& repl,
390                             URLComponentSource<char>* source,
391                             url_parse::Parsed* parsed);
392
393// Like the above 8-bit version, except that it additionally converts the
394// UTF-16 input to UTF-8 before doing the overrides.
395//
396// The given utf8_buffer is used to store the converted components. They will
397// be appended one after another, with the parsed structure identifying the
398// appropriate substrings. This buffer is a parameter because the source has
399// no storage, so the buffer must have the same lifetime as the source
400// parameter owned by the caller.
401//
402// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of
403// |source| will point into this buffer, which could be invalidated if
404// additional data is added and the CanonOutput resizes its buffer.
405//
406// Returns true on success. Fales means that the input was not valid UTF-16,
407// although we will have still done the override with "invalid characters" in
408// place of errors.
409bool SetupUTF16OverrideComponents(const char* base,
410                                  const Replacements<char16>& repl,
411                                  CanonOutput* utf8_buffer,
412                                  URLComponentSource<char>* source,
413                                  url_parse::Parsed* parsed);
414
415// Implemented in url_canon_path.cc, these are required by the relative URL
416// resolver as well, so we declare them here.
417bool CanonicalizePartialPath(const char* spec,
418                             const url_parse::Component& path,
419                             int path_begin_in_output,
420                             CanonOutput* output);
421bool CanonicalizePartialPath(const char16* spec,
422                             const url_parse::Component& path,
423                             int path_begin_in_output,
424                             CanonOutput* output);
425
426#ifndef WIN32
427
428// Implementations of Windows' int-to-string conversions
429int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
430int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix);
431
// Secure template overloads for these functions.
//
// Accepts a char array by reference so that its length |N| is deduced, then
// forwards to the four-argument _itoa_s with that length as the buffer size.
template<size_t N>
inline int _itoa_s(int value, char (&buffer)[N], int radix) {
  const size_t size_in_chars = N;
  return _itoa_s(value, buffer, size_in_chars, radix);
}
437
438template<size_t N>
439inline int _itow_s(int value, char16 (&buffer)[N], int radix) {
440  return _itow_s(value, buffer, N, radix);
441}
442
// _strtoui64 and strtoull behave the same, so on non-Windows builds the
// former simply forwards all of its arguments to the latter.
inline unsigned long long _strtoui64(const char* nptr,
                                     char** endptr, int base) {
  const unsigned long long parsed = strtoull(nptr, endptr, base);
  return parsed;
}
448
449#endif  // WIN32
450
451}  // namespace url_canon
452
453#endif  // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
454