1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// ICU integration functions.
6
7#include <stdlib.h>
8#include <string.h>
9
10#include "base/logging.h"
11#include "third_party/icu/source/common/unicode/ucnv.h"
12#include "third_party/icu/source/common/unicode/ucnv_cb.h"
13#include "third_party/icu/source/common/unicode/uidna.h"
14#include "url/url_canon_icu.h"
15#include "url/url_canon_internal.h"  // for _itoa_s
16
17namespace url_canon {
18
19namespace {
20
21// Called when converting a character that can not be represented, this will
22// append an escaped version of the numerical character reference for that code
23// point. It is of the form "&#1234;" and we will escape the non-digits to
24// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
25void appendURLEscapedChar(const void* context,
26                          UConverterFromUnicodeArgs* from_args,
27                          const UChar* code_units,
28                          int32_t length,
29                          UChar32 code_point,
30                          UConverterCallbackReason reason,
31                          UErrorCode* err) {
32  if (reason == UCNV_UNASSIGNED) {
33    *err = U_ZERO_ERROR;
34
35    const static int prefix_len = 6;
36    const static char prefix[prefix_len + 1] = "%26%23";  // "&#" percent-escaped
37    ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
38
39    DCHECK(code_point < 0x110000);
40    char number[8];  // Max Unicode code point is 7 digits.
41    _itoa_s(code_point, number, 10);
42    int number_len = static_cast<int>(strlen(number));
43    ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
44
45    const static int postfix_len = 3;
46    const static char postfix[postfix_len + 1] = "%3B";   // ";" percent-escaped
47    ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
48  }
49}
50
51// A class for scoping the installation of the invalid character callback.
52class AppendHandlerInstaller {
53 public:
54  // The owner of this object must ensure that the converter is alive for the
55  // duration of this object's lifetime.
56  AppendHandlerInstaller(UConverter* converter) : converter_(converter) {
57    UErrorCode err = U_ZERO_ERROR;
58    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
59                          &old_callback_, &old_context_, &err);
60  }
61
62  ~AppendHandlerInstaller() {
63    UErrorCode err = U_ZERO_ERROR;
64    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
65  }
66
67 private:
68  UConverter* converter_;
69
70  UConverterFromUCallback old_callback_;
71  const void* old_context_;
72};
73
74}  // namespace
75
76ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
77    : converter_(converter) {
78}
79
80ICUCharsetConverter::~ICUCharsetConverter() {
81}
82
83void ICUCharsetConverter::ConvertFromUTF16(const base::char16* input,
84                                           int input_len,
85                                           CanonOutput* output) {
86  // Install our error handler. It will be called for character that can not
87  // be represented in the destination character set.
88  AppendHandlerInstaller handler(converter_);
89
90  int begin_offset = output->length();
91  int dest_capacity = output->capacity() - begin_offset;
92  output->set_length(output->length());
93
94  do {
95    UErrorCode err = U_ZERO_ERROR;
96    char* dest = &output->data()[begin_offset];
97    int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
98                                            input, input_len, &err);
99    if (err != U_BUFFER_OVERFLOW_ERROR) {
100      output->set_length(begin_offset + required_capacity);
101      return;
102    }
103
104    // Output didn't fit, expand
105    dest_capacity = required_capacity;
106    output->Resize(begin_offset + dest_capacity);
107  } while (true);
108}
109
110// Converts the Unicode input representing a hostname to ASCII using IDN rules.
111// The output must be ASCII, but is represented as wide characters.
112//
113// On success, the output will be filled with the ASCII host name and it will
114// return true. Unlike most other canonicalization functions, this assumes that
115// the output is empty. The beginning of the host will be at offset 0, and
116// the length of the output will be set to the length of the new host name.
117//
118// On error, this will return false. The output in this case is undefined.
119bool IDNToASCII(const base::char16* src, int src_len, CanonOutputW* output) {
120  DCHECK(output->length() == 0);  // Output buffer is assumed empty.
121  while (true) {
122    // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
123    // the spec (which do exist). This does not present any risk and is a
124    // little more future proof.
125    UErrorCode err = U_ZERO_ERROR;
126    int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
127                                         output->capacity(),
128                                         UIDNA_ALLOW_UNASSIGNED, NULL, &err);
129    if (err == U_ZERO_ERROR) {
130      output->set_length(num_converted);
131      return true;
132    }
133    if (err != U_BUFFER_OVERFLOW_ERROR)
134      return false;  // Unknown error, give up.
135
136    // Not enough room in our buffer, expand.
137    output->Resize(output->capacity() * 2);
138  }
139}
140
141bool ReadUTFChar(const char* str, int* begin, int length,
142                 unsigned* code_point_out) {
143  int code_point;  // Avoids warning when U8_NEXT writes -1 to it.
144  U8_NEXT(str, *begin, length, code_point);
145  *code_point_out = static_cast<unsigned>(code_point);
146
147  // The ICU macro above moves to the next char, we want to point to the last
148  // char consumed.
149  (*begin)--;
150
151  // Validate the decoded value.
152  if (U_IS_UNICODE_CHAR(code_point))
153    return true;
154  *code_point_out = kUnicodeReplacementCharacter;
155  return false;
156}
157
158bool ReadUTFChar(const base::char16* str, int* begin, int length,
159                 unsigned* code_point) {
160  if (U16_IS_SURROGATE(str[*begin])) {
161    if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
162        !U16_IS_TRAIL(str[*begin + 1])) {
163      // Invalid surrogate pair.
164      *code_point = kUnicodeReplacementCharacter;
165      return false;
166    } else {
167      // Valid surrogate pair.
168      *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
169      (*begin)++;
170    }
171  } else {
172    // Not a surrogate, just one 16-bit word.
173    *code_point = str[*begin];
174  }
175
176  if (U_IS_UNICODE_CHAR(*code_point))
177    return true;
178
179  // Invalid code point.
180  *code_point = kUnicodeReplacementCharacter;
181  return false;
182}
183
184}  // namespace url_canon
185