1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/macros.h"
6#include "testing/gtest/include/gtest/gtest.h"
7#include "third_party/icu/source/common/unicode/ucnv.h"
8#include "url/url_canon.h"
9#include "url/url_canon_icu.h"
10#include "url/url_canon_stdstring.h"
11#include "url/url_test_utils.h"
12
// ARRAYSIZE may already be defined by the base headers in some build
// configurations. When it is not, alias it to ARRAYSIZE_UNSAFE so the case
// tables in the tests below can be sized with a single spelling.
// NOTE(review): the earlier wording attributed both macros to
// base/basictypes.h, while this file includes base/macros.h -- confirm which
// header actually supplies ARRAYSIZE_UNSAFE.
#if !defined(ARRAYSIZE)
#define ARRAYSIZE ARRAYSIZE_UNSAFE
#endif
19
20namespace url {
21
22using test_utils::WStringToUTF16;
23
24namespace {
25
26// Wrapper around a UConverter object that managers creation and destruction.
27class UConvScoper {
28 public:
29  explicit UConvScoper(const char* charset_name) {
30    UErrorCode err = U_ZERO_ERROR;
31    converter_ = ucnv_open(charset_name, &err);
32  }
33
34  ~UConvScoper() {
35    if (converter_)
36      ucnv_close(converter_);
37  }
38
39  // Returns the converter object, may be NULL.
40  UConverter* converter() const { return converter_; }
41
42 private:
43  UConverter* converter_;
44};
45
46TEST(URLCanonIcuTest, ICUCharsetConverter) {
47  struct ICUCase {
48    const wchar_t* input;
49    const char* encoding;
50    const char* expected;
51  } icu_cases[] = {
52      // UTF-8.
53    {L"Hello, world", "utf-8", "Hello, world"},
54    {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
55      // Non-BMP UTF-8.
56    {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
57      // Big5
58    {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
59      // Unrepresentable character in the destination set.
60    {L"hello\x4f60\x06de\x597dworld", "big5",
61      "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
62  };
63
64  for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) {
65    UConvScoper conv(icu_cases[i].encoding);
66    ASSERT_TRUE(conv.converter() != NULL);
67    ICUCharsetConverter converter(conv.converter());
68
69    std::string str;
70    StdStringCanonOutput output(&str);
71
72    base::string16 input_str(WStringToUTF16(icu_cases[i].input));
73    int input_len = static_cast<int>(input_str.length());
74    converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
75    output.Complete();
76
77    EXPECT_STREQ(icu_cases[i].expected, str.c_str());
78  }
79
80  // Test string sizes around the resize boundary for the output to make sure
81  // the converter resizes as needed.
82  const int static_size = 16;
83  UConvScoper conv("utf-8");
84  ASSERT_TRUE(conv.converter());
85  ICUCharsetConverter converter(conv.converter());
86  for (int i = static_size - 2; i <= static_size + 2; i++) {
87    // Make a string with the appropriate length.
88    base::string16 input;
89    for (int ch = 0; ch < i; ch++)
90      input.push_back('a');
91
92    RawCanonOutput<static_size> output;
93    converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
94                               &output);
95    EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
96  }
97}
98
99TEST(URLCanonIcuTest, QueryWithConverter) {
100  struct QueryCase {
101    const char* input8;
102    const wchar_t* input16;
103    const char* encoding;
104    const char* expected;
105  } query_cases[] = {
106      // Regular ASCII case in some different encodings.
107    {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
108    {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
109    {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
110      // Chinese input/output
111    {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
112      "?q=%C4%E3%BA%C3"},
113    {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
114      // Unencodable character in the destination character set should be
115      // escaped. The escape sequence unescapes to be the entity name:
116      // "?q=&#20320;"
117    {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
118      "?q=Chinese%26%2365319%3B"},
119  };
120
121  for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) {
122    Component out_comp;
123
124    UConvScoper conv(query_cases[i].encoding);
125    ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
126    ICUCharsetConverter converter(conv.converter());
127
128    if (query_cases[i].input8) {
129      int len = static_cast<int>(strlen(query_cases[i].input8));
130      Component in_comp(0, len);
131      std::string out_str;
132
133      StdStringCanonOutput output(&out_str);
134      CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
135                        &out_comp);
136      output.Complete();
137
138      EXPECT_EQ(query_cases[i].expected, out_str);
139    }
140
141    if (query_cases[i].input16) {
142      base::string16 input16(WStringToUTF16(query_cases[i].input16));
143      int len = static_cast<int>(input16.length());
144      Component in_comp(0, len);
145      std::string out_str;
146
147      StdStringCanonOutput output(&out_str);
148      CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
149                        &out_comp);
150      output.Complete();
151
152      EXPECT_EQ(query_cases[i].expected, out_str);
153    }
154  }
155
156  // Extra test for input with embedded NULL;
157  std::string out_str;
158  StdStringCanonOutput output(&out_str);
159  Component out_comp;
160  CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
161  output.Complete();
162  EXPECT_EQ("?a%20%00z%01", out_str);
163}
164
165}  // namespace
166
167}  // namespace url
168