1010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved.
2010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
3010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// found in the LICENSE file.
4010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
5010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "base/macros.h"
6010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h"
7010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "third_party/icu/source/common/unicode/ucnv.h"
8010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "url/url_canon.h"
9010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "url/url_canon_icu.h"
10010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "url/url_canon_stdstring.h"
11010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "url/url_test_utils.h"
12010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
13010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Some implementations of base/basictypes.h may define ARRAYSIZE.
14010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
15010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// which is in our version of basictypes.h.
16010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#ifndef ARRAYSIZE
17010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#define ARRAYSIZE ARRAYSIZE_UNSAFE
18010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#endif
19010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
20010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)namespace url {
21010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
22010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)using test_utils::WStringToUTF16;
23010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
24010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)namespace {
25010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
26010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Wrapper around a UConverter object that managers creation and destruction.
27010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)class UConvScoper {
28010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) public:
29010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  explicit UConvScoper(const char* charset_name) {
30010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    UErrorCode err = U_ZERO_ERROR;
31010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    converter_ = ucnv_open(charset_name, &err);
32010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  }
33010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
34010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  ~UConvScoper() {
35010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    if (converter_)
36010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      ucnv_close(converter_);
37010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  }
38010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
39010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  // Returns the converter object, may be NULL.
40010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  UConverter* converter() const { return converter_; }
41010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
42010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) private:
43010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  UConverter* converter_;
44010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)};
45010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
46010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)TEST(URLCanonIcuTest, ICUCharsetConverter) {
47010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  struct ICUCase {
48010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const wchar_t* input;
49010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const char* encoding;
50010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const char* expected;
51010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  } icu_cases[] = {
52010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // UTF-8.
53010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {L"Hello, world", "utf-8", "Hello, world"},
54010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
55010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // Non-BMP UTF-8.
56010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
57010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // Big5
58010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
59010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // Unrepresentable character in the destination set.
60010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {L"hello\x4f60\x06de\x597dworld", "big5",
61010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
62010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  };
63010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
64010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) {
65010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    UConvScoper conv(icu_cases[i].encoding);
66010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    ASSERT_TRUE(conv.converter() != NULL);
67010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    ICUCharsetConverter converter(conv.converter());
68010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
69010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    std::string str;
70010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    StdStringCanonOutput output(&str);
71010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
72010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    base::string16 input_str(WStringToUTF16(icu_cases[i].input));
73010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    int input_len = static_cast<int>(input_str.length());
74010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
75010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    output.Complete();
76010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
77010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    EXPECT_STREQ(icu_cases[i].expected, str.c_str());
78010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  }
79010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
80010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  // Test string sizes around the resize boundary for the output to make sure
81010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  // the converter resizes as needed.
82010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  const int static_size = 16;
83010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  UConvScoper conv("utf-8");
84010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  ASSERT_TRUE(conv.converter());
85010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  ICUCharsetConverter converter(conv.converter());
86010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  for (int i = static_size - 2; i <= static_size + 2; i++) {
87010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    // Make a string with the appropriate length.
88010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    base::string16 input;
89010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    for (int ch = 0; ch < i; ch++)
90010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      input.push_back('a');
91010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
92010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    RawCanonOutput<static_size> output;
93010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
94010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)                               &output);
95010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
96010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  }
97010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)}
98010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
99010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)TEST(URLCanonIcuTest, QueryWithConverter) {
100010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  struct QueryCase {
101010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const char* input8;
102010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const wchar_t* input16;
103010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const char* encoding;
104010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    const char* expected;
105010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  } query_cases[] = {
106010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // Regular ASCII case in some different encodings.
107010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
108010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
109010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
110010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // Chinese input/output
111010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
112010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      "?q=%C4%E3%BA%C3"},
113010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
114010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // Unencodable character in the destination character set should be
115010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // escaped. The escape sequence unescapes to be the entity name:
116010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      // "?q=&#20320;"
117010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
118010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      "?q=Chinese%26%2365319%3B"},
119010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  };
120010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
121010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) {
122010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    Component out_comp;
123010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
124010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    UConvScoper conv(query_cases[i].encoding);
125010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
126010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    ICUCharsetConverter converter(conv.converter());
127010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
128010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    if (query_cases[i].input8) {
129010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      int len = static_cast<int>(strlen(query_cases[i].input8));
130010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      Component in_comp(0, len);
131010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      std::string out_str;
132010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
133010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      StdStringCanonOutput output(&out_str);
134010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
135010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)                        &out_comp);
136010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      output.Complete();
137010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
138010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      EXPECT_EQ(query_cases[i].expected, out_str);
139010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    }
140010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
141010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    if (query_cases[i].input16) {
142010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      base::string16 input16(WStringToUTF16(query_cases[i].input16));
143010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      int len = static_cast<int>(input16.length());
144010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      Component in_comp(0, len);
145010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      std::string out_str;
146010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
147010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      StdStringCanonOutput output(&out_str);
148010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
149010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)                        &out_comp);
150010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      output.Complete();
151010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
152010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      EXPECT_EQ(query_cases[i].expected, out_str);
153010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    }
154010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  }
155010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
156010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  // Extra test for input with embedded NULL;
157010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  std::string out_str;
158010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  StdStringCanonOutput output(&out_str);
159010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  Component out_comp;
160010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
161010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  output.Complete();
162010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  EXPECT_EQ("?a%20%00z%01", out_str);
163010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)}
164010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
165010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)}  // namespace
166010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
167010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)}  // namespace url
168