icu_string_conversions_unittest.cc revision c7f5f8508d98d5952d42ed7648c2a8f30a4da156
1// Copyright (c) 2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <math.h> 6#include <stdarg.h> 7 8#include <limits> 9#include <sstream> 10 11#include "base/basictypes.h" 12#include "base/i18n/icu_string_conversions.h" 13#include "base/logging.h" 14#include "base/utf_string_conversions.h" 15#include "testing/gtest/include/gtest/gtest.h" 16 17namespace base { 18 19namespace { 20 21// Given a null-terminated string of wchar_t with each wchar_t representing 22// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. 23// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) 24// should be represented as a surrogate pair (two UTF-16 units) 25// *even* where wchar_t is 32-bit (Linux and Mac). 26// 27// This is to help write tests for functions with string16 params until 28// the C++ 0x UTF-16 literal is well-supported by compilers. 29string16 BuildString16(const wchar_t* s) { 30#if defined(WCHAR_T_IS_UTF16) 31 return string16(s); 32#elif defined(WCHAR_T_IS_UTF32) 33 string16 u16; 34 while (*s != 0) { 35 DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); 36 u16.push_back(*s++); 37 } 38 return u16; 39#endif 40} 41 42const wchar_t* const kConvertRoundtripCases[] = { 43 L"Google Video", 44 // "网页 图片 资讯更多 »" 45 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", 46 // "Παγκόσμιος Ιστός" 47 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 48 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", 49 // "Поиск страниц на русском" 50 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" 51 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" 52 L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", 53 // "전체서비스" 54 L"\xc804\xccb4\xc11c\xbe44\xc2a4", 55 56 // Test characters that take more than 16 bits. This will depend on whether 57 // wchar_t is 16 or 32 bits. 58#if defined(WCHAR_T_IS_UTF16) 59 L"\xd800\xdf00", 60 // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) 61 L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", 62#elif defined(WCHAR_T_IS_UTF32) 63 L"\x10300", 64 // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) 65 L"\x11d40\x11d41\x11d42\x11d43\x11d44", 66#endif 67}; 68 69} // namespace 70 71TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { 72 // Make sure WideToCodepage works like WideToUTF8. 73 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { 74 std::string expected(WideToUTF8(kConvertRoundtripCases[i])); 75 std::string utf8; 76 EXPECT_TRUE(WideToCodepage(kConvertRoundtripCases[i], kCodepageUTF8, 77 OnStringConversionError::SKIP, &utf8)); 78 EXPECT_EQ(expected, utf8); 79 } 80} 81 82// kConverterCodepageCases is not comprehensive. There are a number of cases 83// to add if we really want to have a comprehensive coverage of various 84// codepages and their 'idiosyncrasies'. Currently, the only implementation 85// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive 86// set of tests for the charset conversion. So, we can get away with a 87// relatively small number of cases listed below. 88// 89// Note about |u16_wide| in the following struct. 90// On Windows, the field is always identical to |wide|. On Mac and Linux, 91// it's identical as long as there's no character outside the 92// BMP (<= U+FFFF). When there is, it is different from |wide| and 93// is not a real wide string (UTF-32 string) in that each wchar_t in 94// the string is a UTF-16 code unit zero-extended to be 32-bit 95// even when the code unit belongs to a surrogate pair. 96// For instance, a Unicode string (U+0041 U+010000) is represented as 97// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000". 98// To avoid the clutter, |u16_wide| will be set to NULL 99// if it's identical to |wide| on *all* platforms. 100 101static const struct { 102 const char* codepage_name; 103 const char* encoded; 104 OnStringConversionError::Type on_error; 105 bool success; 106 const wchar_t* wide; 107 const wchar_t* u16_wide; 108} kConvertCodepageCases[] = { 109 // Test a case where the input cannot be decoded, using SKIP, FAIL 110 // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't. 111 {"big5", 112 "\xA7\x41\xA6", 113 OnStringConversionError::FAIL, 114 false, 115 L"", 116 NULL}, 117 {"big5", 118 "\xA7\x41\xA6", 119 OnStringConversionError::SKIP, 120 true, 121 L"\x4F60", 122 NULL}, 123 {"big5", 124 "\xA7\x41\xA6", 125 OnStringConversionError::SUBSTITUTE, 126 true, 127 L"\x4F60\xFFFD", 128 NULL}, 129 // Arabic (ISO-8859) 130 {"iso-8859-6", 131 "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " " 132 "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2", 133 OnStringConversionError::FAIL, 134 true, 135 L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" " 136 L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652", 137 NULL}, 138 // Chinese Simplified (GB2312) 139 {"gb2312", 140 "\xC4\xE3\xBA\xC3", 141 OnStringConversionError::FAIL, 142 true, 143 L"\x4F60\x597D", 144 NULL}, 145 // Chinese (GB18030) : 4 byte sequences mapped to BMP characters 146 {"gb18030", 147 "\x81\x30\x84\x36\xA1\xA7", 148 OnStringConversionError::FAIL, 149 true, 150 L"\x00A5\x00A8", 151 NULL}, 152 // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000) 153 {"gb18030", 154 "\x95\x32\x82\x36\xD2\xBB", 155 OnStringConversionError::FAIL, 156 true, 157#if defined(WCHAR_T_IS_UTF16) 158 L"\xD840\xDC00\x4E00", 159#elif defined(WCHAR_T_IS_UTF32) 160 L"\x20000\x4E00", 161#endif 162 L"\xD840\xDC00\x4E00"}, 163 {"big5", 164 "\xA7\x41\xA6\x6E", 165 OnStringConversionError::FAIL, 166 true, 167 L"\x4F60\x597D", 168 NULL}, 169 // Greek (ISO-8859) 170 {"iso-8859-7", 171 "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5", 172 OnStringConversionError::FAIL, 173 true, 174 L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", 175 NULL}, 176 // Hebrew (Windows) 177 {"windows-1255", 178 "\xF9\xD1\xC8\xEC\xE5\xC9\xED", 179 OnStringConversionError::FAIL, 180 true, 181 L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD", 182 NULL}, 183 // Hindi Devanagari (ISCII) 184 {"iscii-dev", 185 "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF", 186 OnStringConversionError::FAIL, 187 true, 188 L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930", 189 NULL}, 190 // Korean (EUC) 191 {"euc-kr", 192 "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4", 193 OnStringConversionError::FAIL, 194 true, 195 L"\xC548\xB155\xD558\xC138\xC694", 196 NULL}, 197 // Japanese (EUC) 198 {"euc-jp", 199 "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8F\xB0\xA1\x8E\xA6", 200 OnStringConversionError::FAIL, 201 true, 202 L"\x3053\x3093\x306B\x3061\x306F\x4E00\x4E02\xFF66", 203 NULL}, 204 // Japanese (ISO-2022) 205 {"iso-2022-jp", 206 "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B" 207 "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B", 208 OnStringConversionError::FAIL, 209 true, 210 L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$", 211 NULL}, 212 // Japanese (Shift-JIS) 213 {"sjis", 214 "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6", 215 OnStringConversionError::FAIL, 216 true, 217 L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66", 218 NULL}, 219 // Russian (KOI8) 220 {"koi8-r", 221 "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5", 222 OnStringConversionError::FAIL, 223 true, 224 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" 225 L"\x0443\x0439\x0442\x0435", 226 NULL}, 227 // Thai (windows-874) 228 {"windows-874", 229 "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", 230 OnStringConversionError::FAIL, 231 true, 232 L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" 233 L"\x0E04\x0E23\x0e31\x0E1A", 234 NULL}, 235}; 236 237TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { 238 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { 239 std::wstring wide; 240 bool success = CodepageToWide(kConvertCodepageCases[i].encoded, 241 kConvertCodepageCases[i].codepage_name, 242 kConvertCodepageCases[i].on_error, 243 &wide); 244 EXPECT_EQ(kConvertCodepageCases[i].success, success); 245 EXPECT_EQ(kConvertCodepageCases[i].wide, wide); 246 247 // When decoding was successful and nothing was skipped, we also check the 248 // reverse conversion. Not all conversions are round-trippable, but 249 // kConverterCodepageCases does not have any one-way conversion at the 250 // moment. 251 if (success && 252 kConvertCodepageCases[i].on_error == 253 OnStringConversionError::FAIL) { 254 std::string encoded; 255 success = WideToCodepage(wide, kConvertCodepageCases[i].codepage_name, 256 kConvertCodepageCases[i].on_error, &encoded); 257 EXPECT_EQ(kConvertCodepageCases[i].success, success); 258 EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); 259 } 260 } 261 262 // The above cases handled codepage->wide errors, but not wide->codepage. 263 // Test that here. 264 std::string encoded("Temp data"); // Make sure the string gets cleared. 265 266 // First test going to an encoding that can not represent that character. 267 EXPECT_FALSE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", 268 OnStringConversionError::FAIL, &encoded)); 269 EXPECT_TRUE(encoded.empty()); 270 EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", 271 OnStringConversionError::SKIP, &encoded)); 272 EXPECT_STREQ("Chinese", encoded.c_str()); 273 // From Unicode, SUBSTITUTE is the same as SKIP for now. 274 EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", 275 OnStringConversionError::SUBSTITUTE, 276 &encoded)); 277 EXPECT_STREQ("Chinese", encoded.c_str()); 278 279#if defined(WCHAR_T_IS_UTF16) 280 // When we're in UTF-16 mode, test an invalid UTF-16 character in the input. 281 EXPECT_FALSE(WideToCodepage(L"a\xd800z", "iso-8859-1", 282 OnStringConversionError::FAIL, &encoded)); 283 EXPECT_TRUE(encoded.empty()); 284 EXPECT_TRUE(WideToCodepage(L"a\xd800z", "iso-8859-1", 285 OnStringConversionError::SKIP, &encoded)); 286 EXPECT_STREQ("az", encoded.c_str()); 287#endif // WCHAR_T_IS_UTF16 288 289 // Invalid characters should fail. 290 EXPECT_TRUE(WideToCodepage(L"a\xffffz", "iso-8859-1", 291 OnStringConversionError::SKIP, &encoded)); 292 EXPECT_STREQ("az", encoded.c_str()); 293 294 // Invalid codepages should fail. 295 EXPECT_FALSE(WideToCodepage(L"Hello, world", "awesome-8571-2", 296 OnStringConversionError::SKIP, &encoded)); 297} 298 299TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { 300 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { 301 string16 utf16; 302 bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, 303 kConvertCodepageCases[i].codepage_name, 304 kConvertCodepageCases[i].on_error, 305 &utf16); 306 string16 utf16_expected; 307 if (kConvertCodepageCases[i].u16_wide == NULL) 308 utf16_expected = BuildString16(kConvertCodepageCases[i].wide); 309 else 310 utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide); 311 EXPECT_EQ(kConvertCodepageCases[i].success, success); 312 EXPECT_EQ(utf16_expected, utf16); 313 314 // When decoding was successful and nothing was skipped, we also check the 315 // reverse conversion. See also the corresponding comment in 316 // ConvertBetweenCodepageAndWide. 317 if (success && 318 kConvertCodepageCases[i].on_error == OnStringConversionError::FAIL) { 319 std::string encoded; 320 success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name, 321 kConvertCodepageCases[i].on_error, &encoded); 322 EXPECT_EQ(kConvertCodepageCases[i].success, success); 323 EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); 324 } 325 } 326} 327 328} // namespace base 329