15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <math.h> 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <stdarg.h> 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <limits> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <sstream> 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/format_macros.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_string_conversions.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 15c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "base/strings/string_piece.h" 16868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/stringprintf.h" 17868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h" 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base { 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a null-terminated string of wchar_t with each wchar_t representing 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// should be represented as a surrogate pair (two UTF-16 units) 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// *even* where wchar_t is 32-bit (Linux and Mac). 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is to help write tests for functions with string16 params until 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the C++ 0x UTF-16 literal is well-supported by compilers. 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)string16 BuildString16(const wchar_t* s) { 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return string16(s); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16 u16; 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (*s != 0) { 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(static_cast<unsigned int>(*s), 0xFFFFu); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) u16.push_back(*s++); 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return u16; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const wchar_t* const kConvertRoundtripCases[] = { 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"Google Video", 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "网页 图片 资讯更多 »" 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "Παγκόσμιος Ιστός" 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "Поиск страниц на русском" 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "전체서비스" 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\xc804\xccb4\xc11c\xbe44\xc2a4", 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Test characters that take more than 16 bits. This will depend on whether 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // wchar_t is 16 or 32 bits. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\xd800\xdf00", 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x10300", 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x11d40\x11d41\x11d42\x11d43\x11d44", 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make sure WideToCodepage works like WideToUTF8. 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SCOPED_TRACE(base::StringPrintf("Test[%" PRIuS "]: %ls", 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i, kConvertRoundtripCases[i])); 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string expected(WideToUTF8(kConvertRoundtripCases[i])); 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string utf8; 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(WideToCodepage(kConvertRoundtripCases[i], kCodepageUTF8, 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SKIP, &utf8)); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(expected, utf8); 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// kConverterCodepageCases is not comprehensive. There are a number of cases 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to add if we really want to have a comprehensive coverage of various 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// codepages and their 'idiosyncrasies'. Currently, the only implementation 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// set of tests for the charset conversion. So, we can get away with a 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// relatively small number of cases listed below. 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Note about |u16_wide| in the following struct. 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// On Windows, the field is always identical to |wide|. On Mac and Linux, 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// it's identical as long as there's no character outside the 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BMP (<= U+FFFF). When there is, it is different from |wide| and 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is not a real wide string (UTF-32 string) in that each wchar_t in 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the string is a UTF-16 code unit zero-extended to be 32-bit 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// even when the code unit belongs to a surrogate pair. 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For instance, a Unicode string (U+0041 U+010000) is represented as 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000". 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// To avoid the clutter, |u16_wide| will be set to NULL 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// if it's identical to |wide| on *all* platforms. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const struct { 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* codepage_name; 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* encoded; 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::Type on_error; 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success; 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const wchar_t* wide; 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const wchar_t* u16_wide; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} kConvertCodepageCases[] = { 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Test a case where the input cannot be decoded, using SKIP, FAIL 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't. 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"big5", 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xA7\x41\xA6", 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) false, 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"", 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"big5", 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xA7\x41\xA6", 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SKIP, 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x4F60", 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"big5", 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xA7\x41\xA6", 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SUBSTITUTE, 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x4F60\xFFFD", 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Arabic (ISO-8859) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"iso-8859-6", 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " " 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2", 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" " 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652", 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Chinese Simplified (GB2312) 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"gb2312", 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xC4\xE3\xBA\xC3", 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x4F60\x597D", 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Chinese (GB18030) : 4 byte sequences mapped to BMP characters 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"gb18030", 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\x81\x30\x84\x36\xA1\xA7", 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x00A5\x00A8", 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000) 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"gb18030", 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\x95\x32\x82\x36\xD2\xBB", 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\xD840\xDC00\x4E00", 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32) 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x20000\x4E00", 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\xD840\xDC00\x4E00"}, 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"big5", 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xA7\x41\xA6\x6E", 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x4F60\x597D", 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Greek (ISO-8859) 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"iso-8859-7", 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5", 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Hebrew (Windows) 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"windows-1255", 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xF9\xD1\xC8\xEC\xE5\xC9\xED", 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD", 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Korean (EUC) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"euc-kr", 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4", 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\xC548\xB155\xD558\xC138\xC694", 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Japanese (EUC) 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"euc-jp", 1985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8E\xA6", 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 2015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66", 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Japanese (ISO-2022) 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"iso-2022-jp", 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B" 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B", 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$", 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Japanese (Shift-JIS) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"sjis", 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6", 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66", 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Russian (KOI8) 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"koi8-r", 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5", 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0443\x0439\x0442\x0435", 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Thai (windows-874) 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"windows-874", 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) true, 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"\x0E04\x0E23\x0e31\x0E1A", 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL}, 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SCOPED_TRACE(base::StringPrintf( 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].encoded, 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].codepage_name)); 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::wstring wide; 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success = CodepageToWide(kConvertCodepageCases[i].encoded, 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].codepage_name, 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].on_error, 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &wide); 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].success, success); 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].wide, wide); 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When decoding was successful and nothing was skipped, we also check the 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // reverse conversion. Not all conversions are round-trippable, but 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kConverterCodepageCases does not have any one-way conversion at the 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // moment. 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (success && 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].on_error == 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL) { 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string encoded; 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) success = WideToCodepage(wide, kConvertCodepageCases[i].codepage_name, 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].on_error, &encoded); 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].success, success); 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The above cases handled codepage->wide errors, but not wide->codepage. 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Test that here. 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string encoded("Temp data"); // Make sure the string gets cleared. 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // First test going to an encoding that can not represent that character. 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, &encoded)); 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(encoded.empty()); 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SKIP, &encoded)); 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_STREQ("Chinese", encoded.c_str()); 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // From Unicode, SUBSTITUTE is the same as SKIP for now. 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SUBSTITUTE, 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &encoded)); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_STREQ("Chinese", encoded.c_str()); 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When we're in UTF-16 mode, test an invalid UTF-16 character in the input. 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(WideToCodepage(L"a\xd800z", "iso-8859-1", 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::FAIL, &encoded)); 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(encoded.empty()); 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(WideToCodepage(L"a\xd800z", "iso-8859-1", 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SKIP, &encoded)); 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_STREQ("az", encoded.c_str()); 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // WCHAR_T_IS_UTF16 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invalid characters should fail. 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(WideToCodepage(L"a\xffffz", "iso-8859-1", 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SKIP, &encoded)); 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_STREQ("az", encoded.c_str()); 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invalid codepages should fail. 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(WideToCodepage(L"Hello, world", "awesome-8571-2", 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::SKIP, &encoded)); 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SCOPED_TRACE(base::StringPrintf( 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].encoded, 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].codepage_name)); 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16 utf16; 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].codepage_name, 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].on_error, 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &utf16); 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16 utf16_expected; 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kConvertCodepageCases[i].u16_wide == NULL) 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf16_expected = BuildString16(kConvertCodepageCases[i].wide); 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide); 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].success, success); 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(utf16_expected, utf16); 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When decoding was successful and nothing was skipped, we also check the 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // reverse conversion. See also the corresponding comment in 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ConvertBetweenCodepageAndWide. 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (success && 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].on_error == OnStringConversionError::FAIL) { 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string encoded; 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name, 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertCodepageCases[i].on_error, &encoded); 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].success, success); 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const struct { 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* encoded; 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* codepage_name; 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool expected_success; 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* expected_value; 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} kConvertAndNormalizeCases[] = { 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo-\xe4.html", "iso-8859-1", true, "foo-\xc3\xa4.html"}, 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo-\xe4.html", "iso-8859-7", true, "foo-\xce\xb4.html"}, 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo-\xe4.html", "foo-bar", false, ""}, 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo-\xff.html", "ascii", false, ""}, 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo.html", "ascii", true, "foo.html"}, 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo-a\xcc\x88.html", "utf-8", true, "foo-\xc3\xa4.html"}, 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"\x95\x32\x82\x36\xD2\xBB", "gb18030", true, "\xF0\xA0\x80\x80\xE4\xB8\x80"}, 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"\xA7\x41\xA6\x6E", "big5", true, "\xE4\xBD\xA0\xE5\xA5\xBD"}, 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Windows-1258 does have a combining character at xD2 (which is U+0309). 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The sequence of (U+00E2, U+0309) is also encoded as U+1EA9. 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"foo\xE2\xD2", "windows-1258", true, "foo\xE1\xBA\xA9"}, 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) {"", "iso-8859-1", true, ""}, 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(ICUStringConversionsTest, ConvertToUtf8AndNormalize) { 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string result; 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertAndNormalizeCases); ++i) { 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SCOPED_TRACE(base::StringPrintf( 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertAndNormalizeCases[i].encoded, 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertAndNormalizeCases[i].codepage_name)); 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool success = ConvertToUtf8AndNormalize( 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertAndNormalizeCases[i].encoded, 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kConvertAndNormalizeCases[i].codepage_name, &result); 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertAndNormalizeCases[i].expected_success, success); 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(kConvertAndNormalizeCases[i].expected_value, result); 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace base 373