15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_string_conversions.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 12868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h" 13868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h" 14ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ucnv.h" 15ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ucnv_cb.h" 16ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ucnv_err.h" 17ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/unorm.h" 18ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ustring.h" 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base { 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in source/common/ucnv_err.c. 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 1995-2006 International Business Machines Corporation 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and others 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All rights reserved. 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Permission is hereby granted, free of charge, to any person obtaining a 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// copy of this software and associated documentation files (the "Software"), 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to deal in the Software without restriction, including without limitation 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the rights to use, copy, modify, merge, publish, distribute, and/or 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sell copies of the Software, and to permit persons to whom the Software 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is furnished to do so, provided that the above copyright notice(s) and 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// this permission notice appear in all copies of the Software and that 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// both the above copyright notice(s) and this permission notice appear in 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// supporting documentation. 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OR PERFORMANCE OF THIS SOFTWARE. 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Except as contained in this notice, the name of a copyright holder 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// shall not be used in advertising or otherwise to promote the sale, use 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// or other dealings in this Software without prior written authorization 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of the copyright holder. 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ___________________________________________________________________________ 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All trademarks and registered trademarks mentioned herein are the property 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of their respective owners. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ToUnicodeCallbackSubstitute(const void* context, 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverterToUnicodeArgs *to_args, 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* code_units, 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32_t length, 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverterCallbackReason reason, 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode * err) { 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const UChar kReplacementChar = 0xFFFD; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reason <= UCNV_IRREGULAR) { 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (context == NULL || 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (*(reinterpret_cast<const char*>(context)) == 'i' && 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reason == UCNV_UNASSIGNED)) { 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *err = U_ZERO_ERROR; 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // else the caller must have set the error code accordingly. 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // else ignore the reset, close and clone calls. 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int uchar_len, OnStringConversionError::Type on_error, 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* encoded) { 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_getMaxCharSize(converter)); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded->resize(encoded_max_length); 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Setup our error handler. 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (on_error) { 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OnStringConversionError::FAIL: 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, NULL, &status); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OnStringConversionError::SKIP: 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OnStringConversionError::SUBSTITUTE: 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, NULL, &status); 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NOTREACHED(); 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ucnv_fromUChars returns size not including terminating null 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded_max_length, uchar_src, uchar_len, &status); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded->resize(actual_size); 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_close(converter); 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (U_SUCCESS(status)) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded->clear(); // Make sure the output is empty on error. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set up our error handler for ToUTF-16 converters 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error, 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverter* converter, UErrorCode* status) { 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (on_error) { 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OnStringConversionError::FAIL: 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, NULL, status); 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OnStringConversionError::SKIP: 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, NULL, status); 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OnStringConversionError::SUBSTITUTE: 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, NULL, status); 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NOTREACHED(); 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline UConverterType utf32_platform_endian() { 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if U_IS_BIG_ENDIAN 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return UCNV_UTF32_BigEndian; 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return UCNV_UTF32_LittleEndian; 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Codepage <-> Wide/UTF-16 --------------------------------------------------- 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool UTF16ToCodepage(const string16& utf16, 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* codepage_name, 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::Type on_error, 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* encoded) { 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded->clear(); 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverter* converter = ucnv_open(codepage_name, &status); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ConvertFromUTF16(converter, utf16.c_str(), 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static_cast<int>(utf16.length()), on_error, encoded); 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool CodepageToUTF16(const std::string& encoded, 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* codepage_name, 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::Type on_error, 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16* utf16) { 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf16->clear(); 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverter* converter = ucnv_open(codepage_name, &status); 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Even in the worst case, the maximum length in 2-byte units of UTF-16 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // output would be at most the same as the number of bytes in input. There 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is no single-byte encoding in which a character is mapped to a 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // non-BMP character requiring two 2-byte units. 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Moreover, non-BMP characters in legacy multibyte encodings 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // BOCU and SCSU, but we don't care about them. 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t uchar_max_length = encoded.length() + 1; 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SetUpErrorHandlerForToUChars(on_error, converter, &status); 1862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) scoped_ptr<char16[]> buffer(new char16[uchar_max_length]); 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int actual_size = ucnv_toUChars(converter, buffer.get(), 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static_cast<int>(uchar_max_length), encoded.data(), 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static_cast<int>(encoded.length()), &status); 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_close(converter); 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) { 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf16->clear(); // Make sure the output is empty on error. 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf16->assign(buffer.get(), actual_size); 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool WideToCodepage(const std::wstring& wide, 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* codepage_name, 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::Type on_error, 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* encoded) { 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return UTF16ToCodepage(wide, codepage_name, on_error, encoded); 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32) 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded->clear(); 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverter* converter = ucnv_open(codepage_name, &status); 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int utf16_len; 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When wchar_t is wider than UChar (16 bits), transform |wide| into a 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // UChar* string. Size the UChar* buffer to be large enough to hold twice 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // as many UTF-16 code units (UChar's) as there are Unicode code points, 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in case each code points translates to a UTF-16 surrogate pair, 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and leave room for a NUL terminator. 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<UChar> utf16(wide.length() * 2 + 1); 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) u_strFromUTF32(&utf16[0], utf16.size(), &utf16_len, 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reinterpret_cast<const UChar32*>(wide.c_str()), 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) wide.length(), &status); 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // defined(WCHAR_T_IS_UTF32) 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool CodepageToWide(const std::string& encoded, 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* codepage_name, 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OnStringConversionError::Type on_error, 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::wstring* wide) { 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return CodepageToUTF16(encoded, codepage_name, on_error, wide); 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32) 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) wide->clear(); 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UConverter* converter = ucnv_open(codepage_name, &status); 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The maximum length in 4 byte unit of UTF-32 output would be 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // at most the same as the number of bytes in input. In the worst 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP), 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this can be 4 times larger than actually needed. 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t wchar_max_length = encoded.length() + 1; 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SetUpErrorHandlerForToUChars(on_error, converter, &status); 2512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) scoped_ptr<wchar_t[]> buffer(new wchar_t[wchar_max_length]); 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), converter, 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reinterpret_cast<char*>(buffer.get()), 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static_cast<int>(wchar_max_length) * sizeof(wchar_t), encoded.data(), 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static_cast<int>(encoded.length()), &status); 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucnv_close(converter); 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) { 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) wide->clear(); // Make sure the output is empty on error. 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // actual_size is # of bytes. 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) wide->assign(buffer.get(), actual_size / sizeof(wchar_t)); 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // defined(WCHAR_T_IS_UTF32) 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ConvertToUtf8AndNormalize(const std::string& text, 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const std::string& charset, 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* result) { 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) result->clear(); 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16 utf16; 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!CodepageToUTF16( 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) text, charset.c_str(), OnStringConversionError::FAIL, &utf16)) 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t max_length = utf16.length() + 1; 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16 normalized_utf16; 2802a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) scoped_ptr<char16[]> buffer(new char16[max_length]); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int actual_length = unorm_normalize( 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf16.c_str(), utf16.length(), UNORM_NFC, 0, 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) buffer.get(), static_cast<int>(max_length), &status); 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!U_SUCCESS(status)) 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) normalized_utf16.assign(buffer.get(), actual_length); 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return UTF16ToUTF8(normalized_utf16.data(), 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) normalized_utf16.length(), result); 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace base 293