// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/i18n/icu_string_conversions.h"

#include <vector>

#include "base/basictypes.h"
#include "base/logging.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"
#include "unicode/ucnv_err.h"
#include "unicode/unorm.h"
#include "unicode/ustring.h"

namespace base {

namespace {
// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
// in source/common/ucnv_err.c.

// Copyright (c) 1995-2006 International Business Machines Corporation
// and others
//
// All rights reserved.
//

// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, and/or
// sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, provided that the above copyright notice(s) and
// this permission notice appear in all copies of the Software and that
// both the above copyright notice(s) and this permission notice appear in
// supporting documentation.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
// OR PERFORMANCE OF THIS SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale, use
// or other dealings in this Software without prior written authorization
// of the copyright holder.
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// ___________________________________________________________________________ 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// All trademarks and registered trademarks mentioned herein are the property 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// of their respective owners. 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid ToUnicodeCallbackSubstitute(const void* context, 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverterToUnicodeArgs *to_args, 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char* code_units, 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t length, 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverterCallbackReason reason, 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode * err) { 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static const UChar kReplacementChar = 0xFFFD; 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (reason <= UCNV_IRREGULAR) { 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (context == NULL || 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott (*(reinterpret_cast<const char*>(context)) == 'i' && 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott reason == UCNV_UNASSIGNED)) { 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *err = U_ZERO_ERROR; 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // else the caller must have set the error code accordingly. 
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // else ignore the reset, close and clone calls. 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int uchar_len, OnStringConversionError::Type on_error, 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::string* encoded) { 83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, 84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_getMaxCharSize(converter)); 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott encoded->resize(encoded_max_length); 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Setup our error handler. 
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott switch (on_error) { 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case OnStringConversionError::FAIL: 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, 93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, NULL, &status); 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case OnStringConversionError::SKIP: 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case OnStringConversionError::SUBSTITUTE: 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, NULL, &status); 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott default: 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NOTREACHED(); 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // ucnv_fromUChars returns size not including terminating null 105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], 106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott encoded_max_length, uchar_src, uchar_len, &status); 107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott encoded->resize(actual_size); 108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_close(converter); 109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_SUCCESS(status)) 110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return true; 111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott encoded->clear(); // Make sure the output is empty on error. 
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Set up our error handler for ToUTF-16 converters 116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error, 117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverter* converter, UErrorCode* status) { 118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott switch (on_error) { 119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case OnStringConversionError::FAIL: 120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, 121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, NULL, status); 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case OnStringConversionError::SKIP: 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, NULL, status); 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott case OnStringConversionError::SUBSTITUTE: 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, NULL, status); 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott default: 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NOTREACHED(); 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottinline UConverterType utf32_platform_endian() { 137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if U_IS_BIG_ENDIAN 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return UCNV_UTF32_BigEndian; 139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#else 140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return UCNV_UTF32_LittleEndian; 141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} // namespace 145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst char kCodepageLatin1[] = "ISO-8859-1"; 147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst char kCodepageUTF8[] = "UTF-8"; 148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst char kCodepageUTF16BE[] = "UTF-16BE"; 149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst char kCodepageUTF16LE[] = "UTF-16LE"; 150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Codepage <-> Wide/UTF-16 --------------------------------------------------- 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToCodepage(const string16& utf16, 154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char* codepage_name, 155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott OnStringConversionError::Type on_error, 156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::string* encoded) { 157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott encoded->clear(); 158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverter* converter = ucnv_open(codepage_name, &status); 161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (!U_SUCCESS(status)) 162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertFromUTF16(converter, utf16.c_str(), 165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(utf16.length()), on_error, encoded); 166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool CodepageToUTF16(const std::string& encoded, 169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char* codepage_name, 170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott OnStringConversionError::Type on_error, 171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott string16* utf16) { 172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott utf16->clear(); 173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverter* converter = ucnv_open(codepage_name, &status); 176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (!U_SUCCESS(status)) 177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Even in the worst case, the maximum length in 2-byte units of UTF-16 180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // output would be at most the same as the number of bytes in input. 
There 181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // is no single-byte encoding in which a character is mapped to a 182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // non-BMP character requiring two 2-byte units. 183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Moreover, non-BMP characters in legacy multibyte encodings 185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are 186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // BOCU and SCSU, but we don't care about them. 187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott size_t uchar_max_length = encoded.length() + 1; 188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott SetUpErrorHandlerForToUChars(on_error, converter, &status); 190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int actual_size = ucnv_toUChars(converter, WriteInto(utf16, uchar_max_length), 191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(uchar_max_length), encoded.data(), 192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(encoded.length()), &status); 193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_close(converter); 194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (!U_SUCCESS(status)) { 195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott utf16->clear(); // Make sure the output is empty on error. 
196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott utf16->resize(actual_size); 200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return true; 201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToCodepage(const std::wstring& wide, 204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char* codepage_name, 205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott OnStringConversionError::Type on_error, 206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::string* encoded) { 207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16) 208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return UTF16ToCodepage(wide, codepage_name, on_error, encoded); 209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF32) 210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott encoded->clear(); 211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverter* converter = ucnv_open(codepage_name, &status); 214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (!U_SUCCESS(status)) 215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int utf16_len; 218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // When wchar_t is wider than UChar (16 bits), transform |wide| into a 219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // UChar* string. 
Size the UChar* buffer to be large enough to hold twice 220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // as many UTF-16 code units (UChar's) as there are Unicode code points, 221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // in case each code points translates to a UTF-16 surrogate pair, 222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // and leave room for a NUL terminator. 223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::vector<UChar> utf16(wide.length() * 2 + 1); 224dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen u_strFromUTF32(&utf16[0], utf16.size(), &utf16_len, 225dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen reinterpret_cast<const UChar32*>(wide.c_str()), 226dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen wide.length(), &status); 227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; 228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); 230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif // defined(WCHAR_T_IS_UTF32) 231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool CodepageToWide(const std::string& encoded, 234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char* codepage_name, 235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott OnStringConversionError::Type on_error, 236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::wstring* wide) { 237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16) 238c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return CodepageToUTF16(encoded, codepage_name, on_error, wide); 239c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF32) 
240c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott wide->clear(); 241c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 242c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 243c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UConverter* converter = ucnv_open(codepage_name, &status); 244c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (!U_SUCCESS(status)) 245c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 246c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 247c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // The maximum length in 4 byte unit of UTF-32 output would be 248c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // at most the same as the number of bytes in input. In the worst 249c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP), 250c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // this can be 4 times larger than actually needed. 
251c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott size_t wchar_max_length = encoded.length() + 1; 252c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 253c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott SetUpErrorHandlerForToUChars(on_error, converter, &status); 254c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), converter, 255c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)), 256c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(wchar_max_length) * sizeof(wchar_t), encoded.data(), 257c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(encoded.length()), &status); 258c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucnv_close(converter); 259c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (!U_SUCCESS(status)) { 260c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott wide->clear(); // Make sure the output is empty on error. 261c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return false; 262c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 263c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 264c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // actual_size is # of bytes. 
265c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott wide->resize(actual_size / sizeof(wchar_t)); 266c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return true; 267c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif // defined(WCHAR_T_IS_UTF32) 268c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 269c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 270513209b27ff55e2841eac0e4120199c23acce758Ben Murdochbool ConvertToUtf8AndNormalize(const std::string& text, 271513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch const std::string& charset, 272513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch std::string* result) { 273513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch result->clear(); 274513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch string16 utf16; 275513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch if (!CodepageToUTF16( 276513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch text, charset.c_str(), OnStringConversionError::FAIL, &utf16)) 277513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch return false; 278513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch 279513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch UErrorCode status = U_ZERO_ERROR; 280513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch size_t max_length = utf16.length() + 1; 281513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch string16 normalized_utf16; 282513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch int actual_length = unorm_normalize( 283513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch utf16.c_str(), utf16.length(), UNORM_NFC, 0, 284513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch WriteInto(&normalized_utf16, max_length), 285513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch static_cast<int>(max_length), &status); 286513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch if (!U_SUCCESS(status)) 287513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch return false; 288513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch 
normalized_utf16.resize(actual_length); 289513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch 290513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch return UTF16ToUTF8(normalized_utf16.data(), 291513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch normalized_utf16.length(), result); 292513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch} 293513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch 294c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} // namespace base 295