13345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Use of this source code is governed by a BSD-style license that can be 3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// found in the LICENSE file. 4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/utf_string_conversions.h" 6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/string_piece.h" 83345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/string_util.h" 9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/utf_string_conversion_utils.h" 10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::PrepareForUTF8Output; 12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::PrepareForUTF16Or32Output; 13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::ReadUnicodeCharacter; 14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::WriteUnicodeCharacter; 15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace { 17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Generalized Unicode converter ----------------------------------------------- 19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Converts the given source Unicode character type to the given destination 21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unicode character type as a STL string. The given input buffer and size 22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// determine the source, and the given output STL string will be replaced by 23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the result. 24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<typename SRC_CHAR, typename DEST_STRING> 25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool ConvertUnicode(const SRC_CHAR* src, 26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott size_t src_len, 27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DEST_STRING* output) { 28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // ICU requires 32-bit numbers. 29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bool success = true; 30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32 src_len32 = static_cast<int32>(src_len); 31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32 i = 0; i < src_len32; i++) { 32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32 code_point; 33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { 34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott WriteUnicodeCharacter(code_point, output); 35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott WriteUnicodeCharacter(0xFFFD, output); 37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott success = false; 38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return success; 42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} // namespace 45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF-8 <-> Wide -------------------------------------------------------------- 47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { 49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott PrepareForUTF8Output(src, src_len, output); 50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertUnicode(src, src_len, output); 51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string WideToUTF8(const std::wstring& wide) { 54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::string ret; 55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Ignore the success flag of this call, it will do the best it can for 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // invalid input, which is what we want here. 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott WideToUTF8(wide.data(), wide.length(), &ret); 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ret; 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott PrepareForUTF16Or32Output(src, src_len, output); 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertUnicode(src, src_len, output); 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::wstring UTF8ToWide(const base::StringPiece& utf8) { 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::wstring ret; 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UTF8ToWide(utf8.data(), utf8.length(), &ret); 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ret; 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF-16 <-> Wide ------------------------------------------------------------- 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16) 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// When wide == UTF-16, then conversions are a NOP. 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott output->assign(src, src_len); 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return true; 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 WideToUTF16(const std::wstring& wide) { 83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return wide; 84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott output->assign(src, src_len); 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return true; 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::wstring UTF16ToWide(const string16& utf16) { 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return utf16; 93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF32) 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott output->clear(); 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Assume that normally we won't have any non-BMP characters so the counts 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // will be the same. 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott output->reserve(src_len); 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertUnicode(src, src_len, output); 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 WideToUTF16(const std::wstring& wide) { 106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott string16 ret; 107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott WideToUTF16(wide.data(), wide.length(), &ret); 108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ret; 109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { 112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott output->clear(); 113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Assume that normally we won't have any non-BMP characters so the counts 114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // will be the same. 115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott output->reserve(src_len); 116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertUnicode(src, src_len, output); 117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::wstring UTF16ToWide(const string16& utf16) { 120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::wstring ret; 121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UTF16ToWide(utf16.data(), utf16.length(), &ret); 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ret; 123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif // defined(WCHAR_T_IS_UTF32) 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF16 <-> UTF8 -------------------------------------------------------------- 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF32) 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott PrepareForUTF16Or32Output(src, src_len, output); 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertUnicode(src, src_len, output); 134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 13672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 UTF8ToUTF16(const base::StringPiece& utf8) { 137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott string16 ret; 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Ignore the success flag of this call, it will do the best it can for 139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // invalid input, which is what we want here. 140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UTF8ToUTF16(utf8.data(), utf8.length(), &ret); 141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ret; 142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { 145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott PrepareForUTF8Output(src, src_len, output); 146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ConvertUnicode(src, src_len, output); 147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string UTF16ToUTF8(const string16& utf16) { 150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott std::string ret; 151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Ignore the success flag of this call, it will do the best it can for 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // invalid input, which is what we want here. 153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UTF16ToUTF8(utf16.data(), utf16.length(), &ret); 154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return ret; 155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF16) 158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Easy case since we can use the "wide" versions we already wrote above. 159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { 161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return UTF8ToWide(src, src_len, output); 162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 16472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 UTF8ToUTF16(const base::StringPiece& utf8) { 165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return UTF8ToWide(utf8); 166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { 169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return WideToUTF8(src, src_len, output); 170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string UTF16ToUTF8(const string16& utf16) { 173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return WideToUTF8(utf16); 174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 1773345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick 17872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstd::wstring ASCIIToWide(const base::StringPiece& ascii) { 1793345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick DCHECK(IsStringASCII(ascii)) << ascii; 1803345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick return std::wstring(ascii.begin(), ascii.end()); 1813345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick} 1823345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick 18372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 ASCIIToUTF16(const base::StringPiece& ascii) { 1843345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick DCHECK(IsStringASCII(ascii)) << ascii; 1853345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick return string16(ascii.begin(), ascii.end()); 1863345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick} 187