13345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Use of this source code is governed by a BSD-style license that can be
3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// found in the LICENSE file.
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/utf_string_conversions.h"
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/string_piece.h"
83345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/string_util.h"
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/utf_string_conversion_utils.h"
10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::PrepareForUTF8Output;
12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::PrepareForUTF16Or32Output;
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::ReadUnicodeCharacter;
14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottusing base::WriteUnicodeCharacter;
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace {
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Generalized Unicode converter -----------------------------------------------
19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Converts the given source Unicode character type to the given destination
21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unicode character type as a STL string. The given input buffer and size
22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// determine the source, and the given output STL string will be replaced by
23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the result.
24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<typename SRC_CHAR, typename DEST_STRING>
25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool ConvertUnicode(const SRC_CHAR* src,
26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    size_t src_len,
27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    DEST_STRING* output) {
28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // ICU requires 32-bit numbers.
29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  bool success = true;
30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int32 src_len32 = static_cast<int32>(src_len);
31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  for (int32 i = 0; i < src_len32; i++) {
32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint32 code_point;
33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      WriteUnicodeCharacter(code_point, output);
35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    } else {
36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      WriteUnicodeCharacter(0xFFFD, output);
37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      success = false;
38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return success;
42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}  // namespace
45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF-8 <-> Wide --------------------------------------------------------------
47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  PrepareForUTF8Output(src, src_len, output);
50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertUnicode(src, src_len, output);
51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string WideToUTF8(const std::wstring& wide) {
54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  std::string ret;
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Ignore the success flag of this call, it will do the best it can for
56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // invalid input, which is what we want here.
57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  WideToUTF8(wide.data(), wide.length(), &ret);
58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ret;
59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  PrepareForUTF16Or32Output(src, src_len, output);
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertUnicode(src, src_len, output);
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::wstring UTF8ToWide(const base::StringPiece& utf8) {
67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  std::wstring ret;
68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UTF8ToWide(utf8.data(), utf8.length(), &ret);
69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ret;
70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF-16 <-> Wide -------------------------------------------------------------
73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16)
75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// When wide == UTF-16, then conversions are a NOP.
77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->assign(src, src_len);
79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return true;
80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 WideToUTF16(const std::wstring& wide) {
83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return wide;
84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->assign(src, src_len);
88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return true;
89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::wstring UTF16ToWide(const string16& utf16) {
92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return utf16;
93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF32)
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->clear();
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Assume that normally we won't have any non-BMP characters so the counts
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // will be the same.
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->reserve(src_len);
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertUnicode(src, src_len, output);
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 WideToUTF16(const std::wstring& wide) {
106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  string16 ret;
107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  WideToUTF16(wide.data(), wide.length(), &ret);
108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ret;
109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->clear();
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Assume that normally we won't have any non-BMP characters so the counts
114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // will be the same.
115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->reserve(src_len);
116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertUnicode(src, src_len, output);
117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::wstring UTF16ToWide(const string16& utf16) {
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  std::wstring ret;
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UTF16ToWide(utf16.data(), utf16.length(), &ret);
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ret;
123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif  // defined(WCHAR_T_IS_UTF32)
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF16 <-> UTF8 --------------------------------------------------------------
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF32)
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  PrepareForUTF16Or32Output(src, src_len, output);
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertUnicode(src, src_len, output);
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
13672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 UTF8ToUTF16(const base::StringPiece& utf8) {
137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  string16 ret;
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Ignore the success flag of this call, it will do the best it can for
139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // invalid input, which is what we want here.
140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ret;
142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  PrepareForUTF8Output(src, src_len, output);
146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertUnicode(src, src_len, output);
147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string UTF16ToUTF8(const string16& utf16) {
150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  std::string ret;
151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Ignore the success flag of this call, it will do the best it can for
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // invalid input, which is what we want here.
153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ret;
155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF16)
158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Easy case since we can use the "wide" versions we already wrote above.
159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return UTF8ToWide(src, src_len, output);
162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
16472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 UTF8ToUTF16(const base::StringPiece& utf8) {
165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return UTF8ToWide(utf8);
166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return WideToUTF8(src, src_len, output);
170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string UTF16ToUTF8(const string16& utf16) {
173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return WideToUTF8(utf16);
174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif
1773345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick
17872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstd::wstring ASCIIToWide(const base::StringPiece& ascii) {
1793345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  DCHECK(IsStringASCII(ascii)) << ascii;
1803345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  return std::wstring(ascii.begin(), ascii.end());
1813345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick}
1823345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick
18372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 ASCIIToUTF16(const base::StringPiece& ascii) {
1843345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  DCHECK(IsStringASCII(ascii)) << ascii;
1853345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  return string16(ascii.begin(), ascii.end());
1863345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick}
187