// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/i18n/icu_string_conversions.h"
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <vector>
8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/basictypes.h"
10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/logging.h"
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/string_util.h"
12513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch#include "base/utf_string_conversions.h"
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ucnv.h"
14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ucnv_cb.h"
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ucnv_err.h"
16513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch#include "unicode/unorm.h"
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ustring.h"
18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace base {
20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace {
// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
// in source/common/ucnv_err.c.
24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Copyright (c) 1995-2006 International Business Machines Corporation
// and others
//
// All rights reserved.
//

// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, and/or
// sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, provided that the above copyright notice(s) and
// this permission notice appear in all copies of the Software and that
// both the above copyright notice(s) and this permission notice appear in
// supporting documentation.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
// OR PERFORMANCE OF THIS SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale, use
// or other dealings in this Software without prior written authorization
// of the copyright holder.

//  ___________________________________________________________________________
//
// All trademarks and registered trademarks mentioned herein are the property
// of their respective owners.
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid ToUnicodeCallbackSubstitute(const void* context,
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                 UConverterToUnicodeArgs *to_args,
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                 const char* code_units,
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                 int32_t length,
65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                 UConverterCallbackReason reason,
66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                 UErrorCode * err) {
67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static const UChar kReplacementChar = 0xFFFD;
68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (reason <= UCNV_IRREGULAR) {
69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      if (context == NULL ||
70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott          (*(reinterpret_cast<const char*>(context)) == 'i' &&
71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott           reason == UCNV_UNASSIGNED)) {
72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        *err = U_ZERO_ERROR;
73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      }
75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // else the caller must have set the error code accordingly.
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // else ignore the reset, close and clone calls.
78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      int uchar_len, OnStringConversionError::Type on_error,
82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      std::string* encoded) {
83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      ucnv_getMaxCharSize(converter));
85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  encoded->resize(encoded_max_length);
86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UErrorCode status = U_ZERO_ERROR;
88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Setup our error handler.
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  switch (on_error) {
91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    case OnStringConversionError::FAIL:
92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                            NULL, NULL, &status);
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      break;
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    case OnStringConversionError::SKIP:
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    case OnStringConversionError::SUBSTITUTE:
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                            NULL, NULL, &status);
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      break;
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    default:
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      NOTREACHED();
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // ucnv_fromUChars returns size not including terminating null
105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      encoded_max_length, uchar_src, uchar_len, &status);
107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  encoded->resize(actual_size);
108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  ucnv_close(converter);
109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (U_SUCCESS(status))
110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return true;
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  encoded->clear();  // Make sure the output is empty on error.
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return false;
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Set up our error handler for ToUTF-16 converters
116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error,
117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                  UConverter* converter, UErrorCode* status) {
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  switch (on_error) {
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    case OnStringConversionError::FAIL:
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                          NULL, NULL, status);
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      break;
123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    case OnStringConversionError::SKIP:
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                          NULL, NULL, status);
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      break;
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    case OnStringConversionError::SUBSTITUTE:
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0,
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                          NULL, NULL, status);
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      break;
131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    default:
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      NOTREACHED();
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottinline UConverterType utf32_platform_endian() {
137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if U_IS_BIG_ENDIAN
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return UCNV_UTF32_BigEndian;
139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#else
140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return UCNV_UTF32_LittleEndian;
141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif
142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}  // namespace
145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Names of commonly used codepages, for use as the |codepage_name| argument
// of the conversion functions in this file.
const char kCodepageLatin1[] = "ISO-8859-1";
const char kCodepageUTF8[] = "UTF-8";
const char kCodepageUTF16BE[] = "UTF-16BE";
const char kCodepageUTF16LE[] = "UTF-16LE";
150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Codepage <-> Wide/UTF-16  ---------------------------------------------------
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool UTF16ToCodepage(const string16& utf16,
154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     const char* codepage_name,
155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     OnStringConversionError::Type on_error,
156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     std::string* encoded) {
157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  encoded->clear();
158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UErrorCode status = U_ZERO_ERROR;
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UConverter* converter = ucnv_open(codepage_name, &status);
161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!U_SUCCESS(status))
162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertFromUTF16(converter, utf16.c_str(),
165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                          static_cast<int>(utf16.length()), on_error, encoded);
166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool CodepageToUTF16(const std::string& encoded,
169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     const char* codepage_name,
170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     OnStringConversionError::Type on_error,
171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     string16* utf16) {
172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  utf16->clear();
173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UErrorCode status = U_ZERO_ERROR;
175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UConverter* converter = ucnv_open(codepage_name, &status);
176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!U_SUCCESS(status))
177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Even in the worst case, the maximum length in 2-byte units of UTF-16
180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // output would be at most the same as the number of bytes in input. There
181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // is no single-byte encoding in which a character is mapped to a
182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // non-BMP character requiring two 2-byte units.
183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //
184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Moreover, non-BMP characters in legacy multibyte encodings
185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // BOCU and SCSU, but we don't care about them.
187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  size_t uchar_max_length = encoded.length() + 1;
188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  SetUpErrorHandlerForToUChars(on_error, converter, &status);
190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int actual_size = ucnv_toUChars(converter, WriteInto(utf16, uchar_max_length),
191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      static_cast<int>(uchar_max_length), encoded.data(),
192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      static_cast<int>(encoded.length()), &status);
193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  ucnv_close(converter);
194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!U_SUCCESS(status)) {
195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    utf16->clear();  // Make sure the output is empty on error.
196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  utf16->resize(actual_size);
200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return true;
201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool WideToCodepage(const std::wstring& wide,
204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    const char* codepage_name,
205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    OnStringConversionError::Type on_error,
206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    std::string* encoded) {
207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16)
208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF32)
210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  encoded->clear();
211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UErrorCode status = U_ZERO_ERROR;
213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UConverter* converter = ucnv_open(codepage_name, &status);
214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!U_SUCCESS(status))
215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int utf16_len;
218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // When wchar_t is wider than UChar (16 bits), transform |wide| into a
219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // UChar* string.  Size the UChar* buffer to be large enough to hold twice
220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // as many UTF-16 code units (UChar's) as there are Unicode code points,
221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // in case each code points translates to a UTF-16 surrogate pair,
222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // and leave room for a NUL terminator.
223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  std::vector<UChar> utf16(wide.length() * 2 + 1);
224dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen  u_strFromUTF32(&utf16[0], utf16.size(), &utf16_len,
225dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen                 reinterpret_cast<const UChar32*>(wide.c_str()),
226dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen                 wide.length(), &status);
227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);
230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif  // defined(WCHAR_T_IS_UTF32)
231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool CodepageToWide(const std::string& encoded,
234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    const char* codepage_name,
235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    OnStringConversionError::Type on_error,
236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    std::wstring* wide) {
237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16)
238c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return CodepageToUTF16(encoded, codepage_name, on_error, wide);
239c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(WCHAR_T_IS_UTF32)
240c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  wide->clear();
241c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
242c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UErrorCode status = U_ZERO_ERROR;
243c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  UConverter* converter = ucnv_open(codepage_name, &status);
244c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!U_SUCCESS(status))
245c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
246c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
247c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // The maximum length in 4 byte unit of UTF-32 output would be
248c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // at most the same as the number of bytes in input. In the worst
249c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP),
250c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // this can be 4 times larger than actually needed.
251c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  size_t wchar_max_length = encoded.length() + 1;
252c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
253c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  SetUpErrorHandlerForToUChars(on_error, converter, &status);
254c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), converter,
255c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)),
256c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      static_cast<int>(wchar_max_length) * sizeof(wchar_t), encoded.data(),
257c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      static_cast<int>(encoded.length()), &status);
258c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  ucnv_close(converter);
259c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!U_SUCCESS(status)) {
260c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    wide->clear();  // Make sure the output is empty on error.
261c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
262c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
263c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
264c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // actual_size is # of bytes.
265c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  wide->resize(actual_size / sizeof(wchar_t));
266c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return true;
267c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif  // defined(WCHAR_T_IS_UTF32)
268c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
269c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
270513209b27ff55e2841eac0e4120199c23acce758Ben Murdochbool ConvertToUtf8AndNormalize(const std::string& text,
271513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch                               const std::string& charset,
272513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch                               std::string* result) {
273513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  result->clear();
274513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  string16 utf16;
275513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  if (!CodepageToUTF16(
276513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch      text, charset.c_str(), OnStringConversionError::FAIL, &utf16))
277513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch    return false;
278513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch
279513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  UErrorCode status = U_ZERO_ERROR;
280513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  size_t max_length = utf16.length() + 1;
281513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  string16 normalized_utf16;
282513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  int actual_length = unorm_normalize(
283513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch      utf16.c_str(), utf16.length(), UNORM_NFC, 0,
284513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch      WriteInto(&normalized_utf16, max_length),
285513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch      static_cast<int>(max_length), &status);
286513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  if (!U_SUCCESS(status))
287513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch    return false;
288513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  normalized_utf16.resize(actual_length);
289513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch
290513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch  return UTF16ToUTF8(normalized_utf16.data(),
291513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch                     normalized_utf16.length(), result);
292513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch}
293513209b27ff55e2841eac0e4120199c23acce758Ben Murdoch
294c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}  // namespace base
295