15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_string_conversions.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
12868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
13868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
14ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ucnv.h"
15ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ucnv_cb.h"
16ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ucnv_err.h"
17ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/unorm.h"
18ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/ustring.h"
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base {
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in source/common/ucnv_err.c.
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 1995-2006 International Business Machines Corporation
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and others
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All rights reserved.
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Permission is hereby granted, free of charge, to any person obtaining a
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// copy of this software and associated documentation files (the "Software"),
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to deal in the Software without restriction, including without limitation
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the rights to use, copy, modify, merge, publish, distribute, and/or
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sell copies of the Software, and to permit persons to whom the Software
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is furnished to do so, provided that the above copyright notice(s) and
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// this permission notice appear in all copies of the Software and that
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// both the above copyright notice(s) and this permission notice appear in
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// supporting documentation.
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OR PERFORMANCE OF THIS SOFTWARE.
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Except as contained in this notice, the name of a copyright holder
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// shall not be used in advertising or otherwise to promote the sale, use
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// or other dealings in this Software without prior written authorization
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of the copyright holder.
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  ___________________________________________________________________________
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All trademarks and registered trademarks mentioned herein are the property
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of their respective owners.
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ToUnicodeCallbackSubstitute(const void* context,
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 UConverterToUnicodeArgs *to_args,
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 const char* code_units,
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 int32_t length,
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 UConverterCallbackReason reason,
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                 UErrorCode * err) {
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const UChar kReplacementChar = 0xFFFD;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (reason <= UCNV_IRREGULAR) {
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (context == NULL ||
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          (*(reinterpret_cast<const char*>(context)) == 'i' &&
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           reason == UCNV_UNASSIGNED)) {
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        *err = U_ZERO_ERROR;
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // else the caller must have set the error code accordingly.
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // else ignore the reset, close and clone calls.
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      int uchar_len, OnStringConversionError::Type on_error,
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      std::string* encoded) {
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ucnv_getMaxCharSize(converter));
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encoded->resize(encoded_max_length);
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Setup our error handler.
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (on_error) {
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case OnStringConversionError::FAIL:
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            NULL, NULL, &status);
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case OnStringConversionError::SKIP:
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case OnStringConversionError::SUBSTITUTE:
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            NULL, NULL, &status);
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      NOTREACHED();
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ucnv_fromUChars returns size not including terminating null
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      encoded_max_length, uchar_src, uchar_len, &status);
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encoded->resize(actual_size);
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucnv_close(converter);
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_SUCCESS(status))
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return true;
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encoded->clear();  // Make sure the output is empty on error.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set up our error handler for ToUTF-16 converters
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error,
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                  UConverter* converter, UErrorCode* status) {
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (on_error) {
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case OnStringConversionError::FAIL:
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          NULL, NULL, status);
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case OnStringConversionError::SKIP:
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          NULL, NULL, status);
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case OnStringConversionError::SUBSTITUTE:
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0,
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          NULL, NULL, status);
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      NOTREACHED();
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline UConverterType utf32_platform_endian() {
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if U_IS_BIG_ENDIAN
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return UCNV_UTF32_BigEndian;
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return UCNV_UTF32_LittleEndian;
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Codepage <-> Wide/UTF-16  ---------------------------------------------------
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool UTF16ToCodepage(const string16& utf16,
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     const char* codepage_name,
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     OnStringConversionError::Type on_error,
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     std::string* encoded) {
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encoded->clear();
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UConverter* converter = ucnv_open(codepage_name, &status);
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status))
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return ConvertFromUTF16(converter, utf16.c_str(),
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          static_cast<int>(utf16.length()), on_error, encoded);
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool CodepageToUTF16(const std::string& encoded,
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     const char* codepage_name,
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     OnStringConversionError::Type on_error,
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     string16* utf16) {
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  utf16->clear();
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UConverter* converter = ucnv_open(codepage_name, &status);
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status))
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Even in the worst case, the maximum length in 2-byte units of UTF-16
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // output would be at most the same as the number of bytes in input. There
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is no single-byte encoding in which a character is mapped to a
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // non-BMP character requiring two 2-byte units.
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Moreover, non-BMP characters in legacy multibyte encodings
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // BOCU and SCSU, but we don't care about them.
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t uchar_max_length = encoded.length() + 1;
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SetUpErrorHandlerForToUChars(on_error, converter, &status);
1862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  scoped_ptr<char16[]> buffer(new char16[uchar_max_length]);
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int actual_size = ucnv_toUChars(converter, buffer.get(),
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      static_cast<int>(uchar_max_length), encoded.data(),
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      static_cast<int>(encoded.length()), &status);
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucnv_close(converter);
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status)) {
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    utf16->clear();  // Make sure the output is empty on error.
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  utf16->assign(buffer.get(), actual_size);
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool WideToCodepage(const std::wstring& wide,
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    const char* codepage_name,
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    OnStringConversionError::Type on_error,
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    std::string* encoded) {
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16)
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32)
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encoded->clear();
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UConverter* converter = ucnv_open(codepage_name, &status);
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status))
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int utf16_len;
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // When wchar_t is wider than UChar (16 bits), transform |wide| into a
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // UChar* string.  Size the UChar* buffer to be large enough to hold twice
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // as many UTF-16 code units (UChar's) as there are Unicode code points,
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in case each code points translates to a UTF-16 surrogate pair,
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and leave room for a NUL terminator.
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<UChar> utf16(wide.length() * 2 + 1);
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  u_strFromUTF32(&utf16[0], utf16.size(), &utf16_len,
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 reinterpret_cast<const UChar32*>(wide.c_str()),
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 wide.length(), &status);
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // defined(WCHAR_T_IS_UTF32)
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool CodepageToWide(const std::string& encoded,
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    const char* codepage_name,
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    OnStringConversionError::Type on_error,
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    std::wstring* wide) {
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16)
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return CodepageToUTF16(encoded, codepage_name, on_error, wide);
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(WCHAR_T_IS_UTF32)
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  wide->clear();
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UConverter* converter = ucnv_open(codepage_name, &status);
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status))
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The maximum length in 4 byte unit of UTF-32 output would be
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // at most the same as the number of bytes in input. In the worst
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP),
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this can be 4 times larger than actually needed.
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t wchar_max_length = encoded.length() + 1;
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SetUpErrorHandlerForToUChars(on_error, converter, &status);
2512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  scoped_ptr<wchar_t[]> buffer(new wchar_t[wchar_max_length]);
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), converter,
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      reinterpret_cast<char*>(buffer.get()),
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      static_cast<int>(wchar_max_length) * sizeof(wchar_t), encoded.data(),
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      static_cast<int>(encoded.length()), &status);
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucnv_close(converter);
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status)) {
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    wide->clear();  // Make sure the output is empty on error.
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // actual_size is # of bytes.
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  wide->assign(buffer.get(), actual_size / sizeof(wchar_t));
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // defined(WCHAR_T_IS_UTF32)
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ConvertToUtf8AndNormalize(const std::string& text,
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               const std::string& charset,
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                               std::string* result) {
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  result->clear();
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string16 utf16;
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!CodepageToUTF16(
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      text, charset.c_str(), OnStringConversionError::FAIL, &utf16))
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t max_length = utf16.length() + 1;
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string16 normalized_utf16;
2802a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  scoped_ptr<char16[]> buffer(new char16[max_length]);
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int actual_length = unorm_normalize(
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      utf16.c_str(), utf16.length(), UNORM_NFC, 0,
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      buffer.get(), static_cast<int>(max_length), &status);
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!U_SUCCESS(status))
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  normalized_utf16.assign(buffer.get(), actual_length);
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return UTF16ToUTF8(normalized_utf16.data(),
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     normalized_utf16.length(), result);
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace base
293