// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
52a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
62a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// This should only be used by the various UTF string conversion files.
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/base_export.h"
11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string16.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base {
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline bool IsValidCodepoint(uint32 code_point) {
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // codepoints larger than 0x10FFFF (the highest codepoint allowed).
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Non-characters and unassigned codepoints are allowed.
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return code_point < 0xD800u ||
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         (code_point >= 0xE000u && code_point <= 0x10FFFFu);
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline bool IsValidCharacter(uint32 code_point) {
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // 0xFFFE or 0xFFFF) from the set of valid code points.
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return code_point < 0xD800u || (code_point >= 0xE000u &&
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      code_point < 0xFDD0u) || (code_point > 0xFDEFu &&
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu);
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// ReadUnicodeCharacter --------------------------------------------------------
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reads a UTF-8 stream, placing the next code point into the given output
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// |*code_point|. |src| represents the entire string to read, and |*char_index|
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is the character offset within the string to start reading at. |*char_index|
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// will be updated to index the last character read, such that incrementing it
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (as in a for loop) will take the reader to the next character.
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true on success. On false, |*code_point| will be invalid.
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      int32 src_len,
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      int32* char_index,
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      uint32* code_point_out);
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT bool ReadUnicodeCharacter(const char16* src,
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      int32 src_len,
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      int32* char_index,
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      uint32* code_point);
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
#if defined(WCHAR_T_IS_UTF32)
// Reads a UTF-32 character from |src|. The parameters and the return value are
// used the same way as in the 8-bit version above. Only declared when wchar_t
// is UTF-32.
BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
                                      int32 src_len,
                                      int32* char_index,
                                      uint32* code_point);
#endif  // defined(WCHAR_T_IS_UTF32)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// WriteUnicodeCharacter -------------------------------------------------------
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends a UTF-8 character to the given 8-bit string.  Returns the number of
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// bytes written.
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// TODO(brettw) Bug 79631: This function should not be exposed.
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point,
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         std::string* output);
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends the given code point as a UTF-16 character to the given 16-bit
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// string.  Returns the number of 16-bit values written.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point, string16* output);
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
#if defined(WCHAR_T_IS_UTF32)
// Appends the given UTF-32 character to the 32-bit string |*output|. Returns
// the number of 32-bit values written, which is always 1.
inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
  // This is the easy case, just append the character. The cast spells out the
  // uint32 -> wchar_t conversion that push_back() would otherwise do
  // implicitly.
  output->push_back(static_cast<wchar_t>(code_point));
  return 1;
}
#endif  // defined(WCHAR_T_IS_UTF32)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Generalized Unicode converter -----------------------------------------------
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Prepares |*output| to receive UTF-8 converted from |src|: guesses the length
// of the output in bytes, clears the output string, and reserves that amount
// of space. We assume that the input character types are unsigned, which will
// be true for UTF-16 and -32 on our systems.
template<typename CHAR>
void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Prepares an output buffer |*output| (containing either UTF-16 or -32 data)
// given some UTF-8 input |src| that will be converted to it.  See
// PrepareForUTF8Output().
template<typename STRING>
void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace base
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif  // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
98