15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 52a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_ 62a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_ 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This should only be used by the various UTF string conversion files. 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/base_export.h" 11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string16.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base { 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline bool IsValidCodepoint(uint32 code_point) { 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Excludes the surrogate code points ([0xD800, 0xDFFF]) and 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // codepoints larger than 0x10FFFF (the highest codepoint allowed). 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Non-characters and unassigned codepoints are allowed. 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return code_point < 0xD800u || 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (code_point >= 0xE000u && code_point <= 0x10FFFFu); 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline bool IsValidCharacter(uint32 code_point) { 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 0xFFFE or 0xFFFF) from the set of valid code points. 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return code_point < 0xD800u || (code_point >= 0xE000u && 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code_point < 0xFDD0u) || (code_point > 0xFDEFu && 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu); 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ReadUnicodeCharacter -------------------------------------------------------- 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reads a UTF-8 stream, placing the next code point into the given output 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// |*code_point|. |src| represents the entire string to read, and |*char_index| 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is the character offset within the string to start reading at. |*char_index| 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// will be updated to index the last character read, such that incrementing it 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (as in a for loop) will take the reader to the next character. 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true on success. On false, |*code_point| will be invalid. 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT bool ReadUnicodeCharacter(const char* src, 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32 src_len, 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32* char_index, 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32* code_point_out); 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reads a UTF-16 character. The usage is the same as the 8-bit version above. 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT bool ReadUnicodeCharacter(const char16* src, 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32 src_len, 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32* char_index, 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32* code_point); 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF32) 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reads UTF-32 character. The usage is the same as the 8-bit version above. 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src, 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32 src_len, 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32* char_index, 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32* code_point); 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // defined(WCHAR_T_IS_UTF32) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// WriteUnicodeCharacter ------------------------------------------------------- 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends a UTF-8 character to the given 8-bit string. Returns the number of 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// bytes written. 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// TODO(brettw) Bug 79631: This function should not be exposed. 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point, 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* output); 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends the given code point as a UTF-16 character to the given 16-bit 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// string. Returns the number of 16-bit values written. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point, string16* output); 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF32) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends the given UTF-32 character to the given 32-bit string. Returns the 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// number of 32-bit values written. 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This is the easy case, just append the character. 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) output->push_back(code_point); 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 1; 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // defined(WCHAR_T_IS_UTF32) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Generalized Unicode converter ----------------------------------------------- 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Guesses the length of the output in UTF-8 in bytes, clears that output 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// string, and reserves that amount of space. We assume that the input 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// character types are unsigned, which will be true for UTF-16 and -32 on our 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// systems. 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)template<typename CHAR> 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Prepares an output buffer (containing either UTF-16 or -32 data) given some 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UTF-8 input that will be converted to it. See PrepareForUTF8Output(). 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)template<typename STRING> 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output); 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace base 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_ 98