1c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root/* 2c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Copyright (C) 2005 The Android Open Source Project 3c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 4c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Licensed under the Apache License, Version 2.0 (the "License"); 5c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * you may not use this file except in compliance with the License. 6c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * You may obtain a copy of the License at 7c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 8c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * http://www.apache.org/licenses/LICENSE-2.0 9c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 10c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Unless required by applicable law or agreed to in writing, software 11c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * distributed under the License is distributed on an "AS IS" BASIS, 12c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * See the License for the specific language governing permissions and 14c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * limitations under the License. 
15c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root */ 16c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 17c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#include <utils/Unicode.h> 18c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 19c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#include <stddef.h> 20c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 21c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#ifdef HAVE_WINSOCK 22c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# undef nhtol 23c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# undef htonl 24c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# undef nhtos 25c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# undef htons 26c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 27c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# ifdef HAVE_LITTLE_ENDIAN 28c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) 29c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define htonl(x) ntohl(x) 30c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) 31c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define htons(x) ntohs(x) 32c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# else 33c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define ntohl(x) (x) 34c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define htonl(x) (x) 35c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define ntohs(x) (x) 36c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# define htons(x) (x) 37c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# endif 38c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#else 39c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root# include <netinet/in.h> 40c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#endif 41c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 42c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootextern "C" { 

// Trailing bytes of a UTF-8 sequence have the form 10xxxxxx: OR-ing in
// kByteMark sets the top bit and AND-ing with kByteMask clears bit 6.
static const char32_t kByteMask = 0x000000BF;
static const char32_t kByteMark = 0x00000080;

// Surrogates aren't valid for UTF-32 characters, so define some
// constants that will let us screen them out.
static const char32_t kUnicodeSurrogateHighStart  = 0x0000D800;
static const char32_t kUnicodeSurrogateHighEnd    = 0x0000DBFF;
static const char32_t kUnicodeSurrogateLowStart   = 0x0000DC00;
static const char32_t kUnicodeSurrogateLowEnd     = 0x0000DFFF;
static const char32_t kUnicodeSurrogateStart      = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd        = kUnicodeSurrogateLowEnd;
static const char32_t kUnicodeMaxCodepoint        = 0x0010FFFF;

// Mask used to set appropriate bits in first byte of UTF-8 sequence,
// indexed by number of bytes in the sequence.
// 0xxxxxxx
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
// 110yyyyx 10xxxxxx
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
// 1110yyyy 10yxxxxx 10xxxxxx
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
static const char32_t kFirstByteMark[] = {
    0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
};

// --------------------------------------------------------------------------
// UTF-32
// --------------------------------------------------------------------------

/**
 * Number of UTF-8 bytes needed to encode srcChar.
 *
 * Returns 1 to 4 for an encodable code point, and 0 when srcChar lies in
 * the surrogate range or is above kUnicodeMaxCodepoint.
 */
static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
{
    // Anything beyond the last Unicode code point cannot be encoded.
    if (srcChar > kUnicodeMaxCodepoint) {
        return 0;
    }
    // Supplementary planes always take four bytes.
    if (srcChar >= 0x00010000) {
        return 4;
    }
    if (srcChar >= 0x00000800) {
        // Three bytes, unless the value is a UTF-16 surrogate, which is
        // not a valid UTF-32 character.
        return (srcChar >= kUnicodeSurrogateStart && srcChar <= kUnicodeSurrogateEnd) ? 0 : 3;
    }
    return (srcChar >= 0x00000080) ? 2 : 1;
}

// Write out the source character to <dstP>.
104c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 105c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) 106c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 107c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root dstP += bytes; 108c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root switch (bytes) 109c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root { /* note: everything falls through. */ 110c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 111c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 112c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 113c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); 114c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 115c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 116c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 117c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootsize_t strlen32(const char32_t *s) 118c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 119c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char32_t *ss = s; 120c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while ( *ss ) 121c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ss++; 122c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ss-s; 123c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 124c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 125c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootsize_t strnlen32(const char32_t *s, size_t maxlen) 126c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 127c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char32_t *ss = s; 128c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while ((maxlen > 0) 
&& *ss) { 129c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ss++; 130c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root maxlen--; 131c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 132c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ss-s; 133c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 134c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 135c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline int32_t utf32_at_internal(const char* cur, size_t *num_read) 136c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 137c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char first_char = *cur; 138c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((first_char & 0x80) == 0) { // ASCII 139c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *num_read = 1; 140c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return *cur; 141c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 142c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root cur++; 143c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char32_t mask, to_ignore_mask; 144c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t num_to_read = 0; 145c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char32_t utf32 = first_char; 146c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; 147c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root (first_char & mask); 148c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 149c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // 0x3F == 00111111 150c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 = (utf32 << 6) + (*cur++ & 0x3F); 151c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 152c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root to_ignore_mask |= mask; 153c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); 154c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 
155c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *num_read = num_to_read; 156c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return static_cast<int32_t>(utf32); 157c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 158c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 159c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootint32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) 160c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 161c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (index >= src_len) { 162c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 163c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 164c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t dummy_index; 165c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (next_index == NULL) { 166c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root next_index = &dummy_index; 167c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 168c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t num_read; 169c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root int32_t ret = utf32_at_internal(src + index, &num_read); 170c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (ret >= 0) { 171c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *next_index = index + num_read; 172c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 173c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 174c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ret; 175c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 176c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 177c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) 178c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 179c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (src == NULL || src_len == 0) { 180c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 181c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 
182c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 183c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t ret = 0; 184c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char32_t *end = src + src_len; 185c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (src < end) { 186c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ret += utf32_codepoint_utf8_length(*src++); 187c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 188c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ret; 189c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 190c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 191c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootvoid utf32_to_utf8(const char32_t* src, size_t src_len, char* dst) 192c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 193c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (src == NULL || src_len == 0 || dst == NULL) { 194c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return; 195c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 196c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 197c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char32_t *cur_utf32 = src; 198c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char32_t *end_utf32 = src + src_len; 199c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char *cur = dst; 200c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (cur_utf32 < end_utf32) { 201c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t len = utf32_codepoint_utf8_length(*cur_utf32); 202c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); 203c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root cur += len; 204c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 205c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *cur = '\0'; 206c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 207c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 208c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// 
-------------------------------------------------------------------------- 209c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// UTF-16 210c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// -------------------------------------------------------------------------- 211c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 212c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootint strcmp16(const char16_t *s1, const char16_t *s2) 213c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 214c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char16_t ch; 215c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root int d = 0; 216c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 217c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while ( 1 ) { 218c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root d = (int)(ch = *s1++) - (int)*s2++; 219c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ( d || !ch ) 220c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root break; 221c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 222c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 223c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return d; 224c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 225c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 226c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootint strncmp16(const char16_t *s1, const char16_t *s2, size_t n) 227c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 228c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char16_t ch; 229c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root int d = 0; 230c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 231c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while ( n-- ) { 232c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root d = (int)(ch = *s1++) - (int)*s2++; 233c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ( d || !ch ) 234c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root break; 235c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 236c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny 
Root 237c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return d; 238c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 239c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 240c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootchar16_t *strcpy16(char16_t *dst, const char16_t *src) 241c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 242c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char16_t *q = dst; 243c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t *p = src; 244c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char16_t ch; 245c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 246c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root do { 247c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *q++ = ch = *p++; 248c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } while ( ch ); 249c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 250c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return dst; 251c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 252c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 253c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootsize_t strlen16(const char16_t *s) 254c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 255c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t *ss = s; 256c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while ( *ss ) 257c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ss++; 258c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ss-s; 259c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 260c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 261c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 262c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootchar16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) 263c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 264c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char16_t *q = dst; 265c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t *p = src; 266c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny 
Root char ch; 267c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 268c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (n) { 269c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root n--; 270c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *q++ = ch = *p++; 271c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ( !ch ) 272c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root break; 273c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 274c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 275c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *q = 0; 276c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 277c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return dst; 278c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 279c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 280c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootsize_t strnlen16(const char16_t *s, size_t maxlen) 281c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 282c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t *ss = s; 283c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 284c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root /* Important: the maxlen test must precede the reference through ss; 285c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root since the byte beyond the maximum may segfault */ 286c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while ((maxlen > 0) && *ss) { 287c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ss++; 288c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root maxlen--; 289c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 290c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ss-s; 291c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 292c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 293c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootint strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) 294c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 295c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const 
char16_t* e1 = s1+n1; 296c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t* e2 = s2+n2; 297c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 298c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (s1 < e1 && s2 < e2) { 299c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const int d = (int)*s1++ - (int)*s2++; 300c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (d) { 301c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return d; 302c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 303c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 304c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 305c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return n1 < n2 306c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ? (0 - (int)*s2) 307c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root : (n1 > n2 308c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ? ((int)*s1 - 0) 309c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root : 0); 310c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 311c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 312c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootint strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) 313c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 314c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t* e1 = s1H+n1; 315c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t* e2 = s2N+n2; 316c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 317c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (s1H < e1 && s2N < e2) { 318c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t c2 = ntohs(*s2N); 319c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const int d = (int)*s1H++ - (int)c2; 320c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root s2N++; 321c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (d) { 322c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return d; 323c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 
324c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 325c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 326c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return n1 < n2 327c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ? (0 - (int)ntohs(*s2N)) 328c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root : (n1 > n2 329c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ? ((int)*s1H - 0) 330c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root : 0); 331c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 332c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 333c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootvoid utf16_to_utf8(const char16_t* src, size_t src_len, char* dst) 334c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 335c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (src == NULL || src_len == 0 || dst == NULL) { 336c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return; 337c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 338c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 339c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t* cur_utf16 = src; 340c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t* const end_utf16 = src + src_len; 341c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char *cur = dst; 342c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (cur_utf16 < end_utf16) { 343c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char32_t utf32; 344c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // surrogate pairs 345c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((*cur_utf16 & 0xFC00) == 0xD800) { 346c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 = (*cur_utf16++ - 0xD800) << 10; 347c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 |= *cur_utf16++ - 0xDC00; 348c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 += 0x10000; 349c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } else { 350c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 = (char32_t) *cur_utf16++; 
351c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 352c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const size_t len = utf32_codepoint_utf8_length(utf32); 353c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); 354c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root cur += len; 355c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 356c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *cur = '\0'; 357c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 358c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 359c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// -------------------------------------------------------------------------- 360c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// UTF-8 361c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// -------------------------------------------------------------------------- 362c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 363c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf8_length(const char *src) 364c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 365c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char *cur = src; 366c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t ret = 0; 367c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (*cur != '\0') { 368c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char first_char = *cur++; 369c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((first_char & 0x80) == 0) { // ASCII 370c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ret += 1; 371c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root continue; 372c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 373c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // (UTF-8's character must not be like 10xxxxxx, 374c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // but 110xxxxx, 1110xxxx, ... 
or 1111110x) 375c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((first_char & 0x40) == 0) { 376c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 377c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 378c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 379c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root int32_t mask, to_ignore_mask; 380c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t num_to_read = 0; 381c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char32_t utf32 = 0; 382c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; 383c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root num_to_read < 5 && (first_char & mask); 384c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 385c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx 386c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 387c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 388c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // 0x3F == 00111111 389c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 = (utf32 << 6) + (*cur++ & 0x3F); 390c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 391c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // "first_char" must be (110xxxxx - 11110xxx) 392c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (num_to_read == 5) { 393c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 394c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 395c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root to_ignore_mask |= mask; 396c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); 397c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (utf32 > kUnicodeMaxCodepoint) { 398c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 399c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 
400c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 401c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ret += num_to_read; 402c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 403c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ret; 404c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 405c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 406c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) 407c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 408c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (src == NULL || src_len == 0) { 409c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 410c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 411c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 412c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t ret = 0; 413c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char16_t* const end = src + src_len; 414c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (src < end) { 415c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((*src & 0xFC00) == 0xD800 && (src + 1) < end 416c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root && (*++src & 0xFC00) == 0xDC00) { 417c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // surrogate pairs are always 4 bytes. 
418c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ret += 4; 419c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root src++; 420c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } else { 421c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root ret += utf32_codepoint_utf8_length((char32_t) *src++); 422c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 423c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 424c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ret; 425c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 426c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 427c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root/** 428c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Returns 1-4 based on the number of leading bits. 429c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 430c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 1111 -> 4 431c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 1110 -> 3 432c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 110x -> 2 433c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 10xx -> 1 434c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * 0xxx -> 1 435c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root */ 436c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline size_t utf8_codepoint_len(uint8_t ch) 437c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 438c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; 439c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 440c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 441c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) 442c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 443c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *codePoint <<= 6; 444c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *codePoint |= 0x3F & byte; 445c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 
446c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 447c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootsize_t utf8_to_utf32_length(const char *src, size_t src_len) 448c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 449c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (src == NULL || src_len == 0) { 450c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return 0; 451c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 452c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t ret = 0; 453c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char* cur; 454c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char* end; 455c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t num_to_skip; 456c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root for (cur = src, end = src + src_len, num_to_skip = 1; 457c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root cur < end; 458c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root cur += num_to_skip, ret++) { 459c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char first_char = *cur; 460c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root num_to_skip = 1; 461c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if ((first_char & 0x80) == 0) { // ASCII 462c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root continue; 463c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 464c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root int32_t mask; 465c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 466c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { 467c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 468c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 469c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return ret; 470c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 471c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 472c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootvoid utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) 
473c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 474c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (src == NULL || src_len == 0 || dst == NULL) { 475c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return; 476c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 477c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 478c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char* cur = src; 479c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const char* const end = src + src_len; 480c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char32_t* cur_utf32 = dst; 481c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (cur < end) { 482c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t num_read; 483c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); 484c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root cur += num_read; 485c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 486c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *cur_utf32 = 0; 487c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 488c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 489c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) 490c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 491c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root uint32_t unicode; 492c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 493c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root switch (length) 494c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root { 495c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 1: 496c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return src[0]; 497c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 2: 498c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root unicode = src[0] & 0x1f; 499c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf8_shift_and_mask(&unicode, src[1]); 
500c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return unicode; 501c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 3: 502c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root unicode = src[0] & 0x0f; 503c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf8_shift_and_mask(&unicode, src[1]); 504c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf8_shift_and_mask(&unicode, src[2]); 505c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return unicode; 506c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root case 4: 507c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root unicode = src[0] & 0x07; 508c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf8_shift_and_mask(&unicode, src[1]); 509c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf8_shift_and_mask(&unicode, src[2]); 510c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root utf8_shift_and_mask(&unicode, src[3]); 511c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return unicode; 512c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root default: 513c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return 0xffff; 514c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 515c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 516c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); 517c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 518c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 519c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len) 520c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 521c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const uint8_t* const u8end = u8str + u8len; 522c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const uint8_t* u8cur = u8str; 523c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 524c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root /* Validate that the UTF-8 is the correct len */ 525c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny 
Root size_t u16measuredLen = 0; 526c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (u8cur < u8end) { 527c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root u16measuredLen++; 528c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root int u8charLen = utf8_codepoint_len(*u8cur); 529c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); 530c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 531c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root u8cur += u8charLen; 532c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 533c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 534c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root /** 535c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Make sure that we ended where we thought we would and the output UTF-16 536c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * will be exactly how long we were told it would be. 
537c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root */ 538c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (u8cur != u8end) { 539c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return -1; 540c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 541c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 542c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root return u16measuredLen; 543c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 544c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 545de27227026e01c26a9debef77e4e268f242e713eJeff Brownchar16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str) 546c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{ 547c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const uint8_t* const u8end = u8str + u8len; 548c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root const uint8_t* u8cur = u8str; 549c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root char16_t* u16cur = u16str; 550c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 551c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root while (u8cur < u8end) { 552c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root size_t u8len = utf8_codepoint_len(*u8cur); 553c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 554c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 555c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // Convert the UTF32 codepoint to one or more UTF16 codepoints 556c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root if (codepoint <= 0xFFFF) { 557c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // Single UTF16 character 558c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *u16cur++ = (char16_t) codepoint; 559c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } else { 560c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root // Multiple UTF16 characters with surrogates 561c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root codepoint = codepoint - 0x10000; 
562c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 563c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 564c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 565c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 566c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root u8cur += u8len; 567c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root } 568de27227026e01c26a9debef77e4e268f242e713eJeff Brown return u16cur; 569de27227026e01c26a9debef77e4e268f242e713eJeff Brown} 570de27227026e01c26a9debef77e4e268f242e713eJeff Brown 571de27227026e01c26a9debef77e4e268f242e713eJeff Brownvoid utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) { 572de27227026e01c26a9debef77e4e268f242e713eJeff Brown char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str); 573de27227026e01c26a9debef77e4e268f242e713eJeff Brown *end = 0; 574c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 575c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root 576c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root} 577