1ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root/* 2ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * Copyright (C) 2005 The Android Open Source Project 3ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 4ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * Licensed under the Apache License, Version 2.0 (the "License"); 5ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * you may not use this file except in compliance with the License. 6ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * You may obtain a copy of the License at 7ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 8ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * http://www.apache.org/licenses/LICENSE-2.0 9ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 10ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * Unless required by applicable law or agreed to in writing, software 11ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * distributed under the License is distributed on an "AS IS" BASIS, 12ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * See the License for the specific language governing permissions and 14ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * limitations under the License. 
15ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root */ 16ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 17cfd5b080af8de527d768f0ff7902c26af8d49307Mark Salyzyn#define LOG_TAG "unicode" 18cfd5b080af8de527d768f0ff7902c26af8d49307Mark Salyzyn 1922dbf3947fedf988e714a4703ddf85fc41413f90Mathias Agopian#include <utils/Unicode.h> 209de6776321b80d387e6108683547bc043f868925Sergio Giro#include <limits.h> 21ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 2230f991f251940be3ed11566fb71139852286f68aMark Salyzyn#include <log/log.h> 23ff2dcd9af994a23ed483939a416b48bdc10eefd5Mark Salyzyn 24adbf442a515c51cb2acb34e20c1d2ea0e843c660Elliott Hughes#if defined(_WIN32) 25ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root# undef nhtol 26ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root# undef htonl 27ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root# undef nhtos 28ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root# undef htons 29ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 3097ac0e1bb42ddd17c75691ecbe1df8a6520d4c81Elliott Hughes# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) 3197ac0e1bb42ddd17c75691ecbe1df8a6520d4c81Elliott Hughes# define htonl(x) ntohl(x) 3297ac0e1bb42ddd17c75691ecbe1df8a6520d4c81Elliott Hughes# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) 3397ac0e1bb42ddd17c75691ecbe1df8a6520d4c81Elliott Hughes# define htons(x) ntohs(x) 34ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root#else 35ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root# include <netinet/in.h> 36ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root#endif 37ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 38ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootextern "C" { 39ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 40ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic const char32_t kByteMask = 0x000000BF; 41ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic const char32_t kByteMark = 0x00000080; 
// Surrogates aren't valid for UTF-32 characters, so define some
// constants that will let us screen them out.
static const char32_t kUnicodeSurrogateHighStart  = 0x0000D800;
// Unused, here for completeness:
// static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
// static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
static const char32_t kUnicodeSurrogateLowEnd     = 0x0000DFFF;
static const char32_t kUnicodeSurrogateStart      = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd        = kUnicodeSurrogateLowEnd;
static const char32_t kUnicodeMaxCodepoint        = 0x0010FFFF;

// Mask used to set appropriate bits in first byte of UTF-8 sequence,
// indexed by number of bytes in the sequence.
// 0xxxxxxx
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
// 110yyyyx 10xxxxxx
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
// 1110yyyy 10yxxxxx 10xxxxxx
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
static const char32_t kFirstByteMark[] = {
    0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
};

// --------------------------------------------------------------------------
// UTF-32
// --------------------------------------------------------------------------

/**
 * Return number of UTF-8 bytes required for the character. If the character
 * is invalid, return size of 0.
 *
 * "Invalid" here means a value in the surrogate range
 * [kUnicodeSurrogateStart, kUnicodeSurrogateEnd] or a value above
 * kUnicodeMaxCodepoint.
 */
static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
{
    // Figure out how many bytes the result will require.
    if (srcChar < 0x00000080) {
        return 1;
    } else if (srcChar < 0x00000800) {
        return 2;
    } else if (srcChar < 0x00010000) {
        if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) {
            return 3;
        } else {
            // Surrogates are invalid UTF-32 characters.
            return 0;
        }
    }
    // Max code point for Unicode is 0x0010FFFF.
    else if (srcChar <= kUnicodeMaxCodepoint) {
        return 4;
    } else {
        // Invalid UTF-32 character.
        return 0;
    }
}

// Write out the source character to <dstP>.
101ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 102ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) 103ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 104ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root dstP += bytes; 105ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root switch (bytes) 106ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root { /* note: everything falls through. */ 107ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 108ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 109ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 110ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); 111ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 112ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 113ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 114ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootsize_t strlen32(const char32_t *s) 115ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 116ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char32_t *ss = s; 117ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while ( *ss ) 118ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ss++; 119ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ss-s; 120ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 121ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 122ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootsize_t strnlen32(const char32_t *s, size_t maxlen) 123ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 124ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char32_t *ss = s; 125ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while ((maxlen > 0) 
&& *ss) { 126ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ss++; 127ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root maxlen--; 128ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 129ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ss-s; 130ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 131ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 132ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic inline int32_t utf32_at_internal(const char* cur, size_t *num_read) 133ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 134ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char first_char = *cur; 135ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ((first_char & 0x80) == 0) { // ASCII 136ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *num_read = 1; 137ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return *cur; 138ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 139ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root cur++; 140ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char32_t mask, to_ignore_mask; 141ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t num_to_read = 0; 142ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char32_t utf32 = first_char; 143ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; 144ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root (first_char & mask); 145ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 146ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // 0x3F == 00111111 147ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 = (utf32 << 6) + (*cur++ & 0x3F); 148ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 149ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root to_ignore_mask |= mask; 150ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); 151ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 
152ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *num_read = num_to_read; 153ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return static_cast<int32_t>(utf32); 154ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 155ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 156ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootint32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) 157ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 158ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (index >= src_len) { 159ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 160ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 161ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t dummy_index; 162ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (next_index == NULL) { 163ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root next_index = &dummy_index; 164ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 165ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t num_read; 166ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root int32_t ret = utf32_at_internal(src + index, &num_read); 167ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (ret >= 0) { 168ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *next_index = index + num_read; 169ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 170ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 171ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ret; 172ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 173ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 174ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) 175ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 176ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (src == NULL || src_len == 0) { 177ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 178ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 
179ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 180ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t ret = 0; 181ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char32_t *end = src + src_len; 182ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (src < end) { 18347efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian size_t char_len = utf32_codepoint_utf8_length(*src++); 18447efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian if (SSIZE_MAX - char_len < ret) { 18547efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian // If this happens, we would overflow the ssize_t type when 18647efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian // returning from this function, so we cannot express how 18747efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian // long this string is in an ssize_t. 18847efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian android_errorWriteLog(0x534e4554, "37723026"); 18947efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian return -1; 19047efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian } 19147efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian ret += char_len; 192ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 193ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ret; 194ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 195ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 1961cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Girovoid utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len) 197ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 198ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (src == NULL || src_len == 0 || dst == NULL) { 199ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return; 200ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 201ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 202ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char32_t *cur_utf32 = src; 203ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char32_t *end_utf32 
= src + src_len; 204ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char *cur = dst; 205ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (cur_utf32 < end_utf32) { 206ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t len = utf32_codepoint_utf8_length(*cur_utf32); 2071cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len); 208ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); 209ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root cur += len; 2101cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro dst_len -= len; 211ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 2121cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro LOG_ALWAYS_FATAL_IF(dst_len < 1, "dst_len < 1: %zu < 1", dst_len); 213ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *cur = '\0'; 214ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 215ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 216ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root// -------------------------------------------------------------------------- 217ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root// UTF-16 218ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root// -------------------------------------------------------------------------- 219ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 220ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootint strcmp16(const char16_t *s1, const char16_t *s2) 221ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 222ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char16_t ch; 223ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root int d = 0; 224ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 225ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while ( 1 ) { 226ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root d = (int)(ch = *s1++) - (int)*s2++; 227ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ( d || !ch ) 
228ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root break; 229ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 230ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 231ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return d; 232ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 233ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 234ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootint strncmp16(const char16_t *s1, const char16_t *s2, size_t n) 235ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 236ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char16_t ch; 237ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root int d = 0; 238ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 2395bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright if (n == 0) { 2405bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright return 0; 2415bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright } 2425bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright 2435bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright do { 244ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root d = (int)(ch = *s1++) - (int)*s2++; 2455bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright if ( d || !ch ) { 246ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root break; 2475bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright } 2485bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright } while (--n); 249ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 250ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return d; 251ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 252ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 253ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootchar16_t *strcpy16(char16_t *dst, const char16_t *src) 254ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 255ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char16_t *q = dst; 256ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t *p = src; 257ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char16_t ch; 
258ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 259ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root do { 260ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *q++ = ch = *p++; 261ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } while ( ch ); 262ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 263ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return dst; 264ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 265ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 266ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootsize_t strlen16(const char16_t *s) 267ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 268ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t *ss = s; 269ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while ( *ss ) 270ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ss++; 271ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ss-s; 272ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 273ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 274ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 275ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootchar16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) 276ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 277ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char16_t *q = dst; 278ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t *p = src; 279ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char ch; 280ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 281ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (n) { 282ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root n--; 283ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *q++ = ch = *p++; 284ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ( !ch ) 285ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root break; 286ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 287ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 288ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *q = 
0; 289ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 290ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return dst; 291ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 292ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 293ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootsize_t strnlen16(const char16_t *s, size_t maxlen) 294ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 295ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t *ss = s; 296ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 297ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root /* Important: the maxlen test must precede the reference through ss; 298ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root since the byte beyond the maximum may segfault */ 299ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while ((maxlen > 0) && *ss) { 300ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ss++; 301ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root maxlen--; 302ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 303ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ss-s; 304ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 305ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 3065bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wrightchar16_t* strstr16(const char16_t* src, const char16_t* target) 3075bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright{ 3085bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright const char16_t needle = *target++; 3090fd60eb9b4026bcbc3a075e6aa9264346bcc25d1Michael Wright const size_t target_len = strlen16(target); 3105bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright if (needle != '\0') { 3115bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright do { 3125bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright do { 3135bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright if (*src == '\0') { 3145bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright return nullptr; 3155bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright } 
3165bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright } while (*src++ != needle); 3170fd60eb9b4026bcbc3a075e6aa9264346bcc25d1Michael Wright } while (strncmp16(src, target, target_len) != 0); 3185bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright src--; 3195bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright } 3205bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright 3215bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright return (char16_t*)src; 3225bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright} 3235bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright 3245bacef33c91e9625dfd09ecf638c2de7faecd34eMichael Wright 325ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootint strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) 326ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 327ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* e1 = s1+n1; 328ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* e2 = s2+n2; 329ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 330ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (s1 < e1 && s2 < e2) { 331ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const int d = (int)*s1++ - (int)*s2++; 332ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (d) { 333ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return d; 334ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 335ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 336ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 337ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return n1 < n2 338ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ? (0 - (int)*s2) 339ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root : (n1 > n2 340ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ? 
((int)*s1 - 0) 341ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root : 0); 342ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 343ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 344ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootint strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) 345ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 346ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* e1 = s1H+n1; 347ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* e2 = s2N+n2; 348ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 349ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (s1H < e1 && s2N < e2) { 350ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t c2 = ntohs(*s2N); 351ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const int d = (int)*s1H++ - (int)c2; 352ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root s2N++; 353ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (d) { 354ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return d; 355ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 356ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 357ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 358ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return n1 < n2 359ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ? (0 - (int)ntohs(*s2N)) 360ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root : (n1 > n2 361ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ? 
((int)*s1H - 0) 362ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root : 0); 363ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 364ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 3651cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Girovoid utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len) 366ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 367ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (src == NULL || src_len == 0 || dst == NULL) { 368ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return; 369ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 370ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 371ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* cur_utf16 = src; 372ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* const end_utf16 = src + src_len; 373ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char *cur = dst; 374ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (cur_utf16 < end_utf16) { 375ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char32_t utf32; 376ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // surrogate pairs 37772299bf0d240072174f847d13f1c9498b3ef9fa6Cylen Yao if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16 37872299bf0d240072174f847d13f1c9498b3ef9fa6Cylen Yao && (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) { 379ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 = (*cur_utf16++ - 0xD800) << 10; 380ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 |= *cur_utf16++ - 0xDC00; 381ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 += 0x10000; 382ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } else { 383ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 = (char32_t) *cur_utf16++; 384ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 385ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const size_t len = utf32_codepoint_utf8_length(utf32); 3861cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro 
LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len); 387ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); 388ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root cur += len; 3891cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro dst_len -= len; 390ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 3911cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro LOG_ALWAYS_FATAL_IF(dst_len < 1, "%zu < 1", dst_len); 392ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *cur = '\0'; 393ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 394ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 395ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root// -------------------------------------------------------------------------- 396ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root// UTF-8 397ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root// -------------------------------------------------------------------------- 398ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 399ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootssize_t utf8_length(const char *src) 400ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 401ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char *cur = src; 402ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t ret = 0; 403ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (*cur != '\0') { 404ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char first_char = *cur++; 405ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ((first_char & 0x80) == 0) { // ASCII 406ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ret += 1; 407ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root continue; 408ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 409ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // (UTF-8's character must not be like 10xxxxxx, 410ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // but 110xxxxx, 1110xxxx, ... 
or 1111110x) 411ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ((first_char & 0x40) == 0) { 412ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 413ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 414ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 415ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root int32_t mask, to_ignore_mask; 416ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t num_to_read = 0; 417ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char32_t utf32 = 0; 418ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; 419ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root num_to_read < 5 && (first_char & mask); 420ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 421ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx 422ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 423ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 424ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // 0x3F == 00111111 425ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 = (utf32 << 6) + (*cur++ & 0x3F); 426ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 427ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // "first_char" must be (110xxxxx - 11110xxx) 428ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (num_to_read == 5) { 429ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 430ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 431ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root to_ignore_mask |= mask; 432ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); 433ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (utf32 > kUnicodeMaxCodepoint) { 434ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 435ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 
436ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 437ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root ret += num_to_read; 438ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 439ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ret; 440ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 441ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 442ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) 443ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 444ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (src == NULL || src_len == 0) { 445ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 446ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 447ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 448ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t ret = 0; 449ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char16_t* const end = src + src_len; 450ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (src < end) { 45147efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian size_t char_len; 452ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ((*src & 0xFC00) == 0xD800 && (src + 1) < end 4531cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro && (*(src + 1) & 0xFC00) == 0xDC00) { 454ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root // surrogate pairs are always 4 bytes. 
45547efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian char_len = 4; 4561cfa56d46c7c31300c25dafd722ff60a5294c3b3Sergio Giro src += 2; 457ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } else { 45847efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian char_len = utf32_codepoint_utf8_length((char32_t)*src++); 45947efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian } 46047efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian if (SSIZE_MAX - char_len < ret) { 46147efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian // If this happens, we would overflow the ssize_t type when 46247efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian // returning from this function, so we cannot express how 46347efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian // long this string is in an ssize_t. 46447efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian android_errorWriteLog(0x534e4554, "37723026"); 46547efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian return -1; 466ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 46747efc676c849e3abf32001d66e2d6eb887e83c48Adam Vartanian ret += char_len; 468ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 469ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ret; 470ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 471ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 472ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root/** 473ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * Returns 1-4 based on the number of leading bits. 
474ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 475ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 1111 -> 4 476ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 1110 -> 3 477ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 110x -> 2 478ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 10xx -> 1 479ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * 0xxx -> 1 480ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root */ 481ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic inline size_t utf8_codepoint_len(uint8_t ch) 482ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 483ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; 484ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 485ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 486ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) 487ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 488ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *codePoint <<= 6; 489ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *codePoint |= 0x3F & byte; 490ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 491ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 492ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootsize_t utf8_to_utf32_length(const char *src, size_t src_len) 493ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 494ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (src == NULL || src_len == 0) { 495ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return 0; 496ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 497ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t ret = 0; 498ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char* cur; 499ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char* end; 500ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t num_to_skip; 501ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root for (cur = 
src, end = src + src_len, num_to_skip = 1; 502ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root cur < end; 503ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root cur += num_to_skip, ret++) { 504ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char first_char = *cur; 505ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root num_to_skip = 1; 506ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if ((first_char & 0x80) == 0) { // ASCII 507ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root continue; 508ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 509ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root int32_t mask; 510ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 511ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { 512ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 513ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 514ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return ret; 515ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 516ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 517ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootvoid utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) 518ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 519ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (src == NULL || src_len == 0 || dst == NULL) { 520ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return; 521ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 522ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 523ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char* cur = src; 524ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const char* const end = src + src_len; 525ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root char32_t* cur_utf32 = dst; 526ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (cur < end) { 527ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t num_read; 528ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 
*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); 529ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root cur += num_read; 530ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 531ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root *cur_utf32 = 0; 532ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 533ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 534ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Rootstatic inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) 535ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 536ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root uint32_t unicode; 537ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 538ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root switch (length) 539ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root { 540ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 1: 541ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return src[0]; 542ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 2: 543ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root unicode = src[0] & 0x1f; 544ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf8_shift_and_mask(&unicode, src[1]); 545ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return unicode; 546ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 3: 547ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root unicode = src[0] & 0x0f; 548ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf8_shift_and_mask(&unicode, src[1]); 549ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf8_shift_and_mask(&unicode, src[2]); 550ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return unicode; 551ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root case 4: 552ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root unicode = src[0] & 0x07; 553ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf8_shift_and_mask(&unicode, src[1]); 554ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf8_shift_and_mask(&unicode, src[2]); 
555ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root utf8_shift_and_mask(&unicode, src[3]); 556ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return unicode; 557ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root default: 558ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return 0xffff; 559ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 560ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 561ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); 562ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 563ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 5649de6776321b80d387e6108683547bc043f868925Sergio Girossize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len, bool overreadIsFatal) 565ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root{ 566ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const uint8_t* const u8end = u8str + u8len; 567ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root const uint8_t* u8cur = u8str; 568ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 569ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root /* Validate that the UTF-8 is the correct len */ 570ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root size_t u16measuredLen = 0; 571ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root while (u8cur < u8end) { 572ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root u16measuredLen++; 573ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root int u8charLen = utf8_codepoint_len(*u8cur); 5749de6776321b80d387e6108683547bc043f868925Sergio Giro // Malformed utf8, some characters are beyond the end. 5759de6776321b80d387e6108683547bc043f868925Sergio Giro // Cases: 5769de6776321b80d387e6108683547bc043f868925Sergio Giro // If u8charLen == 1, this becomes u8cur >= u8end, which cannot happen as u8cur < u8end, 5779de6776321b80d387e6108683547bc043f868925Sergio Giro // then this condition fail and we continue, as expected. 
5789de6776321b80d387e6108683547bc043f868925Sergio Giro // If u8charLen == 2, this becomes u8cur + 1 >= u8end, which fails only if 5799de6776321b80d387e6108683547bc043f868925Sergio Giro // u8cur == u8end - 1, that is, there was only one remaining character to read but we need 5809de6776321b80d387e6108683547bc043f868925Sergio Giro // 2 of them. This condition holds and we return -1, as expected. 5819de6776321b80d387e6108683547bc043f868925Sergio Giro if (u8cur + u8charLen - 1 >= u8end) { 5829de6776321b80d387e6108683547bc043f868925Sergio Giro if (overreadIsFatal) { 5839de6776321b80d387e6108683547bc043f868925Sergio Giro LOG_ALWAYS_FATAL("Attempt to overread computing length of utf8 string"); 5849de6776321b80d387e6108683547bc043f868925Sergio Giro } else { 5859de6776321b80d387e6108683547bc043f868925Sergio Giro return -1; 5869de6776321b80d387e6108683547bc043f868925Sergio Giro } 5879de6776321b80d387e6108683547bc043f868925Sergio Giro } 588ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); 589ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 590ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root u8cur += u8charLen; 591ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 592ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 593ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root /** 594ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * Make sure that we ended where we thought we would and the output UTF-16 595ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root * will be exactly how long we were told it would be. 
596ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root */ 597ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root if (u8cur != u8end) { 598ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return -1; 599ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root } 600ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 601ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root return u16measuredLen; 602ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 603ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 6049de6776321b80d387e6108683547bc043f868925Sergio Girochar16_t* utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str, size_t u16len) { 6059de6776321b80d387e6108683547bc043f868925Sergio Giro // A value > SSIZE_MAX is probably a negative value returned as an error and casted. 6069de6776321b80d387e6108683547bc043f868925Sergio Giro LOG_ALWAYS_FATAL_IF(u16len == 0 || u16len > SSIZE_MAX, "u16len is %zu", u16len); 6079de6776321b80d387e6108683547bc043f868925Sergio Giro char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str, u16len - 1); 608aa983c91caf848b9659599336a4156c28a125fa9Jeff Brown *end = 0; 6099de6776321b80d387e6108683547bc043f868925Sergio Giro return end; 610ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 611ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root 6129de6776321b80d387e6108683547bc043f868925Sergio Girochar16_t* utf8_to_utf16_no_null_terminator( 6139de6776321b80d387e6108683547bc043f868925Sergio Giro const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) { 6149de6776321b80d387e6108683547bc043f868925Sergio Giro if (dstLen == 0) { 6159de6776321b80d387e6108683547bc043f868925Sergio Giro return dst; 6169de6776321b80d387e6108683547bc043f868925Sergio Giro } 6179de6776321b80d387e6108683547bc043f868925Sergio Giro // A value > SSIZE_MAX is probably a negative value returned as an error and casted. 
6189de6776321b80d387e6108683547bc043f868925Sergio Giro LOG_ALWAYS_FATAL_IF(dstLen > SSIZE_MAX, "dstLen is %zu", dstLen); 6190f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn const uint8_t* const u8end = src + srcLen; 6200f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn const uint8_t* u8cur = src; 6215bed8036644f552210a7cfcbed2d6d20cf2981b0Mark Salyzyn const char16_t* const u16end = dst + dstLen; 6220f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn char16_t* u16cur = dst; 6230f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn 6240f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn while (u8cur < u8end && u16cur < u16end) { 6250f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn size_t u8len = utf8_codepoint_len(*u8cur); 6260f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 6270f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn 6280f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn // Convert the UTF32 codepoint to one or more UTF16 codepoints 6290f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn if (codepoint <= 0xFFFF) { 6300f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn // Single UTF16 character 6310f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn *u16cur++ = (char16_t) codepoint; 6320f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn } else { 6330f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn // Multiple UTF16 characters with surrogates 6340f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn codepoint = codepoint - 0x10000; 6350f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 6360f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn if (u16cur >= u16end) { 6370f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn // Ooops... not enough room for this surrogate pair. 
6380f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn return u16cur-1; 6390f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn } 6400f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 6410f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn } 6420f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn 6430f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn u8cur += u8len; 6440f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn } 6450f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn return u16cur; 6460f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn} 6470f10d0abf3e6f6b5631c091256f8b4e7a20a33d0Dianne Hackborn 648ba0165bef09729a33ab8e0ca329342be05e0d859Kenny Root} 649