1c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root/*
2c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Copyright (C) 2005 The Android Open Source Project
3c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *
4c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Licensed under the Apache License, Version 2.0 (the "License");
5c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * you may not use this file except in compliance with the License.
6c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * You may obtain a copy of the License at
7c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *
8c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *      http://www.apache.org/licenses/LICENSE-2.0
9c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root *
10c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Unless required by applicable law or agreed to in writing, software
11c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * distributed under the License is distributed on an "AS IS" BASIS,
12c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * See the License for the specific language governing permissions and
14c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * limitations under the License.
15c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root */
16c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
17c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#include <utils/Unicode.h>
18c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
19c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root#include <stddef.h>
20c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
// Byte-order helpers.
//
// With HAVE_WINSOCK defined, any existing ntohl/htonl/ntohs/htons macros are
// discarded and replaced by the local definitions below: byte swaps when
// HAVE_LITTLE_ENDIAN is defined, identity otherwise. Without HAVE_WINSOCK the
// helpers come from <netinet/in.h>.
//
// NOTE: the swapping macros evaluate their argument more than once, so never
// pass an expression with side effects (e.g. ntohs(*p++)).
#ifdef HAVE_WINSOCK
# undef  ntohl
# undef  htonl
# undef  ntohs
# undef  htons

# ifdef HAVE_LITTLE_ENDIAN
#  define ntohl(x)    ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
#  define htonl(x)    ntohl(x)
#  define ntohs(x)    ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
#  define htons(x)    ntohs(x)
# else
#  define ntohl(x)    (x)
#  define htonl(x)    (x)
#  define ntohs(x)    (x)
#  define htons(x)    (x)
# endif
#else
# include <netinet/in.h>
#endif
41c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
42c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootextern "C" {
43c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
// Continuation-byte helpers: OR-ing a value with kByteMark sets bit 7 and
// AND-ing with kByteMask then clears bit 6 (and everything above bit 7),
// leaving a byte of the form 10xxxxxx.
static const char32_t kByteMask = 0xBF;
static const char32_t kByteMark = 0x80;

// UTF-16 surrogate code units are not valid UTF-32 characters. These bounds
// let the conversion routines recognise and reject them.
static const char32_t kUnicodeSurrogateHighStart = 0xD800;
static const char32_t kUnicodeSurrogateHighEnd   = 0xDBFF;
static const char32_t kUnicodeSurrogateLowStart  = 0xDC00;
static const char32_t kUnicodeSurrogateLowEnd    = 0xDFFF;

// The whole surrogate block: first high surrogate through last low surrogate.
static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd   = kUnicodeSurrogateLowEnd;

// Largest code point defined by Unicode.
static const char32_t kUnicodeMaxCodepoint = 0x10FFFF;
56c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
// Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
// total number of bytes in that sequence (index 0 is unused padding).
//
//   bytes  layout                                 first-byte marker
//   1      0xxxxxxx                               0x00  (7 payload bits)
//   2      110xxxxx 10xxxxxx                      0xC0  (11 payload bits)
//   3      1110xxxx 10xxxxxx 10xxxxxx             0xE0  (16 payload bits)
//   4      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx    0xF0  (21 payload bits)
static const char32_t kFirstByteMark[] = {
    0x00,   // [0] unused
    0x00,   // [1] single byte (ASCII)
    0xC0,   // [2] two-byte sequence
    0xE0,   // [3] three-byte sequence
    0xF0    // [4] four-byte sequence
};
70c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
71c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// --------------------------------------------------------------------------
72c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// UTF-32
73c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// --------------------------------------------------------------------------
74c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
75c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root/**
76c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * Return number of UTF-8 bytes required for the character. If the character
77c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root * is invalid, return size of 0.
78c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root */
79c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
80c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
81c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    // Figure out how many bytes the result will require.
82c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (srcChar < 0x00000080) {
83c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return 1;
84c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    } else if (srcChar < 0x00000800) {
85c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return 2;
86c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    } else if (srcChar < 0x00010000) {
87c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) {
88c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return 3;
89c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        } else {
90c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            // Surrogates are invalid UTF-32 characters.
91c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return 0;
92c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
93c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
94c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    // Max code point for Unicode is 0x0010FFFF.
95c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    else if (srcChar <= kUnicodeMaxCodepoint) {
96c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return 4;
97c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    } else {
98c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        // Invalid UTF-32 character.
99c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return 0;
100c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
101c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
102c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
103c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// Write out the source character to <dstP>.
104c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
105c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
106c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
107c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    dstP += bytes;
108c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    switch (bytes)
109c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    {   /* note: everything falls through. */
110c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
111c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
112c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
113c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
114c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
115c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
116c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Count the char32_t elements in s that precede the first zero element.
 * s must be zero-terminated.
 */
size_t strlen32(const char32_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
124c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Like strlen32(), but examines at most maxlen elements of s.
 * Returns maxlen when no zero element is found within that range.
 */
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    size_t count = 0;
    // The bound is checked before each read so s[maxlen] is never touched.
    for (; count < maxlen; ++count) {
        if (s[count] == 0) {
            break;
        }
    }
    return count;
}
134c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Number of bytes a UTF-8 sequence claims to occupy, judged only from its
 * first byte: 1 for a byte without the top bit set (ASCII) and for a stray
 * continuation byte (10xxxxxx), otherwise one plus the number of consecutive
 * 1-bits following the top bit (2 for 110xxxxx ... up to 8 for 0xFF).
 */
static inline size_t utf32_at_sequence_length(unsigned char lead)
{
    size_t len = 1;
    if ((lead & 0x80) == 0) {
        return len;
    }
    for (unsigned int mask = 0x40; mask != 0 && (lead & mask) != 0; mask >>= 1) {
        len++;
    }
    return len;
}

/**
 * Decode the UTF-8 sequence starting at cur.
 *
 * Stores the number of bytes consumed in *num_read (always >= 1, so callers
 * that advance by it make progress) and returns the decoded value.
 * Continuation bytes are not validated; only their low six bits are used.
 *
 * Lead bytes 0xFE and 0xFF announce more than six bytes. Such a value cannot
 * be held in 32 bits, so they are rejected: *num_read is set to 1 and -1 is
 * returned without reading any further bytes.
 *
 * The caller must guarantee that all bytes announced by the lead byte are
 * readable; this function has no length to check against.
 */
static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char first_char = *cur;
    if ((first_char & 0x80) == 0) { // ASCII
        *num_read = 1;
        return *cur;
    }

    const unsigned char lead = (unsigned char)first_char;
    const size_t num_to_read = utf32_at_sequence_length(lead);
    if (num_to_read > 6) {
        *num_read = 1;
        return -1;
    }

    // Payload bits of the lead byte are whatever follows the run of 1-bits
    // and its terminating 0-bit: 6 bits for a stray continuation byte, then
    // 5, 4, 3, 2, 1 bits for 2- to 6-byte sequences.
    char32_t utf32 = (char32_t)(lead & (0xFFu >> (num_to_read + 1)));
    for (size_t i = 1; i < num_to_read; i++) {
        cur++;
        // 0x3F == 00111111
        utf32 = (utf32 << 6) | (char32_t)(*cur & 0x3F);
    }

    *num_read = num_to_read;
    return (int32_t)utf32;
}

/**
 * Decode the UTF-8 sequence that starts at src[index].
 *
 * @param src        UTF-8 bytes; src_len of them are readable.
 * @param src_len    number of bytes in src.
 * @param index      offset of the sequence's first byte.
 * @param next_index if non-NULL, receives the offset just past the decoded
 *                   sequence on success; left untouched on failure.
 * @return the decoded value, or -1 when src is NULL, index is out of range,
 *         the sequence announced by the lead byte would run past src_len, or
 *         the lead byte is 0xFE/0xFF.
 */
int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index)
{
    if (src == NULL || index >= src_len) {
        return -1;
    }

    // Refuse truncated sequences up front so the decoder never reads beyond
    // the caller's buffer.
    const size_t claimed = utf32_at_sequence_length((unsigned char)src[index]);
    if (claimed > src_len - index) {
        return -1;
    }

    size_t num_read;
    const int32_t ret = utf32_at_internal(src + index, &num_read);
    if (ret >= 0 && next_index != NULL) {
        *next_index = index + num_read;
    }

    return ret;
}
176c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
177c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
178c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
179c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (src == NULL || src_len == 0) {
180c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return -1;
181c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
182c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
183c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    size_t ret = 0;
184c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char32_t *end = src + src_len;
185c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (src < end) {
186c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        ret += utf32_codepoint_utf8_length(*src++);
187c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
188c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    return ret;
189c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
190c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
191c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootvoid utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
192c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
193c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (src == NULL || src_len == 0 || dst == NULL) {
194c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return;
195c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
196c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
197c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char32_t *cur_utf32 = src;
198c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char32_t *end_utf32 = src + src_len;
199c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    char *cur = dst;
200c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (cur_utf32 < end_utf32) {
201c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        size_t len = utf32_codepoint_utf8_length(*cur_utf32);
202c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len);
203c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        cur += len;
204c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
205c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    *cur = '\0';
206c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
207c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
208c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// --------------------------------------------------------------------------
209c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// UTF-16
210c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// --------------------------------------------------------------------------
211c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Compare two zero-terminated char16_t strings unit by unit.
 *
 * Returns the difference (s1 unit minus s2 unit) at the first position where
 * they differ, or 0 when both strings end together.
 */
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; s1++, s2++) {
        const int diff = (int)*s1 - (int)*s2;
        // Stop on the first mismatch, or once the shared terminator is reached.
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
225c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Compare at most n units of two zero-terminated char16_t strings.
 *
 * Returns the difference (s1 unit minus s2 unit) at the first mismatch, or 0
 * when the strings agree over the compared range (including n == 0).
 */
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
239c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Copy the zero-terminated char16_t string src, terminator included, into
 * dst. dst is not bounds-checked. Returns dst.
 */
char16_t *strcpy16(char16_t *dst, const char16_t *src)
{
    size_t i = 0;
    // The assignment copies the unit; the loop ends right after the
    // terminator itself has been copied.
    while ((dst[i] = src[i]) != 0) {
        i++;
    }
    return dst;
}
252c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Count the char16_t units in s that precede the first zero unit.
 * s must be zero-terminated.
 */
size_t strlen16(const char16_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
260c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
261c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Copy at most n units of the zero-terminated string src into dst and make
 * sure the result is zero-terminated.
 *
 * If src's terminator is reached within n units, copying stops right after
 * it and nothing beyond it is written. Otherwise exactly n units are copied
 * and a terminator is stored at dst[n], so dst must have room for n + 1
 * units in that case. Unlike the C library's strncpy, the rest of dst is not
 * zero-padded.
 *
 * Returns dst.
 */
char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n)
{
    char16_t *q = dst;
    const char16_t *p = src;

    while (n > 0) {
        n--;
        // Keep the full 16-bit unit: a narrower temporary would corrupt the
        // copied value and mistake units such as U+0100 for the terminator.
        const char16_t ch = *p++;
        *q++ = ch;
        if (ch == 0) {
            return dst;
        }
    }

    // Source was longer than n units (or n was 0): terminate explicitly.
    *q = 0;

    return dst;
}
279c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Like strlen16(), but examines at most maxlen units of s.
 * Returns maxlen when no zero unit is found within that range.
 */
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t count = 0;

    /* The bound must be tested before s[count] is read: the unit just past
       the maximum may not be readable. */
    for (; count < maxlen; ++count) {
        if (s[count] == 0) {
            break;
        }
    }
    return count;
}
292c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Compare two length-delimited char16_t strings (n1 units of s1 against n2
 * units of s2); zero units get no special treatment.
 *
 * Returns the unit difference at the first mismatch within the common
 * prefix. If the common prefix matches, the shorter string is treated as if
 * it continued with a 0 unit: the result is minus the next unit of s2 when
 * s1 is shorter, the next unit of s1 when s2 is shorter, and 0 when the
 * lengths are equal.
 */
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)s2[common];
    }
    if (n1 > n2) {
        return (int)s1[common] - 0;
    }
    return 0;
}
311c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Length-delimited comparison like strzcmp16(), except that every unit read
 * from s2N is passed through ntohs() before it is compared, while units of
 * s1H are used as stored.
 *
 * Returns the unit difference at the first mismatch within the common
 * prefix; otherwise minus the next (converted) unit of s2N when s1H is
 * shorter, the next unit of s1H when s2N is shorter, and 0 for equal lengths.
 */
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; i++) {
        // ntohs may be a macro that expands its argument several times, so it
        // is only ever handed a side-effect-free expression.
        const char16_t c2 = ntohs(s2N[i]);
        const int diff = (int)s1H[i] - (int)c2;
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)ntohs(s2N[common]);
    }
    if (n1 > n2) {
        return (int)s1H[common] - 0;
    }
    return 0;
}
332c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
333c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootvoid utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
334c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
335c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (src == NULL || src_len == 0 || dst == NULL) {
336c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return;
337c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
338c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
339c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char16_t* cur_utf16 = src;
340c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char16_t* const end_utf16 = src + src_len;
341c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    char *cur = dst;
342c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (cur_utf16 < end_utf16) {
343c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        char32_t utf32;
344c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        // surrogate pairs
345c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if ((*cur_utf16 & 0xFC00) == 0xD800) {
346c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf32 = (*cur_utf16++ - 0xD800) << 10;
347c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf32 |= *cur_utf16++ - 0xDC00;
348c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf32 += 0x10000;
349c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        } else {
350c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf32 = (char32_t) *cur_utf16++;
351c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
352c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        const size_t len = utf32_codepoint_utf8_length(utf32);
353c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
354c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        cur += len;
355c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
356c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    *cur = '\0';
357c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
358c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
359c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// --------------------------------------------------------------------------
360c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// UTF-8
361c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root// --------------------------------------------------------------------------
362c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
363c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf8_length(const char *src)
364c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
365c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char *cur = src;
366c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    size_t ret = 0;
367c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (*cur != '\0') {
368c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        const char first_char = *cur++;
369c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if ((first_char & 0x80) == 0) { // ASCII
370c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            ret += 1;
371c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            continue;
372c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
373c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        // (UTF-8's character must not be like 10xxxxxx,
374c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        //  but 110xxxxx, 1110xxxx, ... or 1111110x)
375c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if ((first_char & 0x40) == 0) {
376c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return -1;
377c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
378c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
379c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        int32_t mask, to_ignore_mask;
380c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        size_t num_to_read = 0;
381c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        char32_t utf32 = 0;
382c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
383c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root             num_to_read < 5 && (first_char & mask);
384c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root             num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
385c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
386c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root                return -1;
387c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            }
388c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            // 0x3F == 00111111
389c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf32 = (utf32 << 6) + (*cur++ & 0x3F);
390c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
391c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        // "first_char" must be (110xxxxx - 11110xxx)
392c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if (num_to_read == 5) {
393c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return -1;
394c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
395c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        to_ignore_mask |= mask;
396c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
397c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if (utf32 > kUnicodeMaxCodepoint) {
398c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return -1;
399c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
400c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
401c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        ret += num_to_read;
402c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
403c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    return ret;
404c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
405c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
406c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
407c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
408c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (src == NULL || src_len == 0) {
409c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return -1;
410c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
411c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
412c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    size_t ret = 0;
413c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char16_t* const end = src + src_len;
414c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (src < end) {
415c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
416c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root                && (*++src & 0xFC00) == 0xDC00) {
417c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            // surrogate pairs are always 4 bytes.
418c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            ret += 4;
419c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            src++;
420c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        } else {
421c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            ret += utf32_codepoint_utf8_length((char32_t) *src++);
422c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
423c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
424c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    return ret;
425c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
426c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Maps the first byte of a UTF-8 sequence to a length of 1-4, looking only
 * at the top four bits of "ch".
 *
 * 1111xxxx -> 4
 * 1110xxxx -> 3
 * 110xxxxx -> 2
 * 10xxxxxx -> 1
 * 0xxxxxxx -> 1
 */
static inline size_t utf8_codepoint_len(uint8_t ch)
{
    const unsigned int top_nibble = ch >> 4;

    if (top_nibble == 0xF) {
        return 4;
    }
    if (top_nibble == 0xE) {
        return 3;
    }
    if (top_nibble >= 0xC) {
        return 2;
    }
    // Plain ASCII and stray continuation bytes both count as one byte.
    return 1;
}
440c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Appends the low six bits of "byte" to "*codePoint": the current value is
 * shifted left by six and the payload bits are OR'd into the vacated space.
 */
static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
{
    // 0x3F == 00111111, the data bits of a 10xxxxxx continuation byte
    const uint32_t payload = byte & 0x3F;
    *codePoint = (*codePoint << 6) | payload;
}
446c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
/**
 * Counts the sequences in the first "src_len" bytes of "src".
 *
 * Each step looks at one lead byte: a byte with the top bit clear counts as
 * a one-byte sequence; otherwise the sequence length is one plus the number
 * of consecutive set bits below the top bit. The following bytes are skipped
 * without being inspected, so nothing here validates the encoding.
 *
 * Returns 0 if "src" is NULL or "src_len" is 0.
 */
size_t utf8_to_utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }

    size_t count = 0;
    size_t pos = 0;
    while (pos < src_len) {
        const unsigned char lead = (unsigned char) src[pos];
        size_t seq_len = 1;
        if (lead & 0x80) {  // not ASCII
            // One extra byte for every set bit directly below the top bit.
            for (unsigned int bit = 0x40; bit != 0 && (lead & bit); bit >>= 1) {
                seq_len++;
            }
        }
        pos += seq_len;
        count++;
    }
    return count;
}
471c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
472c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootvoid utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
473c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
474c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (src == NULL || src_len == 0 || dst == NULL) {
475c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return;
476c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
477c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
478c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char* cur = src;
479c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const char* const end = src + src_len;
480c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    char32_t* cur_utf32 = dst;
481c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (cur < end) {
482c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        size_t num_read;
483c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
484c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        cur += num_read;
485c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
486c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    *cur_utf32 = 0;
487c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
488c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
489c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootstatic inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
490c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
491c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    uint32_t unicode;
492c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
493c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    switch (length)
494c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    {
495c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 1:
496c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return src[0];
497c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 2:
498c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            unicode = src[0] & 0x1f;
499c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf8_shift_and_mask(&unicode, src[1]);
500c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return unicode;
501c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 3:
502c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            unicode = src[0] & 0x0f;
503c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf8_shift_and_mask(&unicode, src[1]);
504c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf8_shift_and_mask(&unicode, src[2]);
505c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return unicode;
506c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        case 4:
507c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            unicode = src[0] & 0x07;
508c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf8_shift_and_mask(&unicode, src[1]);
509c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf8_shift_and_mask(&unicode, src[2]);
510c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            utf8_shift_and_mask(&unicode, src[3]);
511c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return unicode;
512c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        default:
513c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            return 0xffff;
514c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
515c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
516c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
517c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
518c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
519c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Rootssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
520c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
521c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const uint8_t* const u8end = u8str + u8len;
522c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const uint8_t* u8cur = u8str;
523c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
524c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    /* Validate that the UTF-8 is the correct len */
525c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    size_t u16measuredLen = 0;
526c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (u8cur < u8end) {
527c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        u16measuredLen++;
528c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        int u8charLen = utf8_codepoint_len(*u8cur);
529c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
530c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
531c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        u8cur += u8charLen;
532c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
533c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
534c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    /**
535c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root     * Make sure that we ended where we thought we would and the output UTF-16
536c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root     * will be exactly how long we were told it would be.
537c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root     */
538c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    if (u8cur != u8end) {
539c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        return -1;
540c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
541c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
542c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    return u16measuredLen;
543c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
544c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
545de27227026e01c26a9debef77e4e268f242e713eJeff Brownchar16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
546c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root{
547c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const uint8_t* const u8end = u8str + u8len;
548c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    const uint8_t* u8cur = u8str;
549c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    char16_t* u16cur = u16str;
550c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
551c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    while (u8cur < u8end) {
552c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        size_t u8len = utf8_codepoint_len(*u8cur);
553c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
554c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
555c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        // Convert the UTF32 codepoint to one or more UTF16 codepoints
556c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        if (codepoint <= 0xFFFF) {
557c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            // Single UTF16 character
558c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            *u16cur++ = (char16_t) codepoint;
559c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        } else {
560c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            // Multiple UTF16 characters with surrogates
561c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            codepoint = codepoint - 0x10000;
562c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
563c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root            *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
564c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        }
565c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
566c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root        u8cur += u8len;
567c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root    }
568de27227026e01c26a9debef77e4e268f242e713eJeff Brown    return u16cur;
569de27227026e01c26a9debef77e4e268f242e713eJeff Brown}
570de27227026e01c26a9debef77e4e268f242e713eJeff Brown
571de27227026e01c26a9debef77e4e268f242e713eJeff Brownvoid utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
572de27227026e01c26a9debef77e4e268f242e713eJeff Brown    char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
573de27227026e01c26a9debef77e4e268f242e713eJeff Brown    *end = 0;
574c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
575c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root
576c412dcb3eb259458905aae9a2f5109c59c9548c0Kenny Root}
577