1/*
2 * Copyright 2006 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9#include "SkUtils.h"
10
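/*  UTF-8 leading-byte bit patterns and the total length of the sequence
    each one starts:

    0xxxxxxx    1 total
    10xxxxxx    // never a leading byte
    110xxxxx    2 total
    1110xxxx    3 total
    11110xxx    4 total

    (length - 1) is looked up in a 32-bit table of sixteen 2-bit entries,
    indexed by the top nibble of the leading byte. The entry for nibble 0xF
    sits in the two highest bits, then 0xE, 0xD, 0xC; the entries for
    0x8..0xB are don't-cares (xx), and the entries for 0x0..0x7 are zero:

    11 10 01 01 xx xx xx xx 0...
    0xE5XX0000
    0xE5 << 24
*/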
21
// Returns true unless c is one of the octets rejected here as never valid in
// UTF-8: 0xC0, 0xC1, or anything from 0xF5 through 0xFF.
static bool utf8_byte_is_valid(uint8_t c) {
    const bool isC0OrC1 = (c | 0x01) == 0xC1;
    return !(isC0OrC1 || c >= 0xF5);
}
// Returns true when c has the 10xxxxxx bit pattern of a UTF-8 continuation byte.
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c >> 6) == 0x2;
}
28static bool utf8_byte_is_leading_byte(uint8_t c) {
29    return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c);
30}
31
#ifdef SK_DEBUG
    // Debug-only check that c is a byte that may begin a UTF-8 sequence.
    static void assert_utf8_leadingbyte(unsigned c) {
        SkASSERT(utf8_byte_is_leading_byte(SkToU8(c)));
    }

    // Returns the total number of bytes (1-4) in the UTF-8 sequence whose
    // leading byte is c. This definition is only compiled when SK_DEBUG is
    // set; it asserts that c really is a leading byte before decoding it.
    // NOTE(review): the non-debug definition is presumably supplied by a
    // header -- confirm.
    int SkUTF8_LeadByteToCount(unsigned c) {
        assert_utf8_leadingbyte(c);
        // 0xE5 << 24 is a table of sixteen 2-bit (length - 1) entries indexed
        // by the top nibble of c. It is built as an unsigned value so the
        // shift never moves a 1 into the sign bit of a signed int and the
        // right shift below is never applied to a negative number.
        return (int)(((0xE5u << 24) >> (c >> 4 << 1)) & 3) + 1;
    }
#else
    // Without SK_DEBUG the leading-byte check expands to nothing.
    #define assert_utf8_leadingbyte(c)
#endif
44
/**
 * Classifies a single byte of UTF-8 text.
 *
 * @returns -1  iff invalid UTF8 byte (0xC0, 0xC1, 0xF5..0xFF),
 *           0  iff UTF8 continuation byte,
 *           1  iff ASCII byte,
 *           2  iff leading byte of 2-byte sequence,
 *           3  iff leading byte of 3-byte sequence, and
 *           4  iff leading byte of 4-byte sequence.
 *
 * I.e.: if return value > 0, then gives length of sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    } else if (c < 0xC0) {
        return 0;
    } else if (c < 0xF5 && (c & 0xFE) != 0xC0) { // "octet values C0, C1, F5 to FF never appear"
        // 0xE5 << 24 is a table of sixteen 2-bit (length - 1) entries indexed
        // by the top nibble of c. Building it as an unsigned value keeps the
        // left shift from overflowing a signed int and keeps the right shift
        // from operating on a negative number.
        return (int)(((0xE5u << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1;
    } else {
        return -1;
    }
}
// True when a byte type denotes a byte that can start a sequence, i.e. the
// type is a positive sequence length rather than 0 or -1.
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
67
68int SkUTF8_CountUnichars(const char utf8[]) {
69    SkASSERT(utf8);
70
71    int count = 0;
72
73    for (;;) {
74        int c = *(const uint8_t*)utf8;
75        if (c == 0) {
76            break;
77        }
78        utf8 += SkUTF8_LeadByteToCount(c);
79        count += 1;
80    }
81    return count;
82}
83
84// SAFE: returns -1 if invalid UTF-8
85int SkUTF8_CountUnichars(const void* text, size_t byteLength) {
86    SkASSERT(text);
87    const char* utf8 = static_cast<const char*>(text);
88    if (byteLength == 0) {
89        return 0;
90    }
91
92    int         count = 0;
93    const char* stop = utf8 + byteLength;
94
95    while (utf8 < stop) {
96        int type = utf8_byte_type(*(const uint8_t*)utf8);
97        SkASSERT(type >= -1 && type <= 4);
98        if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
99            // Sequence extends beyond end.
100            return -1;
101        }
102        while(type-- > 1) {
103            ++utf8;
104            if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
105                return -1;
106            }
107        }
108        ++utf8;
109        ++count;
110    }
111    return count;
112}
113
114SkUnichar SkUTF8_ToUnichar(const char utf8[]) {
115    SkASSERT(utf8);
116
117    const uint8_t*  p = (const uint8_t*)utf8;
118    int             c = *p;
119    int             hic = c << 24;
120
121    assert_utf8_leadingbyte(c);
122
123    if (hic < 0) {
124        uint32_t mask = (uint32_t)~0x3F;
125        hic = SkLeftShift(hic, 1);
126        do {
127            c = (c << 6) | (*++p & 0x3F);
128            mask <<= 5;
129        } while ((hic = SkLeftShift(hic, 1)) < 0);
130        c &= ~mask;
131    }
132    return c;
133}
134
135// SAFE: returns -1 on invalid UTF-8 sequence.
136SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) {
137    SkASSERT(ptr && *ptr);
138    SkASSERT(*ptr < end);
139    const uint8_t*  p = (const uint8_t*)*ptr;
140    int             c = *p;
141    int             hic = c << 24;
142
143    if (!utf8_byte_is_leading_byte(c)) {
144        return -1;
145    }
146    if (hic < 0) {
147        uint32_t mask = (uint32_t)~0x3F;
148        hic = SkLeftShift(hic, 1);
149        do {
150            ++p;
151            if (p >= (const uint8_t*)end) {
152                return -1;
153            }
154            // check before reading off end of array.
155            uint8_t nextByte = *p;
156            if (!utf8_byte_is_continuation(nextByte)) {
157                return -1;
158            }
159            c = (c << 6) | (nextByte & 0x3F);
160            mask <<= 5;
161        } while ((hic = SkLeftShift(hic, 1)) < 0);
162        c &= ~mask;
163    }
164    *ptr = (char*)p + 1;
165    return c;
166}
167
168SkUnichar SkUTF8_NextUnichar(const char** ptr) {
169    SkASSERT(ptr && *ptr);
170
171    const uint8_t*  p = (const uint8_t*)*ptr;
172    int             c = *p;
173    int             hic = c << 24;
174
175    assert_utf8_leadingbyte(c);
176
177    if (hic < 0) {
178        uint32_t mask = (uint32_t)~0x3F;
179        hic = SkLeftShift(hic, 1);
180        do {
181            c = (c << 6) | (*++p & 0x3F);
182            mask <<= 5;
183        } while ((hic = SkLeftShift(hic, 1)) < 0);
184        c &= ~mask;
185    }
186    *ptr = (char*)p + 1;
187    return c;
188}
189
190SkUnichar SkUTF8_PrevUnichar(const char** ptr) {
191    SkASSERT(ptr && *ptr);
192
193    const char* p = *ptr;
194
195    if (*--p & 0x80) {
196        while (*--p & 0x40) {
197            ;
198        }
199    }
200
201    *ptr = (char*)p;
202    return SkUTF8_NextUnichar(&p);
203}
204
205size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[]) {
206    if ((uint32_t)uni > 0x10FFFF) {
207        SkDEBUGFAIL("bad unichar");
208        return 0;
209    }
210
211    if (uni <= 127) {
212        if (utf8) {
213            *utf8 = (char)uni;
214        }
215        return 1;
216    }
217
218    char    tmp[4];
219    char*   p = tmp;
220    size_t  count = 1;
221
222    SkDEBUGCODE(SkUnichar orig = uni;)
223
224    while (uni > 0x7F >> count) {
225        *p++ = (char)(0x80 | (uni & 0x3F));
226        uni >>= 6;
227        count += 1;
228    }
229
230    if (utf8) {
231        p = tmp;
232        utf8 += count;
233        while (p < tmp + count - 1) {
234            *--utf8 = *p++;
235        }
236        *--utf8 = (char)(~(0xFF >> count) | uni);
237    }
238
239    SkASSERT(utf8 == nullptr || orig == SkUTF8_ToUnichar(utf8));
240    return count;
241}
242
243///////////////////////////////////////////////////////////////////////////////
244
245int SkUTF16_CountUnichars(const uint16_t src[]) {
246    SkASSERT(src);
247
248    int count = 0;
249    unsigned c;
250    while ((c = *src++) != 0) {
251        SkASSERT(!SkUTF16_IsLowSurrogate(c));
252        if (SkUTF16_IsHighSurrogate(c)) {
253            c = *src++;
254            SkASSERT(SkUTF16_IsLowSurrogate(c));
255        }
256        count += 1;
257    }
258    return count;
259}
260
261// returns -1 on error
262int SkUTF16_CountUnichars(const void* text, size_t byteLength) {
263    SkASSERT(text);
264    if (byteLength == 0) {
265        return 0;
266    }
267    if (!SkIsAlign2(intptr_t(text)) || !SkIsAlign2(byteLength)) {
268        return -1;
269    }
270
271    const uint16_t* src = static_cast<const uint16_t*>(text);
272    const uint16_t* stop = src + (byteLength >> 1);
273    int count = 0;
274    while (src < stop) {
275        unsigned c = *src++;
276        SkASSERT(!SkUTF16_IsLowSurrogate(c));
277        if (SkUTF16_IsHighSurrogate(c)) {
278            if (src >= stop) {
279                return -1;
280            }
281            c = *src++;
282            if (!SkUTF16_IsLowSurrogate(c)) {
283                return -1;
284            }
285        }
286        count += 1;
287    }
288    return count;
289}
290
291SkUnichar SkUTF16_NextUnichar(const uint16_t** srcPtr) {
292    SkASSERT(srcPtr && *srcPtr);
293
294    const uint16_t* src = *srcPtr;
295    SkUnichar       c = *src++;
296
297    SkASSERT(!SkUTF16_IsLowSurrogate(c));
298    if (SkUTF16_IsHighSurrogate(c)) {
299        unsigned c2 = *src++;
300        SkASSERT(SkUTF16_IsLowSurrogate(c2));
301
302        // c = ((c & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000
303        // c = (((c & 0x3FF) + 64) << 10) + (c2 & 0x3FF)
304        c = (c << 10) + c2 + (0x10000 - (0xD800 << 10) - 0xDC00);
305    }
306    *srcPtr = src;
307    return c;
308}
309
310SkUnichar SkUTF16_PrevUnichar(const uint16_t** srcPtr) {
311    SkASSERT(srcPtr && *srcPtr);
312
313    const uint16_t* src = *srcPtr;
314    SkUnichar       c = *--src;
315
316    SkASSERT(!SkUTF16_IsHighSurrogate(c));
317    if (SkUTF16_IsLowSurrogate(c)) {
318        unsigned c2 = *--src;
319        SkASSERT(SkUTF16_IsHighSurrogate(c2));
320        c = (c2 << 10) + c + (0x10000 - (0xD800 << 10) - 0xDC00);
321    }
322    *srcPtr = src;
323    return c;
324}
325
326size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t dst[]) {
327    SkASSERT((unsigned)uni <= 0x10FFFF);
328
329    int extra = (uni > 0xFFFF);
330
331    if (dst) {
332        if (extra) {
333            // dst[0] = SkToU16(0xD800 | ((uni - 0x10000) >> 10));
334            // dst[0] = SkToU16(0xD800 | ((uni >> 10) - 64));
335            dst[0] = SkToU16((0xD800 - 64) + (uni >> 10));
336            dst[1] = SkToU16(0xDC00 | (uni & 0x3FF));
337
338            SkASSERT(SkUTF16_IsHighSurrogate(dst[0]));
339            SkASSERT(SkUTF16_IsLowSurrogate(dst[1]));
340        } else {
341            dst[0] = SkToU16(uni);
342            SkASSERT(!SkUTF16_IsHighSurrogate(dst[0]));
343            SkASSERT(!SkUTF16_IsLowSurrogate(dst[0]));
344        }
345    }
346    return 1 + extra;
347}
348
349size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues,
350                      char utf8[]) {
351    SkASSERT(numberOf16BitValues >= 0);
352    if (numberOf16BitValues <= 0) {
353        return 0;
354    }
355
356    SkASSERT(utf16 != nullptr);
357
358    const uint16_t* stop = utf16 + numberOf16BitValues;
359    size_t          size = 0;
360
361    if (utf8 == nullptr) {    // just count
362        while (utf16 < stop) {
363            size += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), nullptr);
364        }
365    } else {
366        char* start = utf8;
367        while (utf16 < stop) {
368            utf8 += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), utf8);
369        }
370        size = utf8 - start;
371    }
372    return size;
373}
374
375const char SkHexadecimalDigits::gUpper[16] =
376           { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
377const char SkHexadecimalDigits::gLower[16] =
378           { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
379
380
381// returns -1 on error
382int SkUTF32_CountUnichars(const void* text, size_t byteLength) {
383    if (byteLength == 0) {
384        return 0;
385    }
386    if (!SkIsAlign4(intptr_t(text)) || !SkIsAlign4(byteLength)) {
387        return -1;
388    }
389    const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
390    const uint32_t* ptr = static_cast<const uint32_t*>(text);
391    const uint32_t* stop = ptr + (byteLength >> 2);
392    while (ptr < stop) {
393        if (*ptr & kInvalidUnicharMask) {
394            return -1;
395        }
396        ptr += 1;
397    }
398    return SkToInt(byteLength >> 2);
399}
400
401