1/* 2 * Copyright 2006 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 9#include "SkUtils.h" 10 11/* 0xxxxxxx 1 total 12 10xxxxxx // never a leading byte 13 110xxxxx 2 total 14 1110xxxx 3 total 15 11110xxx 4 total 16 17 11 10 01 01 xx xx xx xx 0... 18 0xE5XX0000 19 0xE5 << 24 20*/ 21 22static bool utf8_byte_is_valid(uint8_t c) { 23 return c < 0xF5 && (c & 0xFE) != 0xC0; 24} 25static bool utf8_byte_is_continuation(uint8_t c) { 26 return (c & 0xC0) == 0x80; 27} 28static bool utf8_byte_is_leading_byte(uint8_t c) { 29 return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c); 30} 31 32#ifdef SK_DEBUG 33 static void assert_utf8_leadingbyte(unsigned c) { 34 SkASSERT(utf8_byte_is_leading_byte(SkToU8(c))); 35 } 36 37 int SkUTF8_LeadByteToCount(unsigned c) { 38 assert_utf8_leadingbyte(c); 39 return (((0xE5 << 24) >> (c >> 4 << 1)) & 3) + 1; 40 } 41#else 42 #define assert_utf8_leadingbyte(c) 43#endif 44 45/** 46 * @returns -1 iff invalid UTF8 byte, 47 * 0 iff UTF8 continuation byte, 48 * 1 iff ASCII byte, 49 * 2 iff leading byte of 2-byte sequence, 50 * 3 iff leading byte of 3-byte sequence, and 51 * 4 iff leading byte of 4-byte sequence. 52 * 53 * I.e.: if return value > 0, then gives length of sequence. 54*/ 55static int utf8_byte_type(uint8_t c) { 56 if (c < 0x80) { 57 return 1; 58 } else if (c < 0xC0) { 59 return 0; 60 } else if (c < 0xF5 && (c & 0xFE) != 0xC0) { // "octet values C0, C1, F5 to FF never appear" 61 return (((0xE5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1; 62 } else { 63 return -1; 64 } 65} 66static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; } 67 68int SkUTF8_CountUnichars(const char utf8[]) { 69 SkASSERT(utf8); 70 71 int count = 0; 72 73 for (;;) { 74 int c = *(const uint8_t*)utf8; 75 if (c == 0) { 76 break; 77 } 78 utf8 += SkUTF8_LeadByteToCount(c); 79 count += 1; 80 } 81 return count; 82} 83 84// SAFE: returns -1 if invalid UTF-8 85int SkUTF8_CountUnichars(const void* text, size_t byteLength) { 86 SkASSERT(text); 87 const char* utf8 = static_cast<const char*>(text); 88 if (byteLength == 0) { 89 return 0; 90 } 91 92 int count = 0; 93 const char* stop = utf8 + byteLength; 94 95 while (utf8 < stop) { 96 int type = utf8_byte_type(*(const uint8_t*)utf8); 97 SkASSERT(type >= -1 && type <= 4); 98 if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) { 99 // Sequence extends beyond end. 100 return -1; 101 } 102 while(type-- > 1) { 103 ++utf8; 104 if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { 105 return -1; 106 } 107 } 108 ++utf8; 109 ++count; 110 } 111 return count; 112} 113 114SkUnichar SkUTF8_ToUnichar(const char utf8[]) { 115 SkASSERT(utf8); 116 117 const uint8_t* p = (const uint8_t*)utf8; 118 int c = *p; 119 int hic = c << 24; 120 121 assert_utf8_leadingbyte(c); 122 123 if (hic < 0) { 124 uint32_t mask = (uint32_t)~0x3F; 125 hic = SkLeftShift(hic, 1); 126 do { 127 c = (c << 6) | (*++p & 0x3F); 128 mask <<= 5; 129 } while ((hic = SkLeftShift(hic, 1)) < 0); 130 c &= ~mask; 131 } 132 return c; 133} 134 135// SAFE: returns -1 on invalid UTF-8 sequence. 136SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) { 137 SkASSERT(ptr && *ptr); 138 SkASSERT(*ptr < end); 139 const uint8_t* p = (const uint8_t*)*ptr; 140 int c = *p; 141 int hic = c << 24; 142 143 if (!utf8_byte_is_leading_byte(c)) { 144 return -1; 145 } 146 if (hic < 0) { 147 uint32_t mask = (uint32_t)~0x3F; 148 hic = SkLeftShift(hic, 1); 149 do { 150 ++p; 151 if (p >= (const uint8_t*)end) { 152 return -1; 153 } 154 // check before reading off end of array. 155 uint8_t nextByte = *p; 156 if (!utf8_byte_is_continuation(nextByte)) { 157 return -1; 158 } 159 c = (c << 6) | (nextByte & 0x3F); 160 mask <<= 5; 161 } while ((hic = SkLeftShift(hic, 1)) < 0); 162 c &= ~mask; 163 } 164 *ptr = (char*)p + 1; 165 return c; 166} 167 168SkUnichar SkUTF8_NextUnichar(const char** ptr) { 169 SkASSERT(ptr && *ptr); 170 171 const uint8_t* p = (const uint8_t*)*ptr; 172 int c = *p; 173 int hic = c << 24; 174 175 assert_utf8_leadingbyte(c); 176 177 if (hic < 0) { 178 uint32_t mask = (uint32_t)~0x3F; 179 hic = SkLeftShift(hic, 1); 180 do { 181 c = (c << 6) | (*++p & 0x3F); 182 mask <<= 5; 183 } while ((hic = SkLeftShift(hic, 1)) < 0); 184 c &= ~mask; 185 } 186 *ptr = (char*)p + 1; 187 return c; 188} 189 190SkUnichar SkUTF8_PrevUnichar(const char** ptr) { 191 SkASSERT(ptr && *ptr); 192 193 const char* p = *ptr; 194 195 if (*--p & 0x80) { 196 while (*--p & 0x40) { 197 ; 198 } 199 } 200 201 *ptr = (char*)p; 202 return SkUTF8_NextUnichar(&p); 203} 204 205size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[]) { 206 if ((uint32_t)uni > 0x10FFFF) { 207 SkDEBUGFAIL("bad unichar"); 208 return 0; 209 } 210 211 if (uni <= 127) { 212 if (utf8) { 213 *utf8 = (char)uni; 214 } 215 return 1; 216 } 217 218 char tmp[4]; 219 char* p = tmp; 220 size_t count = 1; 221 222 SkDEBUGCODE(SkUnichar orig = uni;) 223 224 while (uni > 0x7F >> count) { 225 *p++ = (char)(0x80 | (uni & 0x3F)); 226 uni >>= 6; 227 count += 1; 228 } 229 230 if (utf8) { 231 p = tmp; 232 utf8 += count; 233 while (p < tmp + count - 1) { 234 *--utf8 = *p++; 235 } 236 *--utf8 = (char)(~(0xFF >> count) | uni); 237 } 238 239 SkASSERT(utf8 == nullptr || orig == SkUTF8_ToUnichar(utf8)); 240 return count; 241} 242 243/////////////////////////////////////////////////////////////////////////////// 244 245int SkUTF16_CountUnichars(const uint16_t src[]) { 246 SkASSERT(src); 247 248 int count = 0; 249 unsigned c; 250 while ((c = *src++) != 0) { 251 SkASSERT(!SkUTF16_IsLowSurrogate(c)); 252 if (SkUTF16_IsHighSurrogate(c)) { 253 c = *src++; 254 SkASSERT(SkUTF16_IsLowSurrogate(c)); 255 } 256 count += 1; 257 } 258 return count; 259} 260 261// returns -1 on error 262int SkUTF16_CountUnichars(const void* text, size_t byteLength) { 263 SkASSERT(text); 264 if (byteLength == 0) { 265 return 0; 266 } 267 if (!SkIsAlign2(intptr_t(text)) || !SkIsAlign2(byteLength)) { 268 return -1; 269 } 270 271 const uint16_t* src = static_cast<const uint16_t*>(text); 272 const uint16_t* stop = src + (byteLength >> 1); 273 int count = 0; 274 while (src < stop) { 275 unsigned c = *src++; 276 SkASSERT(!SkUTF16_IsLowSurrogate(c)); 277 if (SkUTF16_IsHighSurrogate(c)) { 278 if (src >= stop) { 279 return -1; 280 } 281 c = *src++; 282 if (!SkUTF16_IsLowSurrogate(c)) { 283 return -1; 284 } 285 } 286 count += 1; 287 } 288 return count; 289} 290 291SkUnichar SkUTF16_NextUnichar(const uint16_t** srcPtr) { 292 SkASSERT(srcPtr && *srcPtr); 293 294 const uint16_t* src = *srcPtr; 295 SkUnichar c = *src++; 296 297 SkASSERT(!SkUTF16_IsLowSurrogate(c)); 298 if (SkUTF16_IsHighSurrogate(c)) { 299 unsigned c2 = *src++; 300 SkASSERT(SkUTF16_IsLowSurrogate(c2)); 301 302 // c = ((c & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000 303 // c = (((c & 0x3FF) + 64) << 10) + (c2 & 0x3FF) 304 c = (c << 10) + c2 + (0x10000 - (0xD800 << 10) - 0xDC00); 305 } 306 *srcPtr = src; 307 return c; 308} 309 310SkUnichar SkUTF16_PrevUnichar(const uint16_t** srcPtr) { 311 SkASSERT(srcPtr && *srcPtr); 312 313 const uint16_t* src = *srcPtr; 314 SkUnichar c = *--src; 315 316 SkASSERT(!SkUTF16_IsHighSurrogate(c)); 317 if (SkUTF16_IsLowSurrogate(c)) { 318 unsigned c2 = *--src; 319 SkASSERT(SkUTF16_IsHighSurrogate(c2)); 320 c = (c2 << 10) + c + (0x10000 - (0xD800 << 10) - 0xDC00); 321 } 322 *srcPtr = src; 323 return c; 324} 325 326size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t dst[]) { 327 SkASSERT((unsigned)uni <= 0x10FFFF); 328 329 int extra = (uni > 0xFFFF); 330 331 if (dst) { 332 if (extra) { 333 // dst[0] = SkToU16(0xD800 | ((uni - 0x10000) >> 10)); 334 // dst[0] = SkToU16(0xD800 | ((uni >> 10) - 64)); 335 dst[0] = SkToU16((0xD800 - 64) + (uni >> 10)); 336 dst[1] = SkToU16(0xDC00 | (uni & 0x3FF)); 337 338 SkASSERT(SkUTF16_IsHighSurrogate(dst[0])); 339 SkASSERT(SkUTF16_IsLowSurrogate(dst[1])); 340 } else { 341 dst[0] = SkToU16(uni); 342 SkASSERT(!SkUTF16_IsHighSurrogate(dst[0])); 343 SkASSERT(!SkUTF16_IsLowSurrogate(dst[0])); 344 } 345 } 346 return 1 + extra; 347} 348 349size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues, 350 char utf8[]) { 351 SkASSERT(numberOf16BitValues >= 0); 352 if (numberOf16BitValues <= 0) { 353 return 0; 354 } 355 356 SkASSERT(utf16 != nullptr); 357 358 const uint16_t* stop = utf16 + numberOf16BitValues; 359 size_t size = 0; 360 361 if (utf8 == nullptr) { // just count 362 while (utf16 < stop) { 363 size += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), nullptr); 364 } 365 } else { 366 char* start = utf8; 367 while (utf16 < stop) { 368 utf8 += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), utf8); 369 } 370 size = utf8 - start; 371 } 372 return size; 373} 374 375const char SkHexadecimalDigits::gUpper[16] = 376 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; 377const char SkHexadecimalDigits::gLower[16] = 378 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; 379 380 381// returns -1 on error 382int SkUTF32_CountUnichars(const void* text, size_t byteLength) { 383 if (byteLength == 0) { 384 return 0; 385 } 386 if (!SkIsAlign4(intptr_t(text)) || !SkIsAlign4(byteLength)) { 387 return -1; 388 } 389 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits 390 const uint32_t* ptr = static_cast<const uint32_t*>(text); 391 const uint32_t* stop = ptr + (byteLength >> 2); 392 while (ptr < stop) { 393 if (*ptr & kInvalidUnicharMask) { 394 return -1; 395 } 396 ptr += 1; 397 } 398 return SkToInt(byteLength >> 2); 399} 400 401