12faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes/* 22faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * Copyright (C) 2011 The Android Open Source Project 32faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * 42faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * Licensed under the Apache License, Version 2.0 (the "License"); 52faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * you may not use this file except in compliance with the License. 62faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * You may obtain a copy of the License at 72faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * 82faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * http://www.apache.org/licenses/LICENSE-2.0 92faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * 102faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * Unless required by applicable law or agreed to in writing, software 112faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * distributed under the License is distributed on an "AS IS" BASIS, 122faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 132faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * See the License for the specific language governing permissions and 142faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes * limitations under the License. 152faa5f1271587cda765f26bcf2951065300a01ffElliott Hughes */ 16814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 17814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes#include "utf.h" 18814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 1907ed66b5ae659c452cbe1ab20c3dbf1d6f546461Elliott Hughes#include "base/logging.h" 202dd0e2cea360bc9206eb88ecc40d259e796c239dIan Rogers#include "mirror/array.h" 214f6ad8ab428038129b2d0d6c40b7fd625cca15e1Ian Rogers#include "mirror/object-inl.h" 22a67249065e4c9b3cf4a7c081d95a78df28291ee9Ian Rogers#include "utf-inl.h" 23b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes 24814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughesnamespace art { 25814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 261646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult// This is used only from debugger and test code. 27814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughessize_t CountModifiedUtf8Chars(const char* utf8) { 281646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult return CountModifiedUtf8Chars(utf8, strlen(utf8)); 291646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult} 301646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult 311646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult/* 321646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * This does not validate UTF8 rules (nor did older code). But it gets the right answer 331646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * for valid UTF-8 and that's fine because it's used only to size a buffer for later 341646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * conversion. 351646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * 361646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows: 371646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0001 - U+007F 0xxxxxxx 381646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0080 - U+07FF 110xxxxx 10xxxxxx 391646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 401646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 411646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * 421646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from 431646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * standard UTF-8). 441646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * The four byte encoding converts to two utf16 characters. 451646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult */ 461646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Houltsize_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) { 471646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult DCHECK_LE(byte_count, strlen(utf8)); 48814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes size_t len = 0; 491646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const char* end = utf8 + byte_count; 501646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult for (; utf8 < end; ++utf8) { 511646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult int ic = *utf8; 52814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes len++; 531646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (LIKELY((ic & 0x80) == 0)) { 541646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // One-byte encoding. 55814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes continue; 56814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 571646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Two- or three-byte encoding. 58814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes utf8++; 59814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes if ((ic & 0x20) == 0) { 601646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Two-byte encoding. 61814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes continue; 62814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 63814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes utf8++; 64a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath if ((ic & 0x10) == 0) { 651646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Three-byte encoding. 66a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath continue; 67a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath } 68a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath 691646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Four-byte encoding: needs to be converted into a surrogate 70a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath // pair. 71a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath utf8++; 72a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath len++; 73814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 74814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes return len; 75814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes} 76814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 771646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult// This is used only from debugger and test code. 78814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughesvoid ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) { 79814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes while (*utf8_data_in != '\0') { 80a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in); 81a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint16_t leading = GetLeadingUtf16Char(ch); 82a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint16_t trailing = GetTrailingUtf16Char(ch); 83a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath 84a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath *utf16_data_out++ = leading; 85a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath if (trailing != 0) { 86a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath *utf16_data_out++ = trailing; 87a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath } 88814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 89814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes} 90814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 911646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Houltvoid ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars, 921646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const char* utf8_data_in, size_t in_bytes) { 931646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const char *in_start = utf8_data_in; 941646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const char *in_end = utf8_data_in + in_bytes; 951646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult uint16_t *out_p = utf16_data_out; 961646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult 971646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (LIKELY(out_chars == in_bytes)) { 981646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Common case where all characters are ASCII. 991646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult for (const char *p = in_start; p < in_end;) { 1001646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Safe even if char is signed because ASCII characters always have 1011646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // the high bit cleared. 1021646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult *out_p++ = dchecked_integral_cast<uint16_t>(*p++); 1031646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 1041646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult return; 1051646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 1061646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult 1071646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // String contains non-ASCII characters. 1081646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult for (const char *p = in_start; p < in_end;) { 1091646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const uint32_t ch = GetUtf16FromUtf8(&p); 1101646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const uint16_t leading = GetLeadingUtf16Char(ch); 1111646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const uint16_t trailing = GetTrailingUtf16Char(ch); 1121646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult 1131646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult *out_p++ = leading; 1141646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (trailing != 0) { 1151646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult *out_p++ = trailing; 1161646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 1171646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 1181646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult} 1191646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult 1201646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Houltvoid ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, 1211646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const uint16_t* utf16_in, size_t char_count) { 1221646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (LIKELY(byte_count == char_count)) { 1231646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Common case where all characters are ASCII. 1241646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const uint16_t *utf16_end = utf16_in + char_count; 1251646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult for (const uint16_t *p = utf16_in; p < utf16_end;) { 1261646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult *utf8_out++ = dchecked_integral_cast<char>(*p++); 1271646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 1281646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult return; 1291646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 1301646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult 1311646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // String contains non-ASCII characters. 132b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes while (char_count--) { 133e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath const uint16_t ch = *utf16_in++; 134b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes if (ch > 0 && ch <= 0x7f) { 135b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes *utf8_out++ = ch; 136b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes } else { 1371646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult // Char_count == 0 here implies we've encountered an unpaired 138e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // surrogate and we have no choice but to encode it as 3-byte UTF 139e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // sequence. Note that unpaired surrogates can occur as a part of 140e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // "normal" operation. 141e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { 142e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath const uint16_t ch2 = *utf16_in; 143e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath 144e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // Check if the other half of the pair is within the expected 145e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // range. If it isn't, we will have to emit both "halves" as 146e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // separate 3 byte sequences. 147e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { 148e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath utf16_in++; 149e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath char_count--; 150e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; 151e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath *utf8_out++ = (code_point >> 18) | 0xf0; 152e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; 153e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; 154e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath *utf8_out++ = (code_point & 0x3f) | 0x80; 155e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath continue; 156e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath } 157e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath } 158e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath 159b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes if (ch > 0x07ff) { 160e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // Three byte encoding. 161b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes *utf8_out++ = (ch >> 12) | 0xe0; 162b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; 163b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes *utf8_out++ = (ch & 0x3f) | 0x80; 164b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes } else /*(ch > 0x7f || ch == 0)*/ { 165e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // Two byte encoding. 166b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes *utf8_out++ = (ch >> 6) | 0xc0; 167b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes *utf8_out++ = (ch & 0x3f) | 0x80; 168b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes } 169b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes } 170b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes } 171b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes} 172b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes 173814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughesint32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) { 1748f41dc389bd50b6e31caa3fe9d41cbaf76e4d8d6Ian Rogers uint32_t hash = 0; 175814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes while (char_count--) { 176814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes hash = hash * 31 + *chars++; 177814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 1788f41dc389bd50b6e31caa3fe9d41cbaf76e4d8d6Ian Rogers return static_cast<int32_t>(hash); 179814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes} 180814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 181cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Markoint32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) { 182cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko uint32_t hash = 0; 183cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko while (utf16_length != 0u) { 184cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko const uint32_t pair = GetUtf16FromUtf8(&utf8); 185cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko const uint16_t first = GetLeadingUtf16Char(pair); 186cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko hash = hash * 31 + first; 187cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko --utf16_length; 188cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko const uint16_t second = GetTrailingUtf16Char(pair); 189cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko if (second != 0) { 190cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko hash = hash * 31 + second; 191cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko DCHECK_NE(utf16_length, 0u); 192cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko --utf16_length; 193cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko } 194cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko } 195cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko return static_cast<int32_t>(hash); 196cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko} 197cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko 198208a5cb383dd9dcd3461f89b74af5df67dc8d794Mathieu Chartieruint32_t ComputeModifiedUtf8Hash(const char* chars) { 199208a5cb383dd9dcd3461f89b74af5df67dc8d794Mathieu Chartier uint32_t hash = 0; 20068b56858367e29461ae290fd797443a1ef6d8005Ian Rogers while (*chars != '\0') { 201e7c9a8c2b8481aafbc6af4ce6229bd361ba24742Mathieu Chartier hash = hash * 31 + *chars++; 20268b56858367e29461ae290fd797443a1ef6d8005Ian Rogers } 2038f41dc389bd50b6e31caa3fe9d41cbaf76e4d8d6Ian Rogers return static_cast<int32_t>(hash); 20468b56858367e29461ae290fd797443a1ef6d8005Ian Rogers} 20568b56858367e29461ae290fd797443a1ef6d8005Ian Rogers 206a48aef4234768ed37828df613919391c21f561a7Vladimir Markoint CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16, 207a48aef4234768ed37828df613919391c21f561a7Vladimir Marko size_t utf16_length) { 208637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers for (;;) { 209a48aef4234768ed37828df613919391c21f561a7Vladimir Marko if (*utf8 == '\0') { 210a48aef4234768ed37828df613919391c21f561a7Vladimir Marko return (utf16_length == 0) ? 0 : -1; 211a48aef4234768ed37828df613919391c21f561a7Vladimir Marko } else if (utf16_length == 0) { 212637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers return 1; 213637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers } 214637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers 215a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint32_t pair = GetUtf16FromUtf8(&utf8); 216a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath 217a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath // First compare the leading utf16 char. 218a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint16_t lhs = GetLeadingUtf16Char(pair); 219a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint16_t rhs = *utf16++; 220a48aef4234768ed37828df613919391c21f561a7Vladimir Marko --utf16_length; 221a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath if (lhs != rhs) { 222a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath return lhs > rhs ? 1 : -1; 223a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath } 224637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers 225a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath // Then compare the trailing utf16 char. First check if there 226a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath // are any characters left to consume. 227a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint16_t lhs2 = GetTrailingUtf16Char(pair); 228a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath if (lhs2 != 0) { 229a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath if (utf16_length == 0) { 230a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath return 1; 231a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath } 232a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath 233a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath const uint16_t rhs2 = *utf16++; 234a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath --utf16_length; 235a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath if (lhs2 != rhs2) { 236a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath return lhs2 > rhs2 ? 1 : -1; 237a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath } 238637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers } 239637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers } 240637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers} 241637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers 242814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughessize_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { 243814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes size_t result = 0; 2441646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult const uint16_t *end = chars + char_count; 2451646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult while (chars < end) { 246e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath const uint16_t ch = *chars++; 2471646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (LIKELY(ch != 0 && ch < 0x80)) { 2481646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult result++; 2491646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult continue; 2501646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 2511646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (ch < 0x800) { 2521646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult result += 2; 2531646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult continue; 2541646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult } 2551646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (ch >= 0xd800 && ch < 0xdc00) { 2561646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (chars < end) { 257e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath const uint16_t ch2 = *chars; 258e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // If we find a properly paired surrogate, we emit it as a 4 byte 259e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // UTF sequence. If we find an unpaired leading or trailing surrogate, 260e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath // we emit it as a 3 byte sequence like would have done earlier. 2611646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult if (ch2 >= 0xdc00 && ch2 < 0xe000) { 262e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath chars++; 263e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath result += 4; 2641646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult continue; 265e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath } 266814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 267814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 2681646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult result += 3; 269814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes } 270814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes return result; 271814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes} 272814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes 273814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes} // namespace art 274