/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
17814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes#include "utf.h"
18814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
1907ed66b5ae659c452cbe1ab20c3dbf1d6f546461Elliott Hughes#include "base/logging.h"
202dd0e2cea360bc9206eb88ecc40d259e796c239dIan Rogers#include "mirror/array.h"
214f6ad8ab428038129b2d0d6c40b7fd625cca15e1Ian Rogers#include "mirror/object-inl.h"
22a67249065e4c9b3cf4a7c081d95a78df28291ee9Ian Rogers#include "utf-inl.h"
23b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes
24814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughesnamespace art {
25814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
261646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult// This is used only from debugger and test code.
27814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughessize_t CountModifiedUtf8Chars(const char* utf8) {
281646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  return CountModifiedUtf8Chars(utf8, strlen(utf8));
291646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult}
301646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult
311646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult/*
321646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * This does not validate UTF8 rules (nor did older code). But it gets the right answer
331646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * for valid UTF-8 and that's fine because it's used only to size a buffer for later
341646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * conversion.
351646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult *
361646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows:
371646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0001  - U+007F   0xxxxxxx
381646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0080  - U+07FF   110xxxxx 10xxxxxx
391646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0800  - U+FFFF   1110xxxx 10xxxxxx 10xxxxxx
401646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
411646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult *
421646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from
431646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * standard UTF-8).
441646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult * The four byte encoding converts to two utf16 characters.
451646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult */
461646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Houltsize_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
471646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  DCHECK_LE(byte_count, strlen(utf8));
48814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  size_t len = 0;
491646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  const char* end = utf8 + byte_count;
501646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  for (; utf8 < end; ++utf8) {
511646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    int ic = *utf8;
52814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    len++;
531646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    if (LIKELY((ic & 0x80) == 0)) {
541646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      // One-byte encoding.
55814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes      continue;
56814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    }
571646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    // Two- or three-byte encoding.
58814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    utf8++;
59814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    if ((ic & 0x20) == 0) {
601646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      // Two-byte encoding.
61814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes      continue;
62814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    }
63814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    utf8++;
64a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    if ((ic & 0x10) == 0) {
651646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      // Three-byte encoding.
66a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      continue;
67a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    }
68a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath
691646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    // Four-byte encoding: needs to be converted into a surrogate
70a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    // pair.
71a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    utf8++;
72a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    len++;
73814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  }
74814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  return len;
75814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes}
76814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
771646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult// This is used only from debugger and test code.
78814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughesvoid ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
79814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  while (*utf8_data_in != '\0') {
80a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
81a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint16_t leading = GetLeadingUtf16Char(ch);
82a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint16_t trailing = GetTrailingUtf16Char(ch);
83a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath
84a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    *utf16_data_out++ = leading;
85a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    if (trailing != 0) {
86a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      *utf16_data_out++ = trailing;
87a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    }
88814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  }
89814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes}
90814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
911646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Houltvoid ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
921646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult                                const char* utf8_data_in, size_t in_bytes) {
931646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  const char *in_start = utf8_data_in;
941646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  const char *in_end = utf8_data_in + in_bytes;
951646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  uint16_t *out_p = utf16_data_out;
961646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult
971646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  if (LIKELY(out_chars == in_bytes)) {
981646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    // Common case where all characters are ASCII.
991646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    for (const char *p = in_start; p < in_end;) {
1001646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      // Safe even if char is signed because ASCII characters always have
1011646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      // the high bit cleared.
1021646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      *out_p++ = dchecked_integral_cast<uint16_t>(*p++);
1031646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    }
1041646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    return;
1051646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  }
1061646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult
1071646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  // String contains non-ASCII characters.
1081646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  for (const char *p = in_start; p < in_end;) {
1091646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    const uint32_t ch = GetUtf16FromUtf8(&p);
1101646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    const uint16_t leading = GetLeadingUtf16Char(ch);
1111646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    const uint16_t trailing = GetTrailingUtf16Char(ch);
1121646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult
1131646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    *out_p++ = leading;
1141646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    if (trailing != 0) {
1151646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      *out_p++ = trailing;
1161646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    }
1171646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  }
1181646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult}
1191646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult
1201646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Houltvoid ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
1211646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult                                const uint16_t* utf16_in, size_t char_count) {
1221646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  if (LIKELY(byte_count == char_count)) {
1231646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    // Common case where all characters are ASCII.
1241646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    const uint16_t *utf16_end = utf16_in + char_count;
1251646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    for (const uint16_t *p = utf16_in; p < utf16_end;) {
1261646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      *utf8_out++ = dchecked_integral_cast<char>(*p++);
1271646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    }
1281646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    return;
1291646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  }
1301646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult
1311646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  // String contains non-ASCII characters.
132b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes  while (char_count--) {
133e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath    const uint16_t ch = *utf16_in++;
134b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes    if (ch > 0 && ch <= 0x7f) {
135b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes      *utf8_out++ = ch;
136b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes    } else {
1371646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      // Char_count == 0 here implies we've encountered an unpaired
138e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath      // surrogate and we have no choice but to encode it as 3-byte UTF
139e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath      // sequence. Note that unpaired surrogates can occur as a part of
140e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath      // "normal" operation.
141e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
142e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        const uint16_t ch2 = *utf16_in;
143e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath
144e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // Check if the other half of the pair is within the expected
145e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // range. If it isn't, we will have to emit both "halves" as
146e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // separate 3 byte sequences.
147e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
148e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          utf16_in++;
149e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          char_count--;
150e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
151e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          *utf8_out++ = (code_point >> 18) | 0xf0;
152e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
153e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
154e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          *utf8_out++ = (code_point & 0x3f) | 0x80;
155e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          continue;
156e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        }
157e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath      }
158e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath
159b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes      if (ch > 0x07ff) {
160e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // Three byte encoding.
161b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes        *utf8_out++ = (ch >> 12) | 0xe0;
162b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes        *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
163b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes        *utf8_out++ = (ch & 0x3f) | 0x80;
164b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes      } else /*(ch > 0x7f || ch == 0)*/ {
165e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // Two byte encoding.
166b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes        *utf8_out++ = (ch >> 6) | 0xc0;
167b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes        *utf8_out++ = (ch & 0x3f) | 0x80;
168b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes      }
169b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes    }
170b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes  }
171b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes}
172b465ab0e103d7760df903c1fddf4fa6b89d5d1f5Elliott Hughes
// Hashes char_count UTF-16 code units with the recurrence hash = hash * 31 + unit (the same
// recurrence java.lang.String.hashCode() is specified with). The arithmetic is done in
// uint32_t so overflow wraps, and the result is reinterpreted as a signed 32-bit value.
int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) {
  uint32_t hash = 0;
  for (size_t i = 0; i < char_count; ++i) {
    hash = hash * 31 + chars[i];
  }
  return static_cast<int32_t>(hash);
}
180814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
181cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Markoint32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
182cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko  uint32_t hash = 0;
183cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko  while (utf16_length != 0u) {
184cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    const uint32_t pair = GetUtf16FromUtf8(&utf8);
185cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    const uint16_t first = GetLeadingUtf16Char(pair);
186cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    hash = hash * 31 + first;
187cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    --utf16_length;
188cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    const uint16_t second = GetTrailingUtf16Char(pair);
189cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    if (second != 0) {
190cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko      hash = hash * 31 + second;
191cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko      DCHECK_NE(utf16_length, 0u);
192cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko      --utf16_length;
193cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko    }
194cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko  }
195cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko  return static_cast<int32_t>(hash);
196cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko}
197cac5a7e871f1f346b317894359ad06fa7bd67fbaVladimir Marko
// Hashes the raw bytes of the NUL-terminated modified UTF-8 string chars with the recurrence
// hash = hash * 31 + byte, wrapping modulo 2^32.
//
// Every byte is folded in as an unsigned value in [0, 255]. Adding the plain `char` instead
// would sign-extend bytes >= 0x80 wherever `char` is signed, so the hash of any non-ASCII
// string would depend on the ABI's choice of char signedness.
uint32_t ComputeModifiedUtf8Hash(const char* chars) {
  uint32_t hash = 0;
  while (*chars != '\0') {
    // Narrow to an unsigned byte first so the promotion below cannot sign-extend.
    const uint8_t byte = *chars++;
    hash = hash * 31 + byte;
  }
  return hash;
}
20568b56858367e29461ae290fd797443a1ef6d8005Ian Rogers
206a48aef4234768ed37828df613919391c21f561a7Vladimir Markoint CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
207a48aef4234768ed37828df613919391c21f561a7Vladimir Marko                                                size_t utf16_length) {
208637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers  for (;;) {
209a48aef4234768ed37828df613919391c21f561a7Vladimir Marko    if (*utf8 == '\0') {
210a48aef4234768ed37828df613919391c21f561a7Vladimir Marko      return (utf16_length == 0) ? 0 : -1;
211a48aef4234768ed37828df613919391c21f561a7Vladimir Marko    } else if (utf16_length == 0) {
212637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers      return 1;
213637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers    }
214637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers
215a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint32_t pair = GetUtf16FromUtf8(&utf8);
216a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath
217a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    // First compare the leading utf16 char.
218a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint16_t lhs = GetLeadingUtf16Char(pair);
219a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint16_t rhs = *utf16++;
220a48aef4234768ed37828df613919391c21f561a7Vladimir Marko    --utf16_length;
221a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    if (lhs != rhs) {
222a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      return lhs > rhs ? 1 : -1;
223a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    }
224637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers
225a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    // Then compare the trailing utf16 char. First check if there
226a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    // are any characters left to consume.
227a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    const uint16_t lhs2 = GetTrailingUtf16Char(pair);
228a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath    if (lhs2 != 0) {
229a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      if (utf16_length == 0) {
230a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath        return 1;
231a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      }
232a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath
233a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      const uint16_t rhs2 = *utf16++;
234a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      --utf16_length;
235a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      if (lhs2 != rhs2) {
236a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath        return lhs2 > rhs2 ? 1 : -1;
237a5afcfc73141e5e378d79a326d02c5c2039fb025Narayan Kamath      }
238637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers    }
239637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers  }
240637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers}
241637c65b1e431fd90195b71c141b3590bd81cc91aIan Rogers
242814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughessize_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
243814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  size_t result = 0;
2441646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  const uint16_t *end = chars + char_count;
2451646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult  while (chars < end) {
246e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath    const uint16_t ch = *chars++;
2471646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    if (LIKELY(ch != 0 && ch < 0x80)) {
2481646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      result++;
2491646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      continue;
2501646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    }
2511646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    if (ch < 0x800) {
2521646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      result += 2;
2531646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      continue;
2541646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    }
2551646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    if (ch >= 0xd800 && ch < 0xdc00) {
2561646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult      if (chars < end) {
257e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        const uint16_t ch2 = *chars;
258e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // If we find a properly paired surrogate, we emit it as a 4 byte
259e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // UTF sequence. If we find an unpaired leading or trailing surrogate,
260e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        // we emit it as a 3 byte sequence like would have done earlier.
2611646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
262e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          chars++;
263e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath          result += 4;
2641646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult          continue;
265e16dad1d6388b0305f13e2171308a77f42e7c682Narayan Kamath        }
266814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes      }
267814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes    }
2681646d7a22e43a1fb25452ead47a4073e63d7f391Bruce Hoult    result += 3;
269814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  }
270814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes  return result;
271814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes}
272814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes
273814e40397fe6c8a2c645bae99f356dbddd6dbe18Elliott Hughes}  // namespace art
274