1/* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "utf.h" 18 19#include "base/logging.h" 20#include "mirror/array.h" 21#include "mirror/object-inl.h" 22#include "utf-inl.h" 23 24namespace art { 25 26size_t CountModifiedUtf8Chars(const char* utf8) { 27 size_t len = 0; 28 int ic; 29 while ((ic = *utf8++) != '\0') { 30 len++; 31 if ((ic & 0x80) == 0) { 32 // one-byte encoding 33 continue; 34 } 35 // two- or three-byte encoding 36 utf8++; 37 if ((ic & 0x20) == 0) { 38 // two-byte encoding 39 continue; 40 } 41 utf8++; 42 if ((ic & 0x10) == 0) { 43 // three-byte encoding 44 continue; 45 } 46 47 // four-byte encoding: needs to be converted into a surrogate 48 // pair. 49 utf8++; 50 len++; 51 } 52 return len; 53} 54 55void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) { 56 while (*utf8_data_in != '\0') { 57 const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in); 58 const uint16_t leading = GetLeadingUtf16Char(ch); 59 const uint16_t trailing = GetTrailingUtf16Char(ch); 60 61 *utf16_data_out++ = leading; 62 if (trailing != 0) { 63 *utf16_data_out++ = trailing; 64 } 65 } 66} 67 68void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count) { 69 while (char_count--) { 70 const uint16_t ch = *utf16_in++; 71 if (ch > 0 && ch <= 0x7f) { 72 *utf8_out++ = ch; 73 } else { 74 // char_count == 0 here implies we've encountered an unpaired 75 // surrogate and we have no choice but to encode it as 3-byte UTF 76 // sequence. Note that unpaired surrogates can occur as a part of 77 // "normal" operation. 78 if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { 79 const uint16_t ch2 = *utf16_in; 80 81 // Check if the other half of the pair is within the expected 82 // range. If it isn't, we will have to emit both "halves" as 83 // separate 3 byte sequences. 84 if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { 85 utf16_in++; 86 char_count--; 87 const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; 88 *utf8_out++ = (code_point >> 18) | 0xf0; 89 *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; 90 *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; 91 *utf8_out++ = (code_point & 0x3f) | 0x80; 92 continue; 93 } 94 } 95 96 if (ch > 0x07ff) { 97 // Three byte encoding. 98 *utf8_out++ = (ch >> 12) | 0xe0; 99 *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; 100 *utf8_out++ = (ch & 0x3f) | 0x80; 101 } else /*(ch > 0x7f || ch == 0)*/ { 102 // Two byte encoding. 103 *utf8_out++ = (ch >> 6) | 0xc0; 104 *utf8_out++ = (ch & 0x3f) | 0x80; 105 } 106 } 107 } 108} 109 110int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) { 111 uint32_t hash = 0; 112 while (char_count--) { 113 hash = hash * 31 + *chars++; 114 } 115 return static_cast<int32_t>(hash); 116} 117 118size_t ComputeModifiedUtf8Hash(const char* chars) { 119 size_t hash = 0; 120 while (*chars != '\0') { 121 hash = hash * 31 + *chars++; 122 } 123 return static_cast<int32_t>(hash); 124} 125 126int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16, 127 size_t utf16_length) { 128 for (;;) { 129 if (*utf8 == '\0') { 130 return (utf16_length == 0) ? 0 : -1; 131 } else if (utf16_length == 0) { 132 return 1; 133 } 134 135 const uint32_t pair = GetUtf16FromUtf8(&utf8); 136 137 // First compare the leading utf16 char. 138 const uint16_t lhs = GetLeadingUtf16Char(pair); 139 const uint16_t rhs = *utf16++; 140 --utf16_length; 141 if (lhs != rhs) { 142 return lhs > rhs ? 1 : -1; 143 } 144 145 // Then compare the trailing utf16 char. First check if there 146 // are any characters left to consume. 147 const uint16_t lhs2 = GetTrailingUtf16Char(pair); 148 if (lhs2 != 0) { 149 if (utf16_length == 0) { 150 return 1; 151 } 152 153 const uint16_t rhs2 = *utf16++; 154 --utf16_length; 155 if (lhs2 != rhs2) { 156 return lhs2 > rhs2 ? 1 : -1; 157 } 158 } 159 } 160} 161 162size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { 163 size_t result = 0; 164 while (char_count--) { 165 const uint16_t ch = *chars++; 166 if (ch > 0 && ch <= 0x7f) { 167 ++result; 168 } else if (ch >= 0xd800 && ch <= 0xdbff) { 169 if (char_count > 0) { 170 const uint16_t ch2 = *chars; 171 // If we find a properly paired surrogate, we emit it as a 4 byte 172 // UTF sequence. If we find an unpaired leading or trailing surrogate, 173 // we emit it as a 3 byte sequence like would have done earlier. 174 if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { 175 chars++; 176 char_count--; 177 178 result += 4; 179 } else { 180 result += 3; 181 } 182 } else { 183 // This implies we found an unpaired trailing surrogate at the end 184 // of a string. 185 result += 3; 186 } 187 } else if (ch > 0x7ff) { 188 result += 3; 189 } else { 190 result += 2; 191 } 192 } 193 return result; 194} 195 196} // namespace art 197