UtfString.cpp revision 60fc806b679a3655c228b4093058c59941a49cfe
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience 19 * functions. 20 * 21 * In most cases we populate the fields in the String object directly, 22 * rather than going through an instance field lookup. 23 */ 24#include "Dalvik.h" 25#include <stdlib.h> 26 27/* 28 * Allocate a new instance of the class String, performing first-use 29 * initialization of the class if necessary. Upon success, the 30 * returned value will have all its fields except hashCode already 31 * filled in, including a reference to a newly-allocated char[] for 32 * the contents, sized as given. Additionally, a reference to the 33 * chars array is stored to the pChars pointer. Callers must 34 * subsequently call dvmReleaseTrackedAlloc() on the result pointer. 35 * This function returns NULL on failure. 36 */ 37static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars) 38{ 39 /* 40 * The String class should have already gotten found (but not 41 * necessarily initialized) before making it here. We assert it 42 * explicitly, since historically speaking, we have had bugs with 43 * regard to when the class String gets set up. The assert helps 44 * make any regressions easier to diagnose. 45 */ 46 assert(gDvm.classJavaLangString != NULL); 47 48 if (!dvmIsClassInitialized(gDvm.classJavaLangString)) { 49 /* Perform first-time use initialization of the class. */ 50 if (!dvmInitClass(gDvm.classJavaLangString)) { 51 LOGE("FATAL: Could not initialize class String"); 52 dvmAbort(); 53 } 54 } 55 56 Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT); 57 if (result == NULL) { 58 return NULL; 59 } 60 61 ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT); 62 if (chars == NULL) { 63 dvmReleaseTrackedAlloc(result, NULL); 64 return NULL; 65 } 66 67 dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength); 68 dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars); 69 dvmReleaseTrackedAlloc((Object*) chars, NULL); 70 /* Leave offset and hashCode set to zero. */ 71 72 *pChars = chars; 73 return (StringObject*) result; 74} 75 76/* 77 * Compute a hash code on a UTF-8 string, for use with internal hash tables. 78 * 79 * This may or may not yield the same results as the java/lang/String 80 * computeHashCode() function. (To make sure this doesn't get abused, 81 * I'm initializing the hash code to 1 so they *don't* match up.) 82 * 83 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute 84 * the hash with the result. That way, if something encoded the same 85 * character in two different ways, the hash value would be the same. For 86 * our purposes that isn't necessary. 87 */ 88u4 dvmComputeUtf8Hash(const char* utf8Str) 89{ 90 u4 hash = 1; 91 92 while (*utf8Str != '\0') 93 hash = hash * 31 + *utf8Str++; 94 95 return hash; 96} 97 98/* 99 * Like "strlen", but for strings encoded with "modified" UTF-8. 100 * 101 * The value returned is the number of characters, which may or may not 102 * be the same as the number of bytes. 103 * 104 * (If this needs optimizing, try: mask against 0xa0, shift right 5, 105 * get increment {1-3} from table of 8 values.) 106 */ 107size_t dvmUtf8Len(const char* utf8Str) 108{ 109 size_t len = 0; 110 int ic; 111 112 while ((ic = *utf8Str++) != '\0') { 113 len++; 114 if ((ic & 0x80) != 0) { 115 /* two- or three-byte encoding */ 116 utf8Str++; 117 if ((ic & 0x20) != 0) { 118 /* three-byte encoding */ 119 utf8Str++; 120 } 121 } 122 } 123 124 return len; 125} 126 127/* 128 * Convert a "modified" UTF-8 string to UTF-16. 129 */ 130void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str) 131{ 132 while (*utf8Str != '\0') 133 *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str); 134} 135 136/* 137 * Given a UTF-16 string, compute the length of the corresponding UTF-8 138 * string in bytes. 139 */ 140static int utf16_utf8ByteLen(const u2* utf16Str, int len) 141{ 142 int utf8Len = 0; 143 144 while (len--) { 145 unsigned int uic = *utf16Str++; 146 147 /* 148 * The most common case is (uic > 0 && uic <= 0x7f). 149 */ 150 if (uic == 0 || uic > 0x7f) { 151 if (uic > 0x07ff) 152 utf8Len += 3; 153 else /*(uic > 0x7f || uic == 0) */ 154 utf8Len += 2; 155 } else 156 utf8Len++; 157 } 158 return utf8Len; 159} 160 161/* 162 * Convert a UTF-16 string to UTF-8. 163 * 164 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(), 165 * not just "len". 166 */ 167static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len) 168{ 169 assert(len >= 0); 170 171 while (len--) { 172 unsigned int uic = *utf16Str++; 173 174 /* 175 * The most common case is (uic > 0 && uic <= 0x7f). 176 */ 177 if (uic == 0 || uic > 0x7f) { 178 if (uic > 0x07ff) { 179 *utf8Str++ = (uic >> 12) | 0xe0; 180 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80; 181 *utf8Str++ = (uic & 0x3f) | 0x80; 182 } else /*(uic > 0x7f || uic == 0)*/ { 183 *utf8Str++ = (uic >> 6) | 0xc0; 184 *utf8Str++ = (uic & 0x3f) | 0x80; 185 } 186 } else { 187 *utf8Str++ = uic; 188 } 189 } 190 191 *utf8Str = '\0'; 192} 193 194/* 195 * Use the java/lang/String.computeHashCode() algorithm. 196 */ 197static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len) 198{ 199 u4 hash = 0; 200 201 while (len--) 202 hash = hash * 31 + *utf16Str++; 203 204 return hash; 205} 206 207u4 dvmComputeStringHash(StringObject* strObj) { 208 int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE); 209 if (hashCode != 0) { 210 return hashCode; 211 } 212 int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT); 213 int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET); 214 ArrayObject* chars = (ArrayObject*) dvmGetFieldObject(strObj, 215 STRING_FIELDOFF_VALUE); 216 hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len); 217 dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode); 218 return hashCode; 219} 220 221/* 222 * Create a new java/lang/String object, using the string data in "utf8Str". 223 * 224 * The caller must call dvmReleaseTrackedAlloc() on the return value. 225 * 226 * Returns NULL and throws an exception on failure. 227 */ 228StringObject* dvmCreateStringFromCstr(const char* utf8Str) 229{ 230 assert(utf8Str != NULL); 231 return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str)); 232} 233 234/* 235 * Create a java/lang/String from a C string, given its UTF-16 length 236 * (number of UTF-16 code points). 237 * 238 * The caller must call dvmReleaseTrackedAlloc() on the return value. 239 * 240 * Returns NULL and throws an exception on failure. 241 */ 242StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str, 243 size_t utf16Length) 244{ 245 assert(utf8Str != NULL); 246 247 ArrayObject* chars; 248 StringObject* newObj = makeStringObject(utf16Length, &chars); 249 if (newObj == NULL) { 250 return NULL; 251 } 252 253 dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str); 254 255 u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length); 256 dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode); 257 258 return newObj; 259} 260 261/* 262 * Create a new java/lang/String object, using the given Unicode data. 263 */ 264StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len) 265{ 266 /* We allow a NULL pointer if the length is zero. */ 267 assert(len == 0 || unichars != NULL); 268 269 ArrayObject* chars; 270 StringObject* newObj = makeStringObject(len, &chars); 271 if (newObj == NULL) { 272 return NULL; 273 } 274 275 if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2)); 276 277 u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len); 278 dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode); 279 280 return newObj; 281} 282 283/* 284 * Create a new C string from a java/lang/String object. 285 * 286 * Returns NULL if the object is NULL. 287 */ 288char* dvmCreateCstrFromString(StringObject* jstr) 289{ 290 char* newStr; 291 ArrayObject* chars; 292 int len, byteLen, offset; 293 const u2* data; 294 295 assert(gDvm.classJavaLangString != NULL); 296 297 if (jstr == NULL) 298 return NULL; 299 300 len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT); 301 offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET); 302 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, 303 STRING_FIELDOFF_VALUE); 304 data = (const u2*)(void*)chars->contents + offset; 305 assert(offset + len <= (int) chars->length); 306 307 byteLen = utf16_utf8ByteLen(data, len); 308 newStr = (char*) malloc(byteLen+1); 309 if (newStr == NULL) 310 return NULL; 311 convertUtf16ToUtf8(newStr, data, len); 312 313 return newStr; 314} 315 316/* 317 * Create a UTF-8 C string from a region of a java/lang/String. (Used by 318 * the JNI GetStringUTFRegion call.) 319 */ 320void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len, 321 char* buf) 322{ 323 const u2* data; 324 325 data = dvmStringChars(jstr) + start; 326 convertUtf16ToUtf8(buf, data, len); 327} 328 329/* 330 * Compute the length, in modified UTF-8, of a java/lang/String object. 331 * 332 * Does not include the terminating null byte. 333 */ 334int dvmStringUtf8ByteLen(StringObject* jstr) 335{ 336 ArrayObject* chars; 337 int len, offset; 338 const u2* data; 339 340 assert(gDvm.classJavaLangString != NULL); 341 342 if (jstr == NULL) 343 return 0; // should we throw something? assert? 344 345 len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT); 346 offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET); 347 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, 348 STRING_FIELDOFF_VALUE); 349 data = (const u2*)(void*)chars->contents + offset; 350 assert(offset + len <= (int) chars->length); 351 352 return utf16_utf8ByteLen(data, len); 353} 354 355/* 356 * Get the string's length. 357 */ 358int dvmStringLen(StringObject* jstr) 359{ 360 return dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT); 361} 362 363/* 364 * Get the char[] object from the String. 365 */ 366ArrayObject* dvmStringCharArray(StringObject* jstr) 367{ 368 return (ArrayObject*) dvmGetFieldObject((Object*) jstr, 369 STRING_FIELDOFF_VALUE); 370} 371 372/* 373 * Get the string's data. 374 */ 375const u2* dvmStringChars(StringObject* jstr) 376{ 377 ArrayObject* chars; 378 int offset; 379 380 offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET); 381 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, 382 STRING_FIELDOFF_VALUE); 383 return (const u2*)(void*)chars->contents + offset; 384} 385 386 387/* 388 * Compare two String objects. 389 * 390 * This is a dvmHashTableLookup() callback. The function has already 391 * compared their hash values; we need to do a full compare to ensure 392 * that the strings really match. 393 */ 394int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2) 395{ 396 const StringObject* strObj1 = (const StringObject*) vstrObj1; 397 const StringObject* strObj2 = (const StringObject*) vstrObj2; 398 ArrayObject* chars1; 399 ArrayObject* chars2; 400 int len1, len2, offset1, offset2; 401 402 assert(gDvm.classJavaLangString != NULL); 403 404 /* get offset and length into char array; all values are in 16-bit units */ 405 len1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_COUNT); 406 offset1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_OFFSET); 407 len2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_COUNT); 408 offset2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_OFFSET); 409 if (len1 != len2) 410 return len1 - len2; 411 412 chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1, 413 STRING_FIELDOFF_VALUE); 414 chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2, 415 STRING_FIELDOFF_VALUE); 416 417 /* damage here actually indicates a broken java/lang/String */ 418 assert(offset1 + len1 <= (int) chars1->length); 419 assert(offset2 + len2 <= (int) chars2->length); 420 421 return memcmp((const u2*)(void*)chars1->contents + offset1, 422 (const u2*)(void*)chars2->contents + offset2, 423 len1 * sizeof(u2)); 424} 425 426ArrayObject* dvmCreateStringArray(const char** strings, size_t length) 427{ 428 Thread* self = dvmThreadSelf(); 429 430 /* 431 * Allocate an array to hold the String objects. 432 */ 433 ClassObject* elementClass = 434 dvmFindArrayClassForElement(gDvm.classJavaLangString); 435 ArrayObject* stringArray = 436 dvmAllocArrayByClass(elementClass, length, ALLOC_DEFAULT); 437 if (stringArray == NULL) { 438 /* probably OOM */ 439 assert(dvmCheckException(self)); 440 return NULL; 441 } 442 443 /* 444 * Create the individual String objects and add them to the array. 445 */ 446 for (size_t i = 0; i < length; i++) { 447 Object* str = 448 (Object*) dvmCreateStringFromCstr(strings[i]); 449 if (str == NULL) { 450 /* probably OOM; drop out now */ 451 assert(dvmCheckException(self)); 452 dvmReleaseTrackedAlloc((Object*) stringArray, self); 453 return NULL; 454 } 455 dvmSetObjectArrayElement(stringArray, i, str); 456 /* stored in tracked array, okay to release */ 457 dvmReleaseTrackedAlloc(str, self); 458 } 459 460 return stringArray; 461} 462