1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience 19 * functions. 20 * 21 * In most cases we populate the fields in the String object directly, 22 * rather than going through an instance field lookup. 23 */ 24#include "Dalvik.h" 25#include <stdlib.h> 26 27/* 28 * Initialize string globals. 29 * 30 * This isn't part of the VM init sequence because it's hard to get the 31 * timing right -- we need it to happen after java/lang/String has been 32 * loaded, but before anybody wants to use a string. It's easiest to 33 * just initialize it on first use. 34 * 35 * In some unusual circumstances (e.g. trying to throw an exception because 36 * String implements java/lang/CharSequence, but CharSequence doesn't exist) 37 * we can try to create an exception string internally before anything has 38 * really tried to use String. In that case we basically self-destruct. 39 */ 40static bool stringStartup() 41{ 42 if (gDvm.javaLangStringReady < 0) { 43 LOGE("ERROR: reentrant string initialization\n"); 44 assert(false); 45 return false; 46 } 47 assert(gDvm.javaLangStringReady == 0); 48 49 gDvm.javaLangStringReady = -1; 50 51 if (gDvm.classJavaLangString == NULL) 52 gDvm.classJavaLangString = 53 dvmFindSystemClassNoInit("Ljava/lang/String;"); 54 55 gDvm.offJavaLangString_value = 56 dvmFindFieldOffset(gDvm.classJavaLangString, "value", "[C"); 57 gDvm.offJavaLangString_count = 58 dvmFindFieldOffset(gDvm.classJavaLangString, "count", "I"); 59 gDvm.offJavaLangString_offset = 60 dvmFindFieldOffset(gDvm.classJavaLangString, "offset", "I"); 61 gDvm.offJavaLangString_hashCode = 62 dvmFindFieldOffset(gDvm.classJavaLangString, "hashCode", "I"); 63 64 if (gDvm.offJavaLangString_value < 0 || 65 gDvm.offJavaLangString_count < 0 || 66 gDvm.offJavaLangString_offset < 0 || 67 gDvm.offJavaLangString_hashCode < 0) 68 { 69 LOGE("VM-required field missing from java/lang/String\n"); 70 return false; 71 } 72 73 bool badValue = false; 74 if (gDvm.offJavaLangString_value != STRING_FIELDOFF_VALUE) { 75 LOGE("InlineNative: String.value offset = %d, expected %d\n", 76 gDvm.offJavaLangString_value, STRING_FIELDOFF_VALUE); 77 badValue = true; 78 } 79 if (gDvm.offJavaLangString_count != STRING_FIELDOFF_COUNT) { 80 LOGE("InlineNative: String.count offset = %d, expected %d\n", 81 gDvm.offJavaLangString_count, STRING_FIELDOFF_COUNT); 82 badValue = true; 83 } 84 if (gDvm.offJavaLangString_offset != STRING_FIELDOFF_OFFSET) { 85 LOGE("InlineNative: String.offset offset = %d, expected %d\n", 86 gDvm.offJavaLangString_offset, STRING_FIELDOFF_OFFSET); 87 badValue = true; 88 } 89 if (gDvm.offJavaLangString_hashCode != STRING_FIELDOFF_HASHCODE) { 90 LOGE("InlineNative: String.hashCode offset = %d, expected %d\n", 91 gDvm.offJavaLangString_hashCode, STRING_FIELDOFF_HASHCODE); 92 badValue = true; 93 } 94 if (badValue) 95 return false; 96 97 gDvm.javaLangStringReady = 1; 98 99 return true; 100} 101 102/* 103 * Discard heap-allocated storage. 104 */ 105void dvmStringShutdown() 106{ 107 // currently unused 108} 109 110/* 111 * Compute a hash code on a UTF-8 string, for use with internal hash tables. 112 * 113 * This may or may not yield the same results as the java/lang/String 114 * computeHashCode() function. (To make sure this doesn't get abused, 115 * I'm initializing the hash code to 1 so they *don't* match up.) 116 * 117 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute 118 * the hash with the result. That way, if something encoded the same 119 * character in two different ways, the hash value would be the same. For 120 * our purposes that isn't necessary. 121 */ 122u4 dvmComputeUtf8Hash(const char* utf8Str) 123{ 124 u4 hash = 1; 125 126 while (*utf8Str != '\0') 127 hash = hash * 31 + *utf8Str++; 128 129 return hash; 130} 131 132/* 133 * Like "strlen", but for strings encoded with "modified" UTF-8. 134 * 135 * The value returned is the number of characters, which may or may not 136 * be the same as the number of bytes. 137 * 138 * (If this needs optimizing, try: mask against 0xa0, shift right 5, 139 * get increment {1-3} from table of 8 values.) 140 */ 141int dvmUtf8Len(const char* utf8Str) 142{ 143 int ic, len = 0; 144 145 while ((ic = *utf8Str++) != '\0') { 146 len++; 147 if ((ic & 0x80) != 0) { 148 /* two- or three-byte encoding */ 149 utf8Str++; 150 if ((ic & 0x20) != 0) { 151 /* three-byte encoding */ 152 utf8Str++; 153 } 154 } 155 } 156 157 return len; 158} 159 160/* 161 * Convert a "modified" UTF-8 string to UTF-16. 162 */ 163void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str) 164{ 165 while (*utf8Str != '\0') 166 *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str); 167} 168 169/* 170 * Given a UTF-16 string, compute the length of the corresponding UTF-8 171 * string in bytes. 172 */ 173static int utf16_utf8ByteLen(const u2* utf16Str, int len) 174{ 175 int utf8Len = 0; 176 177 while (len--) { 178 unsigned int uic = *utf16Str++; 179 180 /* 181 * The most common case is (uic > 0 && uic <= 0x7f). 182 */ 183 if (uic == 0 || uic > 0x7f) { 184 if (uic > 0x07ff) 185 utf8Len += 3; 186 else /*(uic > 0x7f || uic == 0) */ 187 utf8Len += 2; 188 } else 189 utf8Len++; 190 } 191 return utf8Len; 192} 193 194/* 195 * Convert a UTF-16 string to UTF-8. 196 * 197 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(), 198 * not just "len". 199 */ 200static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len) 201{ 202 assert(len >= 0); 203 204 while (len--) { 205 unsigned int uic = *utf16Str++; 206 207 /* 208 * The most common case is (uic > 0 && uic <= 0x7f). 209 */ 210 if (uic == 0 || uic > 0x7f) { 211 if (uic > 0x07ff) { 212 *utf8Str++ = (uic >> 12) | 0xe0; 213 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80; 214 *utf8Str++ = (uic & 0x3f) | 0x80; 215 } else /*(uic > 0x7f || uic == 0)*/ { 216 *utf8Str++ = (uic >> 6) | 0xc0; 217 *utf8Str++ = (uic & 0x3f) | 0x80; 218 } 219 } else { 220 *utf8Str++ = uic; 221 } 222 } 223 224 *utf8Str = '\0'; 225} 226 227/* 228 * Use the java/lang/String.computeHashCode() algorithm. 229 */ 230static inline u4 dvmComputeUtf16Hash(const u2* utf16Str, int len) 231{ 232 u4 hash = 0; 233 234 while (len--) 235 hash = hash * 31 + *utf16Str++; 236 237 return hash; 238} 239u4 dvmComputeStringHash(StringObject* strObj) { 240 ArrayObject* chars = (ArrayObject*) dvmGetFieldObject((Object*) strObj, 241 STRING_FIELDOFF_VALUE); 242 int offset, len; 243 244 len = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_COUNT); 245 offset = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_OFFSET); 246 247 return dvmComputeUtf16Hash((u2*) chars->contents + offset, len); 248} 249 250/* 251 * Create a new java/lang/String object, using the string data in "utf8Str". 252 * 253 * Note that "allocFlags" affects both of the allocations here. If you 254 * use ALLOC_DONT_TRACK in a context where a GC could happen between the 255 * two allocations, you could lose the array reference. 256 * 257 * Returns NULL and throws an exception on failure. 258 */ 259StringObject* dvmCreateStringFromCstr(const char* utf8Str, int allocFlags) 260{ 261 assert(utf8Str != NULL); 262 263 return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str), 264 allocFlags); 265} 266 267/* 268 * Create a java/lang/String from a C string, given its UTF-16 length 269 * (number of UTF-16 code points). 270 * 271 * The caller must call dvmReleaseTrackedAlloc() on the return value or 272 * use a non-default value for "allocFlags". It is never appropriate 273 * to use ALLOC_DONT_TRACK with this function. 274 * 275 * Returns NULL and throws an exception on failure. 276 */ 277StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str, 278 u4 utf16Length, int allocFlags) 279{ 280 StringObject* newObj; 281 ArrayObject* chars; 282 u4 hashCode = 0; 283 284 //LOGV("Creating String from '%s'\n", utf8Str); 285 assert(allocFlags != ALLOC_DONT_TRACK); /* don't currently need */ 286 assert(utf8Str != NULL); 287 288 if (gDvm.javaLangStringReady <= 0) { 289 if (!stringStartup()) 290 return NULL; 291 } 292 293 /* init before alloc */ 294 if (!dvmIsClassInitialized(gDvm.classJavaLangString) && 295 !dvmInitClass(gDvm.classJavaLangString)) 296 { 297 return NULL; 298 } 299 300 newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString, 301 allocFlags); 302 if (newObj == NULL) 303 return NULL; 304 305 chars = dvmAllocPrimitiveArray('C', utf16Length, allocFlags); 306 if (chars == NULL) { 307 dvmReleaseTrackedAllocIFN((Object*) newObj, NULL, allocFlags); 308 return NULL; 309 } 310 dvmConvertUtf8ToUtf16((u2*)chars->contents, utf8Str); 311 hashCode = dvmComputeUtf16Hash((u2*) chars->contents, utf16Length); 312 313 dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE, 314 (Object*)chars); 315 dvmReleaseTrackedAllocIFN((Object*) chars, NULL, allocFlags); 316 dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, utf16Length); 317 dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode); 318 /* leave offset set to zero */ 319 320 /* debugging stuff */ 321 //dvmDumpObject((Object*)newObj); 322 //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, utf16Length * 2, 323 // kHexDumpMem); 324 325 /* caller may need to dvmReleaseTrackedAlloc(newObj) */ 326 return newObj; 327} 328 329/* 330 * Create a new java/lang/String object, using the Unicode data. 331 */ 332StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len) 333{ 334 StringObject* newObj; 335 ArrayObject* chars; 336 u4 hashCode = 0; 337 338 /* we allow a null pointer if the length is zero */ 339 assert(len == 0 || unichars != NULL); 340 341 if (gDvm.javaLangStringReady <= 0) { 342 if (!stringStartup()) 343 return NULL; 344 } 345 346 /* init before alloc */ 347 if (!dvmIsClassInitialized(gDvm.classJavaLangString) && 348 !dvmInitClass(gDvm.classJavaLangString)) 349 { 350 return NULL; 351 } 352 353 newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString, 354 ALLOC_DEFAULT); 355 if (newObj == NULL) 356 return NULL; 357 358 chars = dvmAllocPrimitiveArray('C', len, ALLOC_DEFAULT); 359 if (chars == NULL) { 360 dvmReleaseTrackedAlloc((Object*) newObj, NULL); 361 return NULL; 362 } 363 if (len > 0) 364 memcpy(chars->contents, unichars, len * sizeof(u2)); 365 hashCode = dvmComputeUtf16Hash((u2*) chars->contents, len); 366 367 dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE, 368 (Object*)chars); 369 dvmReleaseTrackedAlloc((Object*) chars, NULL); 370 dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, len); 371 dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode); 372 /* leave offset set to zero */ 373 374 /* debugging stuff */ 375 //dvmDumpObject((Object*)newObj); 376 //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, len*2, kHexDumpMem); 377 378 /* caller must dvmReleaseTrackedAlloc(newObj) */ 379 return newObj; 380} 381 382/* 383 * Create a new C string from a java/lang/String object. 384 * 385 * Returns NULL if the object is NULL. 386 */ 387char* dvmCreateCstrFromString(StringObject* jstr) 388{ 389 char* newStr; 390 ArrayObject* chars; 391 int len, byteLen, offset; 392 const u2* data; 393 394 assert(gDvm.javaLangStringReady > 0); 395 396 if (jstr == NULL) 397 return NULL; 398 399 len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT); 400 offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET); 401 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, 402 STRING_FIELDOFF_VALUE); 403 data = (const u2*) chars->contents + offset; 404 assert(offset + len <= (int) chars->length); 405 406 byteLen = utf16_utf8ByteLen(data, len); 407 newStr = (char*) malloc(byteLen+1); 408 if (newStr == NULL) 409 return NULL; 410 convertUtf16ToUtf8(newStr, data, len); 411 412 return newStr; 413} 414 415/* 416 * Create a UTF-8 C string from a region of a java/lang/String. (Used by 417 * the JNI GetStringUTFRegion call.) 418 */ 419void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len, 420 char* buf) 421{ 422 const u2* data; 423 424 data = dvmStringChars(jstr) + start; 425 convertUtf16ToUtf8(buf, data, len); 426} 427 428/* 429 * Compute the length, in modified UTF-8, of a java/lang/String object. 430 * 431 * Does not include the terminating null byte. 432 */ 433int dvmStringUtf8ByteLen(StringObject* jstr) 434{ 435 ArrayObject* chars; 436 int len, offset; 437 const u2* data; 438 439 assert(gDvm.javaLangStringReady > 0); 440 441 if (jstr == NULL) 442 return 0; // should we throw something? assert? 443 444 len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT); 445 offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET); 446 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, 447 STRING_FIELDOFF_VALUE); 448 data = (const u2*) chars->contents + offset; 449 assert(offset + len <= (int) chars->length); 450 451 return utf16_utf8ByteLen(data, len); 452} 453 454/* 455 * Get the string's length. 456 */ 457int dvmStringLen(StringObject* jstr) 458{ 459 return dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT); 460} 461 462/* 463 * Get the char[] object from the String. 464 */ 465ArrayObject* dvmStringCharArray(StringObject* jstr) 466{ 467 return (ArrayObject*) dvmGetFieldObject((Object*) jstr, 468 STRING_FIELDOFF_VALUE); 469} 470 471/* 472 * Get the string's data. 473 */ 474const u2* dvmStringChars(StringObject* jstr) 475{ 476 ArrayObject* chars; 477 int offset; 478 479 offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET); 480 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, 481 STRING_FIELDOFF_VALUE); 482 return (const u2*) chars->contents + offset; 483} 484 485 486/* 487 * Compare two String objects. 488 * 489 * This is a dvmHashTableLookup() callback. The function has already 490 * compared their hash values; we need to do a full compare to ensure 491 * that the strings really match. 492 */ 493int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2) 494{ 495 const StringObject* strObj1 = (const StringObject*) vstrObj1; 496 const StringObject* strObj2 = (const StringObject*) vstrObj2; 497 ArrayObject* chars1; 498 ArrayObject* chars2; 499 int len1, len2, offset1, offset2; 500 501 assert(gDvm.javaLangStringReady > 0); 502 503 /* get offset and length into char array; all values are in 16-bit units */ 504 len1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_COUNT); 505 offset1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_OFFSET); 506 len2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_COUNT); 507 offset2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_OFFSET); 508 if (len1 != len2) 509 return len1 - len2; 510 511 chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1, 512 STRING_FIELDOFF_VALUE); 513 chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2, 514 STRING_FIELDOFF_VALUE); 515 516 /* damage here actually indicates a broken java/lang/String */ 517 assert(offset1 + len1 <= (int) chars1->length); 518 assert(offset2 + len2 <= (int) chars2->length); 519 520 return memcmp((const u2*) chars1->contents + offset1, 521 (const u2*) chars2->contents + offset2, 522 len1 * sizeof(u2)); 523} 524 525