1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
19 * functions.
20 *
21 * In most cases we populate the fields in the String object directly,
22 * rather than going through an instance field lookup.
23 */
24#include "Dalvik.h"
25#include <stdlib.h>
26
27/*
28 * Initialize string globals.
29 *
30 * This isn't part of the VM init sequence because it's hard to get the
31 * timing right -- we need it to happen after java/lang/String has been
32 * loaded, but before anybody wants to use a string.  It's easiest to
33 * just initialize it on first use.
34 *
35 * In some unusual circumstances (e.g. trying to throw an exception because
36 * String implements java/lang/CharSequence, but CharSequence doesn't exist)
37 * we can try to create an exception string internally before anything has
38 * really tried to use String.  In that case we basically self-destruct.
39 */
40static bool stringStartup()
41{
42    if (gDvm.javaLangStringReady < 0) {
43        LOGE("ERROR: reentrant string initialization\n");
44        assert(false);
45        return false;
46    }
47    assert(gDvm.javaLangStringReady == 0);
48
49    gDvm.javaLangStringReady = -1;
50
51    if (gDvm.classJavaLangString == NULL)
52        gDvm.classJavaLangString =
53            dvmFindSystemClassNoInit("Ljava/lang/String;");
54
55    gDvm.offJavaLangString_value =
56        dvmFindFieldOffset(gDvm.classJavaLangString, "value", "[C");
57    gDvm.offJavaLangString_count =
58        dvmFindFieldOffset(gDvm.classJavaLangString, "count", "I");
59    gDvm.offJavaLangString_offset =
60        dvmFindFieldOffset(gDvm.classJavaLangString, "offset", "I");
61    gDvm.offJavaLangString_hashCode =
62        dvmFindFieldOffset(gDvm.classJavaLangString, "hashCode", "I");
63
64    if (gDvm.offJavaLangString_value < 0 ||
65        gDvm.offJavaLangString_count < 0 ||
66        gDvm.offJavaLangString_offset < 0 ||
67        gDvm.offJavaLangString_hashCode < 0)
68    {
69        LOGE("VM-required field missing from java/lang/String\n");
70        return false;
71    }
72
73    bool badValue = false;
74    if (gDvm.offJavaLangString_value != STRING_FIELDOFF_VALUE) {
75        LOGE("InlineNative: String.value offset = %d, expected %d\n",
76            gDvm.offJavaLangString_value, STRING_FIELDOFF_VALUE);
77        badValue = true;
78    }
79    if (gDvm.offJavaLangString_count != STRING_FIELDOFF_COUNT) {
80        LOGE("InlineNative: String.count offset = %d, expected %d\n",
81            gDvm.offJavaLangString_count, STRING_FIELDOFF_COUNT);
82        badValue = true;
83    }
84    if (gDvm.offJavaLangString_offset != STRING_FIELDOFF_OFFSET) {
85        LOGE("InlineNative: String.offset offset = %d, expected %d\n",
86            gDvm.offJavaLangString_offset, STRING_FIELDOFF_OFFSET);
87        badValue = true;
88    }
89    if (gDvm.offJavaLangString_hashCode != STRING_FIELDOFF_HASHCODE) {
90        LOGE("InlineNative: String.hashCode offset = %d, expected %d\n",
91            gDvm.offJavaLangString_hashCode, STRING_FIELDOFF_HASHCODE);
92        badValue = true;
93    }
94    if (badValue)
95        return false;
96
97    gDvm.javaLangStringReady = 1;
98
99    return true;
100}
101
/*
 * Shutdown hook for the string code, intended for discarding any
 * heap-allocated storage.
 *
 * The body is currently empty, so calling this has no effect.
 */
void dvmStringShutdown()
{
    /* intentionally empty */
}
109
110/*
111 * Compute a hash code on a UTF-8 string, for use with internal hash tables.
112 *
113 * This may or may not yield the same results as the java/lang/String
114 * computeHashCode() function.  (To make sure this doesn't get abused,
115 * I'm initializing the hash code to 1 so they *don't* match up.)
116 *
117 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
118 * the hash with the result.  That way, if something encoded the same
119 * character in two different ways, the hash value would be the same.  For
120 * our purposes that isn't necessary.
121 */
122u4 dvmComputeUtf8Hash(const char* utf8Str)
123{
124    u4 hash = 1;
125
126    while (*utf8Str != '\0')
127        hash = hash * 31 + *utf8Str++;
128
129    return hash;
130}
131
/*
 * "strlen" for strings encoded with "modified" UTF-8.
 *
 * Returns the number of encoded characters (one per one-, two- or
 * three-byte sequence), which may or may not equal the number of bytes.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes
 * are skipped without being examined, so the input is expected to be
 * well-formed.
 *
 * (If this needs optimizing, try: mask against 0xa0, shift right 5,
 * get increment {1-3} from table of 8 values.)
 */
int dvmUtf8Len(const char* utf8Str)
{
    const char* cursor = utf8Str;
    int count = 0;

    for (;;) {
        int lead = *cursor++;

        if (lead == '\0')
            break;
        count++;

        if ((lead & 0x80) == 0)
            continue;       /* single-byte encoding, nothing to skip */

        /* bit 0x20 distinguishes three-byte from two-byte encodings */
        cursor += ((lead & 0x20) != 0) ? 2 : 1;
    }

    return count;
}
159
160/*
161 * Convert a "modified" UTF-8 string to UTF-16.
162 */
163void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
164{
165    while (*utf8Str != '\0')
166        *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
167}
168
169/*
170 * Given a UTF-16 string, compute the length of the corresponding UTF-8
171 * string in bytes.
172 */
173static int utf16_utf8ByteLen(const u2* utf16Str, int len)
174{
175    int utf8Len = 0;
176
177    while (len--) {
178        unsigned int uic = *utf16Str++;
179
180        /*
181         * The most common case is (uic > 0 && uic <= 0x7f).
182         */
183        if (uic == 0 || uic > 0x7f) {
184            if (uic > 0x07ff)
185                utf8Len += 3;
186            else /*(uic > 0x7f || uic == 0) */
187                utf8Len += 2;
188        } else
189            utf8Len++;
190    }
191    return utf8Len;
192}
193
194/*
195 * Convert a UTF-16 string to UTF-8.
196 *
197 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
198 * not just "len".
199 */
200static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
201{
202    assert(len >= 0);
203
204    while (len--) {
205        unsigned int uic = *utf16Str++;
206
207        /*
208         * The most common case is (uic > 0 && uic <= 0x7f).
209         */
210        if (uic == 0 || uic > 0x7f) {
211            if (uic > 0x07ff) {
212                *utf8Str++ = (uic >> 12) | 0xe0;
213                *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
214                *utf8Str++ = (uic & 0x3f) | 0x80;
215            } else /*(uic > 0x7f || uic == 0)*/ {
216                *utf8Str++ = (uic >> 6) | 0xc0;
217                *utf8Str++ = (uic & 0x3f) | 0x80;
218            }
219        } else {
220            *utf8Str++ = uic;
221        }
222    }
223
224    *utf8Str = '\0';
225}
226
227/*
228 * Use the java/lang/String.computeHashCode() algorithm.
229 */
230static inline u4 dvmComputeUtf16Hash(const u2* utf16Str, int len)
231{
232    u4 hash = 0;
233
234    while (len--)
235        hash = hash * 31 + *utf16Str++;
236
237    return hash;
238}
239u4 dvmComputeStringHash(StringObject* strObj) {
240    ArrayObject* chars = (ArrayObject*) dvmGetFieldObject((Object*) strObj,
241                                STRING_FIELDOFF_VALUE);
242    int offset, len;
243
244    len = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_COUNT);
245    offset = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_OFFSET);
246
247    return dvmComputeUtf16Hash((u2*) chars->contents + offset, len);
248}
249
250/*
251 * Create a new java/lang/String object, using the string data in "utf8Str".
252 *
253 * Note that "allocFlags" affects both of the allocations here.  If you
254 * use ALLOC_DONT_TRACK in a context where a GC could happen between the
255 * two allocations, you could lose the array reference.
256 *
257 * Returns NULL and throws an exception on failure.
258 */
259StringObject* dvmCreateStringFromCstr(const char* utf8Str, int allocFlags)
260{
261    assert(utf8Str != NULL);
262
263    return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str),
264            allocFlags);
265}
266
267/*
268 * Create a java/lang/String from a C string, given its UTF-16 length
269 * (number of UTF-16 code points).
270 *
271 * The caller must call dvmReleaseTrackedAlloc() on the return value or
272 * use a non-default value for "allocFlags".  It is never appropriate
273 * to use ALLOC_DONT_TRACK with this function.
274 *
275 * Returns NULL and throws an exception on failure.
276 */
277StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
278    u4 utf16Length, int allocFlags)
279{
280    StringObject* newObj;
281    ArrayObject* chars;
282    u4 hashCode = 0;
283
284    //LOGV("Creating String from '%s'\n", utf8Str);
285    assert(allocFlags != ALLOC_DONT_TRACK);     /* don't currently need */
286    assert(utf8Str != NULL);
287
288    if (gDvm.javaLangStringReady <= 0) {
289        if (!stringStartup())
290            return NULL;
291    }
292
293    /* init before alloc */
294    if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
295        !dvmInitClass(gDvm.classJavaLangString))
296    {
297        return NULL;
298    }
299
300    newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
301                allocFlags);
302    if (newObj == NULL)
303        return NULL;
304
305    chars = dvmAllocPrimitiveArray('C', utf16Length, allocFlags);
306    if (chars == NULL) {
307        dvmReleaseTrackedAllocIFN((Object*) newObj, NULL, allocFlags);
308        return NULL;
309    }
310    dvmConvertUtf8ToUtf16((u2*)chars->contents, utf8Str);
311    hashCode = dvmComputeUtf16Hash((u2*) chars->contents, utf16Length);
312
313    dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE,
314        (Object*)chars);
315    dvmReleaseTrackedAllocIFN((Object*) chars, NULL, allocFlags);
316    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, utf16Length);
317    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
318    /* leave offset set to zero */
319
320    /* debugging stuff */
321    //dvmDumpObject((Object*)newObj);
322    //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, utf16Length * 2,
323    //    kHexDumpMem);
324
325    /* caller may need to dvmReleaseTrackedAlloc(newObj) */
326    return newObj;
327}
328
329/*
330 * Create a new java/lang/String object, using the Unicode data.
331 */
332StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
333{
334    StringObject* newObj;
335    ArrayObject* chars;
336    u4 hashCode = 0;
337
338    /* we allow a null pointer if the length is zero */
339    assert(len == 0 || unichars != NULL);
340
341    if (gDvm.javaLangStringReady <= 0) {
342        if (!stringStartup())
343            return NULL;
344    }
345
346    /* init before alloc */
347    if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
348        !dvmInitClass(gDvm.classJavaLangString))
349    {
350        return NULL;
351    }
352
353    newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
354        ALLOC_DEFAULT);
355    if (newObj == NULL)
356        return NULL;
357
358    chars = dvmAllocPrimitiveArray('C', len, ALLOC_DEFAULT);
359    if (chars == NULL) {
360        dvmReleaseTrackedAlloc((Object*) newObj, NULL);
361        return NULL;
362    }
363    if (len > 0)
364        memcpy(chars->contents, unichars, len * sizeof(u2));
365    hashCode = dvmComputeUtf16Hash((u2*) chars->contents, len);
366
367    dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE,
368        (Object*)chars);
369    dvmReleaseTrackedAlloc((Object*) chars, NULL);
370    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, len);
371    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
372    /* leave offset set to zero */
373
374    /* debugging stuff */
375    //dvmDumpObject((Object*)newObj);
376    //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, len*2, kHexDumpMem);
377
378    /* caller must dvmReleaseTrackedAlloc(newObj) */
379    return newObj;
380}
381
382/*
383 * Create a new C string from a java/lang/String object.
384 *
385 * Returns NULL if the object is NULL.
386 */
387char* dvmCreateCstrFromString(StringObject* jstr)
388{
389    char* newStr;
390    ArrayObject* chars;
391    int len, byteLen, offset;
392    const u2* data;
393
394    assert(gDvm.javaLangStringReady > 0);
395
396    if (jstr == NULL)
397        return NULL;
398
399    len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
400    offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
401    chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
402                                STRING_FIELDOFF_VALUE);
403    data = (const u2*) chars->contents + offset;
404    assert(offset + len <= (int) chars->length);
405
406    byteLen = utf16_utf8ByteLen(data, len);
407    newStr = (char*) malloc(byteLen+1);
408    if (newStr == NULL)
409        return NULL;
410    convertUtf16ToUtf8(newStr, data, len);
411
412    return newStr;
413}
414
415/*
416 * Create a UTF-8 C string from a region of a java/lang/String.  (Used by
417 * the JNI GetStringUTFRegion call.)
418 */
419void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len,
420    char* buf)
421{
422    const u2* data;
423
424    data = dvmStringChars(jstr) + start;
425    convertUtf16ToUtf8(buf, data, len);
426}
427
428/*
429 * Compute the length, in modified UTF-8, of a java/lang/String object.
430 *
431 * Does not include the terminating null byte.
432 */
433int dvmStringUtf8ByteLen(StringObject* jstr)
434{
435    ArrayObject* chars;
436    int len, offset;
437    const u2* data;
438
439    assert(gDvm.javaLangStringReady > 0);
440
441    if (jstr == NULL)
442        return 0;       // should we throw something?  assert?
443
444    len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
445    offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
446    chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
447                                STRING_FIELDOFF_VALUE);
448    data = (const u2*) chars->contents + offset;
449    assert(offset + len <= (int) chars->length);
450
451    return utf16_utf8ByteLen(data, len);
452}
453
454/*
455 * Get the string's length.
456 */
457int dvmStringLen(StringObject* jstr)
458{
459    return dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
460}
461
462/*
463 * Get the char[] object from the String.
464 */
465ArrayObject* dvmStringCharArray(StringObject* jstr)
466{
467    return (ArrayObject*) dvmGetFieldObject((Object*) jstr,
468                                STRING_FIELDOFF_VALUE);
469}
470
471/*
472 * Get the string's data.
473 */
474const u2* dvmStringChars(StringObject* jstr)
475{
476    ArrayObject* chars;
477    int offset;
478
479    offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
480    chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
481                                STRING_FIELDOFF_VALUE);
482    return (const u2*) chars->contents + offset;
483}
484
485
486/*
487 * Compare two String objects.
488 *
489 * This is a dvmHashTableLookup() callback.  The function has already
490 * compared their hash values; we need to do a full compare to ensure
491 * that the strings really match.
492 */
493int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
494{
495    const StringObject* strObj1 = (const StringObject*) vstrObj1;
496    const StringObject* strObj2 = (const StringObject*) vstrObj2;
497    ArrayObject* chars1;
498    ArrayObject* chars2;
499    int len1, len2, offset1, offset2;
500
501    assert(gDvm.javaLangStringReady > 0);
502
503    /* get offset and length into char array; all values are in 16-bit units */
504    len1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_COUNT);
505    offset1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_OFFSET);
506    len2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_COUNT);
507    offset2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_OFFSET);
508    if (len1 != len2)
509        return len1 - len2;
510
511    chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1,
512                                STRING_FIELDOFF_VALUE);
513    chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2,
514                                STRING_FIELDOFF_VALUE);
515
516    /* damage here actually indicates a broken java/lang/String */
517    assert(offset1 + len1 <= (int) chars1->length);
518    assert(offset2 + len2 <= (int) chars2->length);
519
520    return memcmp((const u2*) chars1->contents + offset1,
521                  (const u2*) chars2->contents + offset2,
522                  len1 * sizeof(u2));
523}
524
525