UtfString.cpp revision 60fc806b679a3655c228b4093058c59941a49cfe
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
19 * functions.
20 *
21 * In most cases we populate the fields in the String object directly,
22 * rather than going through an instance field lookup.
23 */
24#include "Dalvik.h"
25#include <stdlib.h>
26
27/*
28 * Allocate a new instance of the class String, performing first-use
29 * initialization of the class if necessary. Upon success, the
30 * returned value will have all its fields except hashCode already
31 * filled in, including a reference to a newly-allocated char[] for
32 * the contents, sized as given. Additionally, a reference to the
33 * chars array is stored to the pChars pointer. Callers must
34 * subsequently call dvmReleaseTrackedAlloc() on the result pointer.
35 * This function returns NULL on failure.
36 */
37static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars)
38{
39    /*
40     * The String class should have already gotten found (but not
41     * necessarily initialized) before making it here. We assert it
42     * explicitly, since historically speaking, we have had bugs with
43     * regard to when the class String gets set up. The assert helps
44     * make any regressions easier to diagnose.
45     */
46    assert(gDvm.classJavaLangString != NULL);
47
48    if (!dvmIsClassInitialized(gDvm.classJavaLangString)) {
49        /* Perform first-time use initialization of the class. */
50        if (!dvmInitClass(gDvm.classJavaLangString)) {
51            LOGE("FATAL: Could not initialize class String");
52            dvmAbort();
53        }
54    }
55
56    Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT);
57    if (result == NULL) {
58        return NULL;
59    }
60
61    ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT);
62    if (chars == NULL) {
63        dvmReleaseTrackedAlloc(result, NULL);
64        return NULL;
65    }
66
67    dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength);
68    dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars);
69    dvmReleaseTrackedAlloc((Object*) chars, NULL);
70    /* Leave offset and hashCode set to zero. */
71
72    *pChars = chars;
73    return (StringObject*) result;
74}
75
76/*
77 * Compute a hash code on a UTF-8 string, for use with internal hash tables.
78 *
79 * This may or may not yield the same results as the java/lang/String
80 * computeHashCode() function.  (To make sure this doesn't get abused,
81 * I'm initializing the hash code to 1 so they *don't* match up.)
82 *
83 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
84 * the hash with the result.  That way, if something encoded the same
85 * character in two different ways, the hash value would be the same.  For
86 * our purposes that isn't necessary.
87 */
88u4 dvmComputeUtf8Hash(const char* utf8Str)
89{
90    u4 hash = 1;
91
92    while (*utf8Str != '\0')
93        hash = hash * 31 + *utf8Str++;
94
95    return hash;
96}
97
/*
 * Like "strlen", but for strings encoded with "modified" UTF-8.
 *
 * Returns the number of encoded characters, which may differ from the
 * number of bytes. Only the lead byte of each sequence is inspected: a
 * set high bit means at least one trailing byte follows, and bit 0x20
 * additionally set means a second one does. Trailing bytes are skipped
 * without being examined, so the input must be well-formed.
 *
 * (If this needs optimizing, try: mask against 0xa0, shift right 5,
 * get increment {1-3} from table of 8 values.)
 */
size_t dvmUtf8Len(const char* utf8Str)
{
    size_t count = 0;
    const char* p = utf8Str;

    for (;;) {
        int lead = *p++;
        if (lead == '\0') {
            break;
        }

        count++;

        if ((lead & 0x80) == 0) {
            /* one-byte encoding */
            continue;
        }

        /* skip one trailing byte for two-byte forms, two for three-byte */
        p += ((lead & 0x20) != 0) ? 2 : 1;
    }

    return count;
}
126
127/*
128 * Convert a "modified" UTF-8 string to UTF-16.
129 */
130void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
131{
132    while (*utf8Str != '\0')
133        *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
134}
135
136/*
137 * Given a UTF-16 string, compute the length of the corresponding UTF-8
138 * string in bytes.
139 */
140static int utf16_utf8ByteLen(const u2* utf16Str, int len)
141{
142    int utf8Len = 0;
143
144    while (len--) {
145        unsigned int uic = *utf16Str++;
146
147        /*
148         * The most common case is (uic > 0 && uic <= 0x7f).
149         */
150        if (uic == 0 || uic > 0x7f) {
151            if (uic > 0x07ff)
152                utf8Len += 3;
153            else /*(uic > 0x7f || uic == 0) */
154                utf8Len += 2;
155        } else
156            utf8Len++;
157    }
158    return utf8Len;
159}
160
161/*
162 * Convert a UTF-16 string to UTF-8.
163 *
164 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
165 * not just "len".
166 */
167static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
168{
169    assert(len >= 0);
170
171    while (len--) {
172        unsigned int uic = *utf16Str++;
173
174        /*
175         * The most common case is (uic > 0 && uic <= 0x7f).
176         */
177        if (uic == 0 || uic > 0x7f) {
178            if (uic > 0x07ff) {
179                *utf8Str++ = (uic >> 12) | 0xe0;
180                *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
181                *utf8Str++ = (uic & 0x3f) | 0x80;
182            } else /*(uic > 0x7f || uic == 0)*/ {
183                *utf8Str++ = (uic >> 6) | 0xc0;
184                *utf8Str++ = (uic & 0x3f) | 0x80;
185            }
186        } else {
187            *utf8Str++ = uic;
188        }
189    }
190
191    *utf8Str = '\0';
192}
193
194/*
195 * Use the java/lang/String.computeHashCode() algorithm.
196 */
197static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len)
198{
199    u4 hash = 0;
200
201    while (len--)
202        hash = hash * 31 + *utf16Str++;
203
204    return hash;
205}
206
207u4 dvmComputeStringHash(StringObject* strObj) {
208    int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE);
209    if (hashCode != 0) {
210      return hashCode;
211    }
212    int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT);
213    int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET);
214    ArrayObject* chars = (ArrayObject*) dvmGetFieldObject(strObj,
215                                STRING_FIELDOFF_VALUE);
216    hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len);
217    dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode);
218    return hashCode;
219}
220
221/*
222 * Create a new java/lang/String object, using the string data in "utf8Str".
223 *
224 * The caller must call dvmReleaseTrackedAlloc() on the return value.
225 *
226 * Returns NULL and throws an exception on failure.
227 */
228StringObject* dvmCreateStringFromCstr(const char* utf8Str)
229{
230    assert(utf8Str != NULL);
231    return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str));
232}
233
234/*
235 * Create a java/lang/String from a C string, given its UTF-16 length
236 * (number of UTF-16 code points).
237 *
238 * The caller must call dvmReleaseTrackedAlloc() on the return value.
239 *
240 * Returns NULL and throws an exception on failure.
241 */
242StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
243    size_t utf16Length)
244{
245    assert(utf8Str != NULL);
246
247    ArrayObject* chars;
248    StringObject* newObj = makeStringObject(utf16Length, &chars);
249    if (newObj == NULL) {
250        return NULL;
251    }
252
253    dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str);
254
255    u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length);
256    dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode);
257
258    return newObj;
259}
260
261/*
262 * Create a new java/lang/String object, using the given Unicode data.
263 */
264StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
265{
266    /* We allow a NULL pointer if the length is zero. */
267    assert(len == 0 || unichars != NULL);
268
269    ArrayObject* chars;
270    StringObject* newObj = makeStringObject(len, &chars);
271    if (newObj == NULL) {
272        return NULL;
273    }
274
275    if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2));
276
277    u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len);
278    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
279
280    return newObj;
281}
282
283/*
284 * Create a new C string from a java/lang/String object.
285 *
286 * Returns NULL if the object is NULL.
287 */
288char* dvmCreateCstrFromString(StringObject* jstr)
289{
290    char* newStr;
291    ArrayObject* chars;
292    int len, byteLen, offset;
293    const u2* data;
294
295    assert(gDvm.classJavaLangString != NULL);
296
297    if (jstr == NULL)
298        return NULL;
299
300    len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
301    offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
302    chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
303                                STRING_FIELDOFF_VALUE);
304    data = (const u2*)(void*)chars->contents + offset;
305    assert(offset + len <= (int) chars->length);
306
307    byteLen = utf16_utf8ByteLen(data, len);
308    newStr = (char*) malloc(byteLen+1);
309    if (newStr == NULL)
310        return NULL;
311    convertUtf16ToUtf8(newStr, data, len);
312
313    return newStr;
314}
315
316/*
317 * Create a UTF-8 C string from a region of a java/lang/String.  (Used by
318 * the JNI GetStringUTFRegion call.)
319 */
320void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len,
321    char* buf)
322{
323    const u2* data;
324
325    data = dvmStringChars(jstr) + start;
326    convertUtf16ToUtf8(buf, data, len);
327}
328
329/*
330 * Compute the length, in modified UTF-8, of a java/lang/String object.
331 *
332 * Does not include the terminating null byte.
333 */
334int dvmStringUtf8ByteLen(StringObject* jstr)
335{
336    ArrayObject* chars;
337    int len, offset;
338    const u2* data;
339
340    assert(gDvm.classJavaLangString != NULL);
341
342    if (jstr == NULL)
343        return 0;       // should we throw something?  assert?
344
345    len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
346    offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
347    chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
348                                STRING_FIELDOFF_VALUE);
349    data = (const u2*)(void*)chars->contents + offset;
350    assert(offset + len <= (int) chars->length);
351
352    return utf16_utf8ByteLen(data, len);
353}
354
355/*
356 * Get the string's length.
357 */
358int dvmStringLen(StringObject* jstr)
359{
360    return dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
361}
362
363/*
364 * Get the char[] object from the String.
365 */
366ArrayObject* dvmStringCharArray(StringObject* jstr)
367{
368    return (ArrayObject*) dvmGetFieldObject((Object*) jstr,
369                                STRING_FIELDOFF_VALUE);
370}
371
372/*
373 * Get the string's data.
374 */
375const u2* dvmStringChars(StringObject* jstr)
376{
377    ArrayObject* chars;
378    int offset;
379
380    offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
381    chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
382                                STRING_FIELDOFF_VALUE);
383    return (const u2*)(void*)chars->contents + offset;
384}
385
386
387/*
388 * Compare two String objects.
389 *
390 * This is a dvmHashTableLookup() callback.  The function has already
391 * compared their hash values; we need to do a full compare to ensure
392 * that the strings really match.
393 */
394int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
395{
396    const StringObject* strObj1 = (const StringObject*) vstrObj1;
397    const StringObject* strObj2 = (const StringObject*) vstrObj2;
398    ArrayObject* chars1;
399    ArrayObject* chars2;
400    int len1, len2, offset1, offset2;
401
402    assert(gDvm.classJavaLangString != NULL);
403
404    /* get offset and length into char array; all values are in 16-bit units */
405    len1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_COUNT);
406    offset1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_OFFSET);
407    len2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_COUNT);
408    offset2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_OFFSET);
409    if (len1 != len2)
410        return len1 - len2;
411
412    chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1,
413                                STRING_FIELDOFF_VALUE);
414    chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2,
415                                STRING_FIELDOFF_VALUE);
416
417    /* damage here actually indicates a broken java/lang/String */
418    assert(offset1 + len1 <= (int) chars1->length);
419    assert(offset2 + len2 <= (int) chars2->length);
420
421    return memcmp((const u2*)(void*)chars1->contents + offset1,
422                  (const u2*)(void*)chars2->contents + offset2,
423                  len1 * sizeof(u2));
424}
425
426ArrayObject* dvmCreateStringArray(const char** strings, size_t length)
427{
428    Thread* self = dvmThreadSelf();
429
430    /*
431     * Allocate an array to hold the String objects.
432     */
433    ClassObject* elementClass =
434        dvmFindArrayClassForElement(gDvm.classJavaLangString);
435    ArrayObject* stringArray =
436        dvmAllocArrayByClass(elementClass, length, ALLOC_DEFAULT);
437    if (stringArray == NULL) {
438        /* probably OOM */
439        assert(dvmCheckException(self));
440        return NULL;
441    }
442
443    /*
444     * Create the individual String objects and add them to the array.
445     */
446    for (size_t i = 0; i < length; i++) {
447        Object* str =
448            (Object*) dvmCreateStringFromCstr(strings[i]);
449        if (str == NULL) {
450            /* probably OOM; drop out now */
451            assert(dvmCheckException(self));
452            dvmReleaseTrackedAlloc((Object*) stringArray, self);
453            return NULL;
454        }
455        dvmSetObjectArrayElement(stringArray, i, str);
456        /* stored in tracked array, okay to release */
457        dvmReleaseTrackedAlloc(str, self);
458    }
459
460    return stringArray;
461}
462