1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
19 * functions.
20 *
21 * In most cases we populate the fields in the String object directly,
22 * rather than going through an instance field lookup.
23 */
24#include "Dalvik.h"
25#include <stdlib.h>
26
27/*
28 * Allocate a new instance of the class String, performing first-use
29 * initialization of the class if necessary. Upon success, the
30 * returned value will have all its fields except hashCode already
31 * filled in, including a reference to a newly-allocated char[] for
32 * the contents, sized as given. Additionally, a reference to the
33 * chars array is stored to the pChars pointer. Callers must
34 * subsequently call dvmReleaseTrackedAlloc() on the result pointer.
35 * This function returns NULL on failure.
36 */
37static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars)
38{
39    /*
40     * The String class should have already gotten found (but not
41     * necessarily initialized) before making it here. We assert it
42     * explicitly, since historically speaking, we have had bugs with
43     * regard to when the class String gets set up. The assert helps
44     * make any regressions easier to diagnose.
45     */
46    assert(gDvm.classJavaLangString != NULL);
47
48    if (!dvmIsClassInitialized(gDvm.classJavaLangString)) {
49        /* Perform first-time use initialization of the class. */
50        if (!dvmInitClass(gDvm.classJavaLangString)) {
51            ALOGE("FATAL: Could not initialize class String");
52            dvmAbort();
53        }
54    }
55
56    Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT);
57    if (result == NULL) {
58        return NULL;
59    }
60
61    ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT);
62    if (chars == NULL) {
63        dvmReleaseTrackedAlloc(result, NULL);
64        return NULL;
65    }
66
67    dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength);
68    dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars);
69    dvmReleaseTrackedAlloc((Object*) chars, NULL);
70    /* Leave offset and hashCode set to zero. */
71
72    *pChars = chars;
73    return (StringObject*) result;
74}
75
76/*
77 * Compute a hash code on a UTF-8 string, for use with internal hash tables.
78 *
79 * This may or may not yield the same results as the java/lang/String
80 * computeHashCode() function.  (To make sure this doesn't get abused,
81 * I'm initializing the hash code to 1 so they *don't* match up.)
82 *
83 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
84 * the hash with the result.  That way, if something encoded the same
85 * character in two different ways, the hash value would be the same.  For
86 * our purposes that isn't necessary.
87 */
88u4 dvmComputeUtf8Hash(const char* utf8Str)
89{
90    u4 hash = 1;
91
92    while (*utf8Str != '\0')
93        hash = hash * 31 + *utf8Str++;
94
95    return hash;
96}
97
/*
 * "strlen" for strings encoded with "modified" UTF-8.
 *
 * Returns the number of encoded characters, which can be smaller than
 * the number of bytes. Only the lead byte of each sequence is examined:
 * bit 0x80 marks a multi-byte sequence and bit 0x20 distinguishes the
 * three-byte form from the two-byte form. Trailing bytes are skipped
 * without being inspected, so the input must be well-formed.
 *
 * (Possible optimization: mask the lead byte against 0xa0, shift right
 * by 5, and look the increment {1-3} up in an 8-entry table.)
 */
size_t dvmUtf8Len(const char* utf8Str)
{
    size_t count = 0;

    for (;;) {
        int lead = *utf8Str++;
        if (lead == '\0') {
            break;
        }
        count++;

        if ((lead & 0x80) == 0) {
            /* single-byte encoding, nothing more to skip */
            continue;
        }

        /* step over one trailing byte, or two for the three-byte form */
        utf8Str += ((lead & 0x20) != 0) ? 2 : 1;
    }

    return count;
}
126
127/*
128 * Convert a "modified" UTF-8 string to UTF-16.
129 */
130void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
131{
132    while (*utf8Str != '\0')
133        *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
134}
135
136/*
137 * Given a UTF-16 string, compute the length of the corresponding UTF-8
138 * string in bytes.
139 */
140static int utf16_utf8ByteLen(const u2* utf16Str, int len)
141{
142    int utf8Len = 0;
143
144    while (len--) {
145        unsigned int uic = *utf16Str++;
146
147        /*
148         * The most common case is (uic > 0 && uic <= 0x7f).
149         */
150        if (uic == 0 || uic > 0x7f) {
151            if (uic > 0x07ff)
152                utf8Len += 3;
153            else /*(uic > 0x7f || uic == 0) */
154                utf8Len += 2;
155        } else
156            utf8Len++;
157    }
158    return utf8Len;
159}
160
161/*
162 * Convert a UTF-16 string to UTF-8.
163 *
164 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
165 * not just "len".
166 */
167static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
168{
169    assert(len >= 0);
170
171    while (len--) {
172        unsigned int uic = *utf16Str++;
173
174        /*
175         * The most common case is (uic > 0 && uic <= 0x7f).
176         */
177        if (uic == 0 || uic > 0x7f) {
178            if (uic > 0x07ff) {
179                *utf8Str++ = (uic >> 12) | 0xe0;
180                *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
181                *utf8Str++ = (uic & 0x3f) | 0x80;
182            } else /*(uic > 0x7f || uic == 0)*/ {
183                *utf8Str++ = (uic >> 6) | 0xc0;
184                *utf8Str++ = (uic & 0x3f) | 0x80;
185            }
186        } else {
187            *utf8Str++ = uic;
188        }
189    }
190
191    *utf8Str = '\0';
192}
193
194/*
195 * Use the java/lang/String.computeHashCode() algorithm.
196 */
197static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len)
198{
199    u4 hash = 0;
200
201    while (len--)
202        hash = hash * 31 + *utf16Str++;
203
204    return hash;
205}
206
207u4 dvmComputeStringHash(StringObject* strObj) {
208    int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE);
209    if (hashCode != 0) {
210      return hashCode;
211    }
212    int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT);
213    int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET);
214    ArrayObject* chars =
215            (ArrayObject*) dvmGetFieldObject(strObj, STRING_FIELDOFF_VALUE);
216    hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len);
217    dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode);
218    return hashCode;
219}
220
221StringObject* dvmCreateStringFromCstr(const char* utf8Str) {
222    assert(utf8Str != NULL);
223    return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str));
224}
225
226StringObject* dvmCreateStringFromCstr(const std::string& utf8Str) {
227    return dvmCreateStringFromCstr(utf8Str.c_str());
228}
229
230/*
231 * Create a java/lang/String from a C string, given its UTF-16 length
232 * (number of UTF-16 code points).
233 *
234 * The caller must call dvmReleaseTrackedAlloc() on the return value.
235 *
236 * Returns NULL and throws an exception on failure.
237 */
238StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
239    size_t utf16Length)
240{
241    assert(utf8Str != NULL);
242
243    ArrayObject* chars;
244    StringObject* newObj = makeStringObject(utf16Length, &chars);
245    if (newObj == NULL) {
246        return NULL;
247    }
248
249    dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str);
250
251    u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length);
252    dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode);
253
254    return newObj;
255}
256
257/*
258 * Create a new java/lang/String object, using the given Unicode data.
259 */
260StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
261{
262    /* We allow a NULL pointer if the length is zero. */
263    assert(len == 0 || unichars != NULL);
264
265    ArrayObject* chars;
266    StringObject* newObj = makeStringObject(len, &chars);
267    if (newObj == NULL) {
268        return NULL;
269    }
270
271    if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2));
272
273    u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len);
274    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
275
276    return newObj;
277}
278
279/*
280 * Create a new C string from a java/lang/String object.
281 *
282 * Returns NULL if the object is NULL.
283 */
284char* dvmCreateCstrFromString(const StringObject* jstr)
285{
286    assert(gDvm.classJavaLangString != NULL);
287    if (jstr == NULL) {
288        return NULL;
289    }
290
291    int len = dvmGetFieldInt(jstr, STRING_FIELDOFF_COUNT);
292    int offset = dvmGetFieldInt(jstr, STRING_FIELDOFF_OFFSET);
293    ArrayObject* chars =
294            (ArrayObject*) dvmGetFieldObject(jstr, STRING_FIELDOFF_VALUE);
295    const u2* data = (const u2*)(void*)chars->contents + offset;
296    assert(offset + len <= (int) chars->length);
297
298    int byteLen = utf16_utf8ByteLen(data, len);
299    char* newStr = (char*) malloc(byteLen+1);
300    if (newStr == NULL) {
301        return NULL;
302    }
303    convertUtf16ToUtf8(newStr, data, len);
304
305    return newStr;
306}
307
308void dvmGetStringUtfRegion(const StringObject* jstr,
309        int start, int len, char* buf)
310{
311    const u2* data = jstr->chars() + start;
312    convertUtf16ToUtf8(buf, data, len);
313}
314
315int StringObject::utfLength() const
316{
317    assert(gDvm.classJavaLangString != NULL);
318
319    int len = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
320    int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
321    ArrayObject* chars =
322            (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
323    const u2* data = (const u2*)(void*)chars->contents + offset;
324    assert(offset + len <= (int) chars->length);
325
326    return utf16_utf8ByteLen(data, len);
327}
328
329int StringObject::length() const
330{
331    return dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
332}
333
334ArrayObject* StringObject::array() const
335{
336    return (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
337}
338
339const u2* StringObject::chars() const
340{
341    int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
342    ArrayObject* chars =
343            (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
344    return (const u2*)(void*)chars->contents + offset;
345}
346
347
348/*
349 * Compare two String objects.
350 *
351 * This is a dvmHashTableLookup() callback.  The function has already
352 * compared their hash values; we need to do a full compare to ensure
353 * that the strings really match.
354 */
355int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
356{
357    const StringObject* strObj1 = (const StringObject*) vstrObj1;
358    const StringObject* strObj2 = (const StringObject*) vstrObj2;
359
360    assert(gDvm.classJavaLangString != NULL);
361
362    /* get offset and length into char array; all values are in 16-bit units */
363    int len1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_COUNT);
364    int offset1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_OFFSET);
365    int len2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_COUNT);
366    int offset2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_OFFSET);
367    if (len1 != len2) {
368        return len1 - len2;
369    }
370
371    ArrayObject* chars1 =
372            (ArrayObject*) dvmGetFieldObject(strObj1, STRING_FIELDOFF_VALUE);
373    ArrayObject* chars2 =
374            (ArrayObject*) dvmGetFieldObject(strObj2, STRING_FIELDOFF_VALUE);
375
376    /* damage here actually indicates a broken java/lang/String */
377    assert(offset1 + len1 <= (int) chars1->length);
378    assert(offset2 + len2 <= (int) chars2->length);
379
380    return memcmp((const u2*)(void*)chars1->contents + offset1,
381                  (const u2*)(void*)chars2->contents + offset2,
382                  len1 * sizeof(u2));
383}
384
385ArrayObject* dvmCreateStringArray(const std::vector<std::string>& strings) {
386    Thread* self = dvmThreadSelf();
387
388    // Allocate an array to hold the String objects.
389    ClassObject* elementClass = dvmFindArrayClassForElement(gDvm.classJavaLangString);
390    ArrayObject* stringArray = dvmAllocArrayByClass(elementClass, strings.size(), ALLOC_DEFAULT);
391    if (stringArray == NULL) {
392        // Probably OOM.
393        assert(dvmCheckException(self));
394        return NULL;
395    }
396
397    // Create the individual String objects and add them to the array.
398    for (size_t i = 0; i < strings.size(); i++) {
399        Object* str = (Object*) dvmCreateStringFromCstr(strings[i]);
400        if (str == NULL) {
401            // Probably OOM; drop out now.
402            assert(dvmCheckException(self));
403            dvmReleaseTrackedAlloc((Object*) stringArray, self);
404            return NULL;
405        }
406        dvmSetObjectArrayElement(stringArray, i, str);
407        /* stored in tracked array, okay to release */
408        dvmReleaseTrackedAlloc(str, self);
409    }
410
411    return stringArray;
412}
413