1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * Validate and manipulate MUTF-8 encoded string data.
19 */
20
21#include "DexUtf.h"
22
/*
 * Order two '\0'-terminated modified UTF-8 strings.
 *
 * Both strings are decoded one character at a time with
 * dexGetUtf16FromUtf8() and the decoded 16-bit values are compared, so
 * two different byte encodings of the same value compare as equal. Only
 * a literal '\0' byte is treated as the end of a string; an encoded
 * zero is compared like any other character. When one string is a
 * proper prefix of the other, the shorter string sorts first.
 *
 * Returns zero if the strings are equal, a negative value if s1 sorts
 * before s2, and a positive value if s1 sorts after s2 (the same
 * convention as strcmp()).
 */
int dexUtf8Cmp(const char* s1, const char* s2) {
    // Walk both strings in lock step until at least one of them ends.
    while ((*s1 != '\0') && (*s2 != '\0')) {
        int left = dexGetUtf16FromUtf8(&s1);
        int right = dexGetUtf16FromUtf8(&s2);

        if (left != right) {
            return left - right;
        }
    }

    if (*s1 != '\0') {
        // Only s2 ran out, so s1 is the longer string.
        return 1;
    }

    // s1 ended; the strings are equal only if s2 ended here as well.
    return (*s2 == '\0') ? 0 : -1;
}
48
49/* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
50u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
51    0x00000000, // 00..1f low control characters; nothing valid
52    0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
53    0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
54    0x07fffffe  // 60..7f lowercase etc.; valid: 'a'..'z'
55};
56
57/* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
58bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
59    /*
60     * It's a multibyte encoded character. Decode it and analyze. We
61     * accept anything that isn't (a) an improperly encoded low value,
62     * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
63     * control character, or (e) a high space, layout, or special
64     * character (U+00a0, U+2000..U+200f, U+2028..U+202f,
65     * U+fff0..U+ffff). This is all specified in the dex format
66     * document.
67     */
68
69    u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
70
71    // Perform follow-up tests based on the high 8 bits.
72    switch (utf16 >> 8) {
73        case 0x00: {
74            // It's only valid if it's above the ISO-8859-1 high space (0xa0).
75            return (utf16 > 0x00a0);
76        }
77        case 0xd8:
78        case 0xd9:
79        case 0xda:
80        case 0xdb: {
81            /*
82             * It's a leading surrogate. Check to see that a trailing
83             * surrogate follows.
84             */
85            utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
86            return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
87        }
88        case 0xdc:
89        case 0xdd:
90        case 0xde:
91        case 0xdf: {
92            // It's a trailing surrogate, which is not valid at this point.
93            return false;
94        }
95        case 0x20:
96        case 0xff: {
97            // It's in the range that has spaces, controls, and specials.
98            switch (utf16 & 0xfff8) {
99                case 0x2000:
100                case 0x2008:
101                case 0x2028:
102                case 0xfff0:
103                case 0xfff8: {
104                    return false;
105                }
106            }
107            break;
108        }
109    }
110
111    return true;
112}
113
/*
 * Return whether the given string is a valid field or method name.
 *
 * A valid name is either a non-empty run of characters that each pass
 * dexIsValidMemberNameUtf8(), or such a run enclosed in a single pair of
 * angle brackets (for example "<init>"). The empty string, "<", and
 * "<>" are all rejected because they contain no name characters.
 */
bool dexIsValidMemberName(const char* s) {
    bool angleName = false;

    if (*s == '<') {
        /*
         * '<' is allowed only at the start of a name, and if present,
         * means that the name must end with '>'.
         */
        angleName = true;
        s++;
    }

    if ((*s == '\0') || (*s == '>')) {
        /*
         * There must be at least one name character, both for a plain
         * name and between the angle brackets. Without this check "<>"
         * would be accepted by the loop below.
         */
        return false;
    }

    for (;;) {
        switch (*s) {
            case '\0': {
                // End of string: only okay if no closing '>' is pending.
                return !angleName;
            }
            case '>': {
                // '>' must close an opening '<' and must be the last byte.
                return angleName && s[1] == '\0';
            }
        }
        // Validates one character and advances s past it.
        if (!dexIsValidMemberNameUtf8(&s)) {
            return false;
        }
    }
}
148
149/* Helper for validating type descriptors and class names, which is parametric
150 * with respect to type vs. class and dot vs. slash. */
151static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
152        bool dotSeparator) {
153    int arrayCount = 0;
154
155    while (*s == '[') {
156        arrayCount++;
157        s++;
158    }
159
160    if (arrayCount > 255) {
161        // Arrays may have no more than 255 dimensions.
162        return false;
163    }
164
165    if (arrayCount != 0) {
166        /*
167         * If we're looking at an array of some sort, then it doesn't
168         * matter if what is being asked for is a class name; the
169         * format looks the same as a type descriptor in that case, so
170         * treat it as such.
171         */
172        isClassName = false;
173    }
174
175    if (!isClassName) {
176        /*
177         * We are looking for a descriptor. Either validate it as a
178         * single-character primitive type, or continue on to check the
179         * embedded class name (bracketed by "L" and ";").
180         */
181        switch (*(s++)) {
182            case 'B':
183            case 'C':
184            case 'D':
185            case 'F':
186            case 'I':
187            case 'J':
188            case 'S':
189            case 'Z': {
190                // These are all single-character descriptors for primitive types.
191                return (*s == '\0');
192            }
193            case 'V': {
194                // Non-array void is valid, but you can't have an array of void.
195                return (arrayCount == 0) && (*s == '\0');
196            }
197            case 'L': {
198                // Class name: Break out and continue below.
199                break;
200            }
201            default: {
202                // Oddball descriptor character.
203                return false;
204            }
205        }
206    }
207
208    /*
209     * We just consumed the 'L' that introduces a class name as part
210     * of a type descriptor, or we are looking for an unadorned class
211     * name.
212     */
213
214    bool sepOrFirst = true; // first character or just encountered a separator.
215    for (;;) {
216        u1 c = (u1) *s;
217        switch (c) {
218            case '\0': {
219                /*
220                 * Premature end for a type descriptor, but valid for
221                 * a class name as long as we haven't encountered an
222                 * empty component (including the degenerate case of
223                 * the empty string "").
224                 */
225                return isClassName && !sepOrFirst;
226            }
227            case ';': {
228                /*
229                 * Invalid character for a class name, but the
230                 * legitimate end of a type descriptor. In the latter
231                 * case, make sure that this is the end of the string
232                 * and that it doesn't end with an empty component
233                 * (including the degenerate case of "L;").
234                 */
235                return !isClassName && !sepOrFirst && (s[1] == '\0');
236            }
237            case '/':
238            case '.': {
239                if (dotSeparator != (c == '.')) {
240                    // The wrong separator character.
241                    return false;
242                }
243                if (sepOrFirst) {
244                    // Separator at start or two separators in a row.
245                    return false;
246                }
247                sepOrFirst = true;
248                s++;
249                break;
250            }
251            default: {
252                if (!dexIsValidMemberNameUtf8(&s)) {
253                    return false;
254                }
255                sepOrFirst = false;
256                break;
257            }
258        }
259    }
260}
261
/*
 * Return whether the given string is a valid type descriptor: a
 * primitive type letter, "V", or a class name of the form "L...;" whose
 * components are separated by '/', optionally preceded by array
 * dimensions ('['). Arrays of void are not valid.
 */
bool dexIsValidTypeDescriptor(const char* s) {
    const bool asClassName = false;  // validate descriptor syntax
    const bool useDots = false;      // components are separated by '/'

    return isValidTypeDescriptorOrClassName(s, asClassName, useDots);
}
266
/*
 * Return whether the given string is a valid class name. Components
 * must be separated by '.' when dotSeparator is true and by '/'
 * otherwise. A string that starts with '[' is validated as an array
 * type descriptor instead, using the same separator rule.
 */
bool dexIsValidClassName(const char* s, bool dotSeparator) {
    const bool asClassName = true;  // validate an unadorned class name

    return isValidTypeDescriptorOrClassName(s, asClassName, dotSeparator);
}
271
/*
 * Return whether the given string is a valid reference descriptor,
 * that is, a string accepted by dexIsValidTypeDescriptor() that
 * describes a class ('L' prefix) or an array ('[' prefix) rather than a
 * primitive type.
 */
bool dexIsReferenceDescriptor(const char* s) {
    return dexIsValidTypeDescriptor(s)
            && ((s[0] == 'L') || (s[0] == '['));
}
282
/*
 * Return whether the given string is a valid class descriptor, that
 * is, a string accepted by dexIsValidTypeDescriptor() that describes a
 * class ('L' prefix) and not an array or a primitive type.
 */
bool dexIsClassDescriptor(const char* s) {
    if (dexIsValidTypeDescriptor(s)) {
        // Well-formed; it names a class only if it starts with 'L'.
        return s[0] == 'L';
    }

    return false;
}
293
/*
 * Return whether the given string is a valid field type descriptor,
 * that is, a string accepted by dexIsValidTypeDescriptor() that
 * describes anything other than "void".
 */
bool dexIsFieldDescriptor(const char* s) {
    bool wellFormed = dexIsValidTypeDescriptor(s);

    // A well-formed descriptor beginning with 'V' can only be "void".
    return wellFormed && (s[0] != 'V');
}
304
305