2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
17package com.android.dexgen.rop.cst;
19import com.android.dexgen.util.ByteArray;
20import com.android.dexgen.util.Hex;
23 * Constants of type {@code CONSTANT_Utf8_info}.
24 */
25public final class CstUtf8 extends Constant {
26    /**
27     * {@code non-null;} instance representing {@code ""}, that is, the
28     * empty string
29     */
30    public static final CstUtf8 EMPTY_STRING = new CstUtf8("");
32    /** {@code non-null;} the UTF-8 value as a string */
33    private final String string;
35    /** {@code non-null;} the UTF-8 value as bytes */
36    private final ByteArray bytes;
38    /**
39     * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
40     * differs from normal UTF-8 in the handling of character '\0' and
41     * surrogate pairs.
42     *
43     * @param string {@code non-null;} the string to convert
44     * @return {@code non-null;} the UTF-8 bytes for it
45     */
46    public static byte[] stringToUtf8Bytes(String string) {
47        int len = string.length();
48        byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
49        int outAt = 0;
51        for (int i = 0; i < len; i++) {
52            char c = string.charAt(i);
53            if ((c != 0) && (c < 0x80)) {
54                bytes[outAt] = (byte) c;
55                outAt++;
56            } else if (c < 0x800) {
57                bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
58                bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
59                outAt += 2;
60            } else {
61                bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
62                bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
63                bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
64                outAt += 3;
65            }
66        }
68        byte[] result = new byte[outAt];
69        System.arraycopy(bytes, 0, result, 0, outAt);
70        return result;
71    }
73    /**
74     * Converts an array of UTF-8 bytes into a string.
75     *
76     * @param bytes {@code non-null;} the bytes to convert
     * @return {@code non-null;} the converted string
     * @throws IllegalArgumentException if the bytes are not a valid
     * Java-style UTF-8 sequence
78     */
79    public static String utf8BytesToString(ByteArray bytes) {
80        int length = bytes.size();
81        char[] chars = new char[length]; // This is sized to avoid a realloc.
82        int outAt = 0;
84        for (int at = 0; length > 0; /*at*/) {
85            int v0 = bytes.getUnsignedByte(at);
86            char out;
87            switch (v0 >> 4) {
88                case 0x00: case 0x01: case 0x02: case 0x03:
89                case 0x04: case 0x05: case 0x06: case 0x07: {
90                    // 0XXXXXXX -- single-byte encoding
91                    length--;
92                    if (v0 == 0) {
93                        // A single zero byte is illegal.
94                        return throwBadUtf8(v0, at);
95                    }
96                    out = (char) v0;
97                    at++;
98                    break;
99                }
100                case 0x0c: case 0x0d: {
101                    // 110XXXXX -- two-byte encoding
102                    length -= 2;
103                    if (length < 0) {
104                        return throwBadUtf8(v0, at);
105                    }
106                    int v1 = bytes.getUnsignedByte(at + 1);
107                    if ((v1 & 0xc0) != 0x80) {
108                        return throwBadUtf8(v1, at + 1);
109                    }
110                    int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
111                    if ((value != 0) && (value < 0x80)) {
112                        /*
113                         * This should have been represented with
114                         * one-byte encoding.
115                         */
116                        return throwBadUtf8(v1, at + 1);
117                    }
118                    out = (char) value;
119                    at += 2;
120                    break;
121                }
122                case 0x0e: {
123                    // 1110XXXX -- three-byte encoding
124                    length -= 3;
125                    if (length < 0) {
126                        return throwBadUtf8(v0, at);
127                    }
128                    int v1 = bytes.getUnsignedByte(at + 1);
129                    if ((v1 & 0xc0) != 0x80) {
130                        return throwBadUtf8(v1, at + 1);
131                    }
132                    int v2 = bytes.getUnsignedByte(at + 2);
133                    if ((v1 & 0xc0) != 0x80) {
134                        return throwBadUtf8(v2, at + 2);
135                    }
136                    int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
137                        (v2 & 0x3f);
138                    if (value < 0x800) {
139                        /*
140                         * This should have been represented with one- or
141                         * two-byte encoding.
142                         */
143                        return throwBadUtf8(v2, at + 2);
144                    }
145                    out = (char) value;
146                    at += 3;
147                    break;
148                }
149                default: {
150                    // 10XXXXXX, 1111XXXX -- illegal
151                    return throwBadUtf8(v0, at);
152                }
153            }
154            chars[outAt] = out;
155            outAt++;
156        }
158        return new String(chars, 0, outAt);
159    }
161    /**
162     * Helper for {@link #utf8BytesToString}, which throws the right
163     * exception for a bogus utf-8 byte.
164     *
165     * @param value the byte value
     * @param offset the offset of the byte within the bytes being decoded
167     * @return never
168     * @throws IllegalArgumentException always thrown
169     */
170    private static String throwBadUtf8(int value, int offset) {
171        throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
172                                           " at offset " + Hex.u4(offset));
173    }
175    /**
176     * Constructs an instance from a {@code String}.
177     *
178     * @param string {@code non-null;} the UTF-8 value as a string
179     */
180    public CstUtf8(String string) {
181        if (string == null) {
182            throw new NullPointerException("string == null");
183        }
185        this.string = string.intern();
186        this.bytes = new ByteArray(stringToUtf8Bytes(string));
187    }
189    /**
190     * Constructs an instance from some UTF-8 bytes.
191     *
192     * @param bytes {@code non-null;} array of the UTF-8 bytes
193     */
194    public CstUtf8(ByteArray bytes) {
195        if (bytes == null) {
196            throw new NullPointerException("bytes == null");
197        }
199        this.bytes = bytes;
200        this.string = utf8BytesToString(bytes).intern();
201    }
203    /** {@inheritDoc} */
204    @Override
205    public boolean equals(Object other) {
206        if (!(other instanceof CstUtf8)) {
207            return false;
208        }
210        return string.equals(((CstUtf8) other).string);
211    }
213    /** {@inheritDoc} */
214    @Override
215    public int hashCode() {
216        return string.hashCode();
217    }
219    /** {@inheritDoc} */
220    @Override
221    protected int compareTo0(Constant other) {
222        return string.compareTo(((CstUtf8) other).string);
223    }
225    /** {@inheritDoc} */
226    @Override
227    public String toString() {
228        return "utf8{\"" + toHuman() + "\"}";
229    }
231    /** {@inheritDoc} */
232    @Override
233    public String typeName() {
234        return "utf8";
235    }
237    /** {@inheritDoc} */
238    @Override
239    public boolean isCategory2() {
240        return false;
241    }
243    /** {@inheritDoc} */
244    public String toHuman() {
245        int len = string.length();
246        StringBuilder sb = new StringBuilder(len * 3 / 2);
248        for (int i = 0; i < len; i++) {
249            char c = string.charAt(i);
250            if ((c >= ' ') && (c < 0x7f)) {
251                if ((c == '\'') || (c == '\"') || (c == '\\')) {
252                    sb.append('\\');
253                }
254                sb.append(c);
255            } else if (c <= 0x7f) {
256                switch (c) {
257                    case '\n': sb.append("\\n"); break;
258                    case '\r': sb.append("\\r"); break;
259                    case '\t': sb.append("\\t"); break;
260                    default: {
261                        /*
262                         * Represent the character as an octal escape.
263                         * If the next character is a valid octal
264                         * digit, disambiguate by using the
265                         * three-digit form.
266                         */
267                        char nextChar =
268                            (i < (len - 1)) ? string.charAt(i + 1) : 0;
269                        boolean displayZero =
270                            (nextChar >= '0') && (nextChar <= '7');
271                        sb.append('\\');
272                        for (int shift = 6; shift >= 0; shift -= 3) {
273                            char outChar = (char) (((c >> shift) & 7) + '0');
274                            if ((outChar != '0') || displayZero) {
275                                sb.append(outChar);
276                                displayZero = true;
277                            }
278                        }
279                        if (! displayZero) {
280                            // Ironic edge case: The original value was 0.
281                            sb.append('0');
282                        }
283                        break;
284                    }
285                }
286            } else {
287                sb.append("\\u");
288                sb.append(Character.forDigit(c >> 12, 16));
289                sb.append(Character.forDigit((c >> 8) & 0x0f, 16));
290                sb.append(Character.forDigit((c >> 4) & 0x0f, 16));
291                sb.append(Character.forDigit(c & 0x0f, 16));
292            }
293        }
295        return sb.toString();
296    }
298    /**
299     * Gets the value as a human-oriented string, surrounded by double
300     * quotes.
301     *
302     * @return {@code non-null;} the quoted string
303     */
304    public String toQuoted() {
305        return '\"' + toHuman() + '\"';
306    }
308    /**
309     * Gets the value as a human-oriented string, surrounded by double
310     * quotes, but ellipsizes the result if it is longer than the given
311     * maximum length
312     *
313     * @param maxLength {@code >= 5;} the maximum length of the string to return
314     * @return {@code non-null;} the quoted string
315     */
316    public String toQuoted(int maxLength) {
317        String string = toHuman();
318        int length = string.length();
319        String ellipses;
321        if (length <= (maxLength - 2)) {
322            ellipses = "";
323        } else {
324            string = string.substring(0, maxLength - 5);
325            ellipses = "...";
326        }
328        return '\"' + string + ellipses + '\"';
329    }
331    /**
332     * Gets the UTF-8 value as a string.
333     * The returned string is always already interned.
334     *
335     * @return {@code non-null;} the UTF-8 value as a string
336     */
337    public String getString() {
338        return string;
339    }
341    /**
342     * Gets the UTF-8 value as UTF-8 encoded bytes.
343     *
344     * @return {@code non-null;} an array of the UTF-8 bytes
345     */
346    public ByteArray getBytes() {
347        return bytes;
348    }
350    /**
     * Gets the size of this instance as UTF-8 code units. That is,
     * get the number of bytes in the UTF-8 encoding of this instance.
353     *
354     * @return {@code >= 0;} the UTF-8 size
355     */
356    public int getUtf8Size() {
357        return bytes.size();
358    }
360    /**
     * Gets the size of this instance as UTF-16 code units. That is,
362     * get the number of 16-bit chars in the UTF-16 encoding of this
363     * instance. This is the same as the {@code length} of the
364     * Java {@code String} representation of this instance.
365     *
366     * @return {@code >= 0;} the UTF-16 size
367     */
368    public int getUtf16Size() {
369        return string.length();
370    }