Utf8Utils.java revision bcc4d2d9e186b00386cba334a31b0f9ebffd299a
1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * As per the Apache license requirements, this file has been modified
19 * from its original state.
20 *
21 * Such modifications are Copyright (C) 2010 Ben Gruver, and are released
22 * under the original license
23 */
24
25package org.jf.dexlib.Util;
26
27import java.io.IOException;
28import java.io.Writer;
29
30/**
31 * Constants of type <code>CONSTANT_Utf8_info</code>.
32 */
33public final class Utf8Utils {
34
35
36    /**
37     * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
38     * differs from normal UTF-8 in the handling of character '\0' and
39     * surrogate pairs.
40     *
41     * @param string non-null; the string to convert
42     * @return non-null; the UTF-8 bytes for it
43     */
44    public static byte[] stringToUtf8Bytes(String string) {
45        int len = string.length();
46        byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
47        int outAt = 0;
48
49        for (int i = 0; i < len; i++) {
50            char c = string.charAt(i);
51            if ((c != 0) && (c < 0x80)) {
52                bytes[outAt] = (byte) c;
53                outAt++;
54            } else if (c < 0x800) {
55                bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
56                bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
57                outAt += 2;
58            } else {
59                bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
60                bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
61                bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
62                outAt += 3;
63            }
64        }
65
66        byte[] result = new byte[outAt];
67        System.arraycopy(bytes, 0, result, 0, outAt);
68        return result;
69    }
70
71    private static char[] tempBuffer = null;
72
73    /**
74     * Converts an array of UTF-8 bytes into a string.
75     *
76     * This method uses a global buffer to avoid having to allocate one every time, so it is *not* thread-safe
77     *
78     * @param bytes non-null; the bytes to convert
79     * @param start the start index of the utf8 string to convert
80     * @param length the length of the utf8 string to convert, not including any null-terminator that might be present
81     * @return non-null; the converted string
82     */
83    public static String utf8BytesToString(byte[] bytes, int start, int length) {
84        if (tempBuffer == null || tempBuffer.length < length) {
85            tempBuffer = new char[length];
86        }
87        char[] chars = tempBuffer;
88        int outAt = 0;
89
90        for (int at = start; length > 0; /*at*/) {
91            int v0 = bytes[at] & 0xFF;
92            char out;
93            switch (v0 >> 4) {
94                case 0x00: case 0x01: case 0x02: case 0x03:
95                case 0x04: case 0x05: case 0x06: case 0x07: {
96                    // 0XXXXXXX -- single-byte encoding
97                    length--;
98                    if (v0 == 0) {
99                        // A single zero byte is illegal.
100                        return throwBadUtf8(v0, at);
101                    }
102                    out = (char) v0;
103                    at++;
104                    break;
105                }
106                case 0x0c: case 0x0d: {
107                    // 110XXXXX -- two-byte encoding
108                    length -= 2;
109                    if (length < 0) {
110                        return throwBadUtf8(v0, at);
111                    }
112                    int v1 = bytes[at + 1] & 0xFF;
113                    if ((v1 & 0xc0) != 0x80) {
114                        return throwBadUtf8(v1, at + 1);
115                    }
116                    int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
117                    if ((value != 0) && (value < 0x80)) {
118                        /*
119                         * This should have been represented with
120                         * one-byte encoding.
121                         */
122                        return throwBadUtf8(v1, at + 1);
123                    }
124                    out = (char) value;
125                    at += 2;
126                    break;
127                }
128                case 0x0e: {
129                    // 1110XXXX -- three-byte encoding
130                    length -= 3;
131                    if (length < 0) {
132                        return throwBadUtf8(v0, at);
133                    }
134                    int v1 = bytes[at + 1] & 0xFF;
135                    if ((v1 & 0xc0) != 0x80) {
136                        return throwBadUtf8(v1, at + 1);
137                    }
138                    int v2 = bytes[at + 2] & 0xFF;
139                    if ((v1 & 0xc0) != 0x80) {
140                        return throwBadUtf8(v2, at + 2);
141                    }
142                    int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
143                        (v2 & 0x3f);
144                    if (value < 0x800) {
145                        /*
146                         * This should have been represented with one- or
147                         * two-byte encoding.
148                         */
149                        return throwBadUtf8(v2, at + 2);
150                    }
151                    out = (char) value;
152                    at += 3;
153                    break;
154                }
155                default: {
156                    // 10XXXXXX, 1111XXXX -- illegal
157                    return throwBadUtf8(v0, at);
158                }
159            }
160            chars[outAt] = out;
161            outAt++;
162        }
163
164        return new String(chars, 0, outAt);
165    }
166
167    /**
168     * Helper for {@link #utf8BytesToString}, which throws the right
169     * exception for a bogus utf-8 byte.
170     *
171     * @param value the byte value
172     * @param offset the file offset
173     * @return never
174     * @throws IllegalArgumentException always thrown
175     */
176    private static String throwBadUtf8(int value, int offset) {
177        throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
178                                           " at offset " + Hex.u4(offset));
179    }
180
181    public static void writeEscapedChar(Writer writer, char c) throws IOException {
182        if ((c >= ' ') && (c < 0x7f)) {
183            if ((c == '\'') || (c == '\"') || (c == '\\')) {
184                writer.write('\\');
185            }
186            writer.write(c);
187            return;
188        } else if (c <= 0x7f) {
189            switch (c) {
190                case '\n': writer.write("\\n"); return;
191                case '\r': writer.write("\\r"); return;
192                case '\t': writer.write("\\t"); return;
193            }
194        }
195
196        writer.write("\\u");
197        writer.write(Character.forDigit(c >> 12, 16));
198        writer.write(Character.forDigit((c >> 8) & 0x0f, 16));
199        writer.write(Character.forDigit((c >> 4) & 0x0f, 16));
200        writer.write(Character.forDigit(c & 0x0f, 16));
201
202    }
203
204    public static void writeEscapedString(Writer writer, String value) throws IOException {
205        for (int i = 0; i < value.length(); i++) {
206            char c = value.charAt(i);
207
208            if ((c >= ' ') && (c < 0x7f)) {
209                if ((c == '\'') || (c == '\"') || (c == '\\')) {
210                    writer.write('\\');
211                }
212                writer.write(c);
213                continue;
214            } else if (c <= 0x7f) {
215                switch (c) {
216                    case '\n': writer.write("\\n"); continue;
217                    case '\r': writer.write("\\r"); continue;
218                    case '\t': writer.write("\\t"); continue;
219                }
220            }
221
222            writer.write("\\u");
223            writer.write(Character.forDigit(c >> 12, 16));
224            writer.write(Character.forDigit((c >> 8) & 0x0f, 16));
225            writer.write(Character.forDigit((c >> 4) & 0x0f, 16));
226            writer.write(Character.forDigit(c & 0x0f, 16));
227        }
228    }
229
230    public static String escapeString(String value) {
231        int len = value.length();
232        StringBuilder sb = new StringBuilder(len * 3 / 2);
233
234        for (int i = 0; i < len; i++) {
235            char c = value.charAt(i);
236
237            if ((c >= ' ') && (c < 0x7f)) {
238                if ((c == '\'') || (c == '\"') || (c == '\\')) {
239                    sb.append('\\');
240                }
241                sb.append(c);
242                continue;
243            } else if (c <= 0x7f) {
244                switch (c) {
245                    case '\n': sb.append("\\n"); continue;
246                    case '\r': sb.append("\\r"); continue;
247                    case '\t': sb.append("\\t"); continue;
248                }
249            }
250
251            sb.append("\\u");
252            sb.append(Character.forDigit(c >> 12, 16));
253            sb.append(Character.forDigit((c >> 8) & 0x0f, 16));
254            sb.append(Character.forDigit((c >> 4) & 0x0f, 16));
255            sb.append(Character.forDigit(c & 0x0f, 16));
256        }
257
258        return sb.toString();
259    }
260}
261