1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * As per the Apache license requirements, this file has been modified
19 * from its original state.
20 *
21 * Such modifications are Copyright (C) 2010 Ben Gruver, and are released
22 * under the original license
23 */
24
25package org.jf.util;
26
27import javax.annotation.Nonnull;
28import javax.annotation.Nullable;
29
30/**
31 * Constants of type <code>CONSTANT_Utf8_info</code>.
32 */
33public final class Utf8Utils {
34    /**
35     * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
36     * differs from normal UTF-8 in the handling of character '\0' and
37     * surrogate pairs.
38     *
39     * @param string non-null; the string to convert
40     * @return non-null; the UTF-8 bytes for it
41     */
42    public static byte[] stringToUtf8Bytes(String string) {
43        int len = string.length();
44        byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
45        int outAt = 0;
46
47        for (int i = 0; i < len; i++) {
48            char c = string.charAt(i);
49            if ((c != 0) && (c < 0x80)) {
50                bytes[outAt] = (byte) c;
51                outAt++;
52            } else if (c < 0x800) {
53                bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
54                bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
55                outAt += 2;
56            } else {
57                bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
58                bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
59                bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
60                outAt += 3;
61            }
62        }
63
64        byte[] result = new byte[outAt];
65        System.arraycopy(bytes, 0, result, 0, outAt);
66        return result;
67    }
68
69    private static final ThreadLocal<char[]> localBuffer =
70            new ThreadLocal<char[]> () {
71                @Override protected char[] initialValue() {
72                    // A reasonably sized initial value
73                    return new char[256];
74                }
75            };
76
77    /**
78     * Converts an array of UTF-8 bytes into a string.
79     *
80     * @param bytes non-null; the bytes to convert
81     * @param start the start index of the utf8 string to convert
82     * @param length the length of the utf8 string to convert, not including any null-terminator that might be present
83     * @return non-null; the converted string
84     */
85    public static String utf8BytesToString(byte[] bytes, int start, int length) {
86        char[] chars = localBuffer.get();
87        if (chars == null || chars.length < length) {
88            chars = new char[length];
89            localBuffer.set(chars);
90        }
91        int outAt = 0;
92
93        for (int at = start; length > 0; /*at*/) {
94            int v0 = bytes[at] & 0xFF;
95            char out;
96            switch (v0 >> 4) {
97                case 0x00: case 0x01: case 0x02: case 0x03:
98                case 0x04: case 0x05: case 0x06: case 0x07: {
99                    // 0XXXXXXX -- single-byte encoding
100                    length--;
101                    if (v0 == 0) {
102                        // A single zero byte is illegal.
103                        return throwBadUtf8(v0, at);
104                    }
105                    out = (char) v0;
106                    at++;
107                    break;
108                }
109                case 0x0c: case 0x0d: {
110                    // 110XXXXX -- two-byte encoding
111                    length -= 2;
112                    if (length < 0) {
113                        return throwBadUtf8(v0, at);
114                    }
115                    int v1 = bytes[at + 1] & 0xFF;
116                    if ((v1 & 0xc0) != 0x80) {
117                        return throwBadUtf8(v1, at + 1);
118                    }
119                    int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
120                    if ((value != 0) && (value < 0x80)) {
121                        /*
122                         * This should have been represented with
123                         * one-byte encoding.
124                         */
125                        return throwBadUtf8(v1, at + 1);
126                    }
127                    out = (char) value;
128                    at += 2;
129                    break;
130                }
131                case 0x0e: {
132                    // 1110XXXX -- three-byte encoding
133                    length -= 3;
134                    if (length < 0) {
135                        return throwBadUtf8(v0, at);
136                    }
137                    int v1 = bytes[at + 1] & 0xFF;
138                    if ((v1 & 0xc0) != 0x80) {
139                        return throwBadUtf8(v1, at + 1);
140                    }
141                    int v2 = bytes[at + 2] & 0xFF;
142                    if ((v2 & 0xc0) != 0x80) {
143                        return throwBadUtf8(v2, at + 2);
144                    }
145                    int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
146                            (v2 & 0x3f);
147                    if (value < 0x800) {
148                        /*
149                         * This should have been represented with one- or
150                         * two-byte encoding.
151                         */
152                        return throwBadUtf8(v2, at + 2);
153                    }
154                    out = (char) value;
155                    at += 3;
156                    break;
157                }
158                default: {
159                    // 10XXXXXX, 1111XXXX -- illegal
160                    return throwBadUtf8(v0, at);
161                }
162            }
163            chars[outAt] = out;
164            outAt++;
165        }
166
167        return new String(chars, 0, outAt);
168    }
169
170    /**
171     * Converts an array of UTF-8 bytes into a string.
172     *
173     * @param bytes non-null; the bytes to convert
174     * @param start the start index of the utf8 string to convert
175     * @param utf16Length the number of utf16 characters in the string to decode
176     * @return non-null; the converted string
177     */
178    public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length) {
179        return utf8BytesWithUtf16LengthToString(bytes, start, utf16Length, null);
180    }
181
182    /**
183     * Converts an array of UTF-8 bytes into a string.
184     *
185     * @param bytes non-null; the bytes to convert
186     * @param start the start index of the utf8 string to convert
187     * @param utf16Length the number of utf16 characters in the string to decode
188     * @param readLength If non-null, the first element will contain the number of bytes read after the method exits
189     * @return non-null; the converted string
190     */
191    public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length,
192                                                          @Nullable int[] readLength) {
193        char[] chars = localBuffer.get();
194        if (chars == null || chars.length < utf16Length) {
195            chars = new char[utf16Length];
196            localBuffer.set(chars);
197        }
198        int outAt = 0;
199
200        int at = 0;
201        for (at = start; utf16Length > 0; utf16Length--) {
202            int v0 = bytes[at] & 0xFF;
203            char out;
204            switch (v0 >> 4) {
205                case 0x00: case 0x01: case 0x02: case 0x03:
206                case 0x04: case 0x05: case 0x06: case 0x07: {
207                    // 0XXXXXXX -- single-byte encoding
208                    if (v0 == 0) {
209                        // A single zero byte is illegal.
210                        return throwBadUtf8(v0, at);
211                    }
212                    out = (char) v0;
213                    at++;
214                    break;
215                }
216                case 0x0c: case 0x0d: {
217                    // 110XXXXX -- two-byte encoding
218                    int v1 = bytes[at + 1] & 0xFF;
219                    if ((v1 & 0xc0) != 0x80) {
220                        return throwBadUtf8(v1, at + 1);
221                    }
222                    int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
223                    if ((value != 0) && (value < 0x80)) {
224                        /*
225                         * This should have been represented with
226                         * one-byte encoding.
227                         */
228                        return throwBadUtf8(v1, at + 1);
229                    }
230                    out = (char) value;
231                    at += 2;
232                    break;
233                }
234                case 0x0e: {
235                    // 1110XXXX -- three-byte encoding
236                    int v1 = bytes[at + 1] & 0xFF;
237                    if ((v1 & 0xc0) != 0x80) {
238                        return throwBadUtf8(v1, at + 1);
239                    }
240                    int v2 = bytes[at + 2] & 0xFF;
241                    if ((v2 & 0xc0) != 0x80) {
242                        return throwBadUtf8(v2, at + 2);
243                    }
244                    int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
245                        (v2 & 0x3f);
246                    if (value < 0x800) {
247                        /*
248                         * This should have been represented with one- or
249                         * two-byte encoding.
250                         */
251                        return throwBadUtf8(v2, at + 2);
252                    }
253                    out = (char) value;
254                    at += 3;
255                    break;
256                }
257                default: {
258                    // 10XXXXXX, 1111XXXX -- illegal
259                    return throwBadUtf8(v0, at);
260                }
261            }
262            chars[outAt] = out;
263            outAt++;
264        }
265
266        if (readLength != null && readLength.length > 0) {
267            readLength[0] = at - start;
268            readLength[0] = at - start;
269        }
270        return new String(chars, 0, outAt);
271    }
272
273    /**
274     * Helper for {@link #utf8BytesToString}, which throws the right
275     * exception for a bogus utf-8 byte.
276     *
277     * @param value the byte value
278     * @param offset the file offset
279     * @return never
280     * @throws IllegalArgumentException always thrown
281     */
282    private static String throwBadUtf8(int value, int offset) {
283        throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
284                                           " at offset " + Hex.u4(offset));
285    }
286}
287