/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * As per the Apache license requirements, this file has been modified
 * from its original state.
 *
 * Such modifications are Copyright (C) 2010 Ben Gruver, and are released
 * under the original license.
 */

259f1d05eb44d85183651753d9b83ae3115a8ea256Ben Gruverpackage org.jf.util;
266eae34831fee1f116f3a453bdc5e143d68e05e03JesusFreke@JesusFreke.com
2760d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruverimport javax.annotation.Nonnull;
2860d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruverimport javax.annotation.Nullable;
2960d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver
3083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com/**
3183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com * Constants of type <code>CONSTANT_Utf8_info</code>.
3283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com */
3383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.compublic final class Utf8Utils {
3483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    /**
3583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
3683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * differs from normal UTF-8 in the handling of character '\0' and
3783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * surrogate pairs.
382ba2d0f16bf8a40d89f8ba26ed01096b2cb629f3JesusFreke@JesusFreke.com     *
3983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @param string non-null; the string to convert
4083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @return non-null; the UTF-8 bytes for it
4183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     */
4283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    public static byte[] stringToUtf8Bytes(String string) {
4383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        int len = string.length();
4483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
4583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        int outAt = 0;
4683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com
4783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        for (int i = 0; i < len; i++) {
4883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            char c = string.charAt(i);
4983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            if ((c != 0) && (c < 0x80)) {
5083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                bytes[outAt] = (byte) c;
5183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                outAt++;
5283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            } else if (c < 0x800) {
5383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
5483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
5583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                outAt += 2;
5683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            } else {
5783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
5883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
5983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
6083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                outAt += 3;
6183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            }
6283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        }
6383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com
6483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        byte[] result = new byte[outAt];
6583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        System.arraycopy(bytes, 0, result, 0, outAt);
6683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        return result;
6783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    }
6883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com
6909e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver    private static final ThreadLocal<char[]> localBuffer =
7009e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver            new ThreadLocal<char[]> () {
7109e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver                @Override protected char[] initialValue() {
7209e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver                    // A reasonably sized initial value
7309e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver                    return new char[256];
7409e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver                }
7509e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver            };
762ba2d0f16bf8a40d89f8ba26ed01096b2cb629f3JesusFreke@JesusFreke.com
7783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    /**
7883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * Converts an array of UTF-8 bytes into a string.
792ba2d0f16bf8a40d89f8ba26ed01096b2cb629f3JesusFreke@JesusFreke.com     *
8083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @param bytes non-null; the bytes to convert
811f29ee7351fd7fb48bb093b39b5f9ffddb34a3eaJesusFreke@JesusFreke.com     * @param start the start index of the utf8 string to convert
82bcc4d2d9e186b00386cba334a31b0f9ebffd299aJesusFreke@JesusFreke.com     * @param length the length of the utf8 string to convert, not including any null-terminator that might be present
8383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @return non-null; the converted string
8483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     */
851f29ee7351fd7fb48bb093b39b5f9ffddb34a3eaJesusFreke@JesusFreke.com    public static String utf8BytesToString(byte[] bytes, int start, int length) {
8609e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver        char[] chars = localBuffer.get();
8709e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver        if (chars == null || chars.length < length) {
8809e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver            chars = new char[length];
8909e6d003cbe194778ac322e248ba30438a30bb68Ben Gruver            localBuffer.set(chars);
902ba2d0f16bf8a40d89f8ba26ed01096b2cb629f3JesusFreke@JesusFreke.com        }
9183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        int outAt = 0;
9283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com
931f29ee7351fd7fb48bb093b39b5f9ffddb34a3eaJesusFreke@JesusFreke.com        for (int at = start; length > 0; /*at*/) {
9483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            int v0 = bytes[at] & 0xFF;
9583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            char out;
9683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            switch (v0 >> 4) {
9783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                case 0x00: case 0x01: case 0x02: case 0x03:
9883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                case 0x04: case 0x05: case 0x06: case 0x07: {
9983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    // 0XXXXXXX -- single-byte encoding
10083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    length--;
10183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if (v0 == 0) {
10283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        // A single zero byte is illegal.
10383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v0, at);
10483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
10583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    out = (char) v0;
10683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    at++;
10783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    break;
10883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                }
10983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                case 0x0c: case 0x0d: {
11083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    // 110XXXXX -- two-byte encoding
11183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    length -= 2;
11283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if (length < 0) {
11383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v0, at);
11483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
11583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    int v1 = bytes[at + 1] & 0xFF;
11683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if ((v1 & 0xc0) != 0x80) {
11783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v1, at + 1);
11883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
11983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
12083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if ((value != 0) && (value < 0x80)) {
12183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        /*
12283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                         * This should have been represented with
12383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                         * one-byte encoding.
12483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                         */
12583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v1, at + 1);
12683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
12783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    out = (char) value;
12883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    at += 2;
12983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    break;
13083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                }
13183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                case 0x0e: {
13283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    // 1110XXXX -- three-byte encoding
13383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    length -= 3;
13483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if (length < 0) {
13583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v0, at);
13683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
13783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    int v1 = bytes[at + 1] & 0xFF;
13883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if ((v1 & 0xc0) != 0x80) {
13983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v1, at + 1);
14083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
14183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    int v2 = bytes[at + 2] & 0xFF;
14266b89545a4b397a7e90dff7f8fff5349c074dcaeBen Gruver                    if ((v2 & 0xc0) != 0x80) {
14383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v2, at + 2);
14483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
14583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
146e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                            (v2 & 0x3f);
147e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    if (value < 0x800) {
148e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        /*
149e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                         * This should have been represented with one- or
150e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                         * two-byte encoding.
151e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                         */
152e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        return throwBadUtf8(v2, at + 2);
153e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    }
154e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    out = (char) value;
155e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    at += 3;
156e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    break;
157e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                }
158e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                default: {
159e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    // 10XXXXXX, 1111XXXX -- illegal
160e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    return throwBadUtf8(v0, at);
161e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                }
162e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            }
163e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            chars[outAt] = out;
164e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            outAt++;
165e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver        }
166e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver
167e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver        return new String(chars, 0, outAt);
168e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver    }
169e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver
170e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver    /**
171e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     * Converts an array of UTF-8 bytes into a string.
172e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     *
173e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     * @param bytes non-null; the bytes to convert
174e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     * @param start the start index of the utf8 string to convert
175e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     * @param utf16Length the number of utf16 characters in the string to decode
176e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     * @return non-null; the converted string
177e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver     */
17860d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver    public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length) {
17960d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver        return utf8BytesWithUtf16LengthToString(bytes, start, utf16Length, null);
18060d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver    }
18160d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver
18260d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver    /**
18360d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     * Converts an array of UTF-8 bytes into a string.
18460d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     *
18560d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     * @param bytes non-null; the bytes to convert
18660d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     * @param start the start index of the utf8 string to convert
18760d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     * @param utf16Length the number of utf16 characters in the string to decode
18860d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     * @param readLength If non-null, the first element will contain the number of bytes read after the method exits
18960d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     * @return non-null; the converted string
19060d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver     */
19160d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver    public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length,
19260d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver                                                          @Nullable int[] readLength) {
193e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver        char[] chars = localBuffer.get();
194e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver        if (chars == null || chars.length < utf16Length) {
195e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            chars = new char[utf16Length];
196e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            localBuffer.set(chars);
197e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver        }
198e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver        int outAt = 0;
199e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver
20060d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver        int at = 0;
20160d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver        for (at = start; utf16Length > 0; utf16Length--) {
202e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            int v0 = bytes[at] & 0xFF;
203e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            char out;
204e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver            switch (v0 >> 4) {
205e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                case 0x00: case 0x01: case 0x02: case 0x03:
206e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                case 0x04: case 0x05: case 0x06: case 0x07: {
207e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    // 0XXXXXXX -- single-byte encoding
208e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    if (v0 == 0) {
209e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        // A single zero byte is illegal.
210e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        return throwBadUtf8(v0, at);
211e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    }
212e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    out = (char) v0;
213e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    at++;
214e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    break;
215e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                }
216e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                case 0x0c: case 0x0d: {
217e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    // 110XXXXX -- two-byte encoding
218e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    int v1 = bytes[at + 1] & 0xFF;
219e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    if ((v1 & 0xc0) != 0x80) {
220e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        return throwBadUtf8(v1, at + 1);
221e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    }
222e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
223e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    if ((value != 0) && (value < 0x80)) {
224e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        /*
225e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                         * This should have been represented with
226e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                         * one-byte encoding.
227e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                         */
228e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        return throwBadUtf8(v1, at + 1);
229e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    }
230e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    out = (char) value;
231e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    at += 2;
232e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    break;
233e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                }
234e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                case 0x0e: {
235e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    // 1110XXXX -- three-byte encoding
236e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    int v1 = bytes[at + 1] & 0xFF;
237e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    if ((v1 & 0xc0) != 0x80) {
238e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        return throwBadUtf8(v1, at + 1);
239e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    }
240e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    int v2 = bytes[at + 2] & 0xFF;
241e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    if ((v2 & 0xc0) != 0x80) {
242e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                        return throwBadUtf8(v2, at + 2);
243e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    }
244e593a563aeb74e31f9051f0ebf449b6d3d6d7394Ben Gruver                    int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
24583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        (v2 & 0x3f);
24683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    if (value < 0x800) {
24783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        /*
24883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                         * This should have been represented with one- or
24983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                         * two-byte encoding.
25083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                         */
25183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                        return throwBadUtf8(v2, at + 2);
25283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    }
25383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    out = (char) value;
25483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    at += 3;
25583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    break;
25683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                }
25783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                default: {
25883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    // 10XXXXXX, 1111XXXX -- illegal
25983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                    return throwBadUtf8(v0, at);
26083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                }
26183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            }
26283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            chars[outAt] = out;
26383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com            outAt++;
26483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        }
26583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com
26660d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver        if (readLength != null && readLength.length > 0) {
26760d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver            readLength[0] = at - start;
26860d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver            readLength[0] = at - start;
26960d834f95104bcbe856f4100ec967f4aae67eb04Ben Gruver        }
27083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        return new String(chars, 0, outAt);
27183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    }
27283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com
27383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    /**
27483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * Helper for {@link #utf8BytesToString}, which throws the right
27583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * exception for a bogus utf-8 byte.
2762ba2d0f16bf8a40d89f8ba26ed01096b2cb629f3JesusFreke@JesusFreke.com     *
27783b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @param value the byte value
27883b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @param offset the file offset
27983b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @return never
28083b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     * @throws IllegalArgumentException always thrown
28183b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com     */
28283b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    private static String throwBadUtf8(int value, int offset) {
28383b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com        throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
28483b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com                                           " at offset " + Hex.u4(offset));
28583b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com    }
28683b80f81d311b233188c281059aad4a9f5e8b4e6JesusFreke@JesusFreke.com}
287