/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package java.nio.charset;

import java.io.UTFDataFormatException;
import java.nio.ByteOrder;
import libcore.io.Memory;
import libcore.io.SizeOf;

/**
 * Static helpers for converting between Java strings and <i>modified UTF-8</i>.
 *
 * <p>As implemented here, every UTF-16 {@code char} is encoded on its own: U+0001..U+007F
 * take one byte, U+0000 and U+0080..U+07FF take two bytes, and everything else takes three
 * bytes. Surrogates are therefore encoded individually, and U+0000 never produces a zero byte.
 *
 * @hide internal use only
 */
public final class ModifiedUtf8 {
    /** Size in bytes of the 16-bit length prefix produced by {@link #encode(String)}. */
    private static final int LENGTH_PREFIX_SIZE = 2;

    /**
     * Decodes a byte array containing <i>modified UTF-8</i> bytes into a string.
     *
     * <p>Note that although this method decodes the (supposedly impossible) zero byte to U+0000,
     * that's what the RI does too.
     *
     * @param in the array holding the encoded bytes
     * @param out scratch buffer that receives the decoded chars; it needs room for one char
     *     per decoded character
     * @param offset index in {@code in} of the first encoded byte
     * @param utfSize number of encoded bytes to consume, starting at {@code offset}
     * @return a new string built from the chars written to {@code out}
     * @throws UTFDataFormatException if a multi-byte sequence is truncated, a continuation
     *     byte is not of the form {@code 10xxxxxx}, or a lead byte does not start a one-, two-
     *     or three-byte sequence
     */
    public static String decode(byte[] in, char[] out, int offset, int utfSize)
            throws UTFDataFormatException {
        int count = 0; // Encoded bytes consumed so far.
        int s = 0;     // Chars produced so far.
        while (count < utfSize) {
            // The byte is sign-extended by the cast, so any byte with the top bit set becomes
            // a char >= U+FF80; its low eight bits still identify the kind of lead byte. The
            // raw value is parked in out[s] and overwritten once the sequence is decoded.
            final int a = out[s] = (char) in[offset + count++];
            if (a < 0x80) {
                // Single byte: 0xxxxxxx.
                s++;
            } else if ((a & 0xe0) == 0xc0) {
                // Two bytes: 110xxxxx 10xxxxxx.
                if (count >= utfSize) {
                    throw new UTFDataFormatException("bad second byte at " + count);
                }
                final int b = in[offset + count++];
                if ((b & 0xC0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + (count - 1));
                }
                out[s++] = (char) (((a & 0x1F) << 6) | (b & 0x3F));
            } else if ((a & 0xf0) == 0xe0) {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
                if (count + 1 >= utfSize) {
                    throw new UTFDataFormatException("bad third byte at " + (count + 1));
                }
                final int b = in[offset + count++];
                final int c = in[offset + count++];
                if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) {
                    throw new UTFDataFormatException("bad second or third byte at " + (count - 2));
                }
                out[s++] = (char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F));
            } else {
                // A stray continuation byte or a lead byte for a sequence longer than three.
                throw new UTFDataFormatException("bad byte at " + (count - 1));
            }
        }
        return new String(out, 0, s);
    }

    /**
     * Returns the number of bytes the modified UTF-8 representation of 's' would take. Note
     * that this is just the space for the bytes representing the characters, not the length
     * which precedes those bytes, because different callers represent the length differently,
     * as two, four, or even eight bytes.
     *
     * @param s the string to measure
     * @param shortLength if true, fail as soon as the running total no longer fits in an
     *     unsigned 16-bit length
     * @return the encoded size of {@code s} in bytes
     * @throws UTFDataFormatException if {@code shortLength} is true and {@code s} needs more
     *     than 65535 bytes
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long result = 0;
        final int length = s.length();
        for (int i = 0; i < length; ++i) {
            final char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                ++result;
            } else if (ch <= 2047) {
                result += 2;
            } else {
                result += 3;
            }
            // Checked on every iteration so an over-long string is rejected without
            // scanning the rest of it.
            if (shortLength && result > 65535) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return result;
    }

    /**
     * Encodes the <i>modified UTF-8</i> bytes corresponding to string {@code s} into the
     * byte array {@code dst}, starting at the given {@code offset}. No length prefix is
     * written; the caller must supply an array with enough room after {@code offset}
     * (see {@link #countBytes}).
     *
     * @param dst the destination array
     * @param offset index in {@code dst} at which the first encoded byte is stored
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            final char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                dst[offset++] = (byte) ch;
            } else if (ch <= 2047) {
                dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            } else {
                dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
                dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            }
        }
    }

    /**
     * Returns an array containing the <i>modified UTF-8</i> form of {@code s}, using a
     * big-endian 16-bit length. Throws UTFDataFormatException if {@code s} is too long
     * for a two-byte length.
     *
     * @param s the string to encode
     * @return a new array holding the two-byte length followed by the encoded bytes
     * @throws UTFDataFormatException if {@code s} needs more than 65535 encoded bytes
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes(s, true) has already rejected anything above 65535, so the cast is safe.
        final int utfCount = (int) countBytes(s, true);
        final byte[] result = new byte[LENGTH_PREFIX_SIZE + utfCount];
        // Length prefix, most significant byte first.
        result[0] = (byte) (utfCount >> 8);
        result[1] = (byte) utfCount;
        encode(result, LENGTH_PREFIX_SIZE, s);
        return result;
    }

    /** Not instantiable: this class only hosts static helpers. */
    private ModifiedUtf8() {
    }
}
129