/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes
17e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughespackage java.nio.charset;
18e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes
19e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughesimport java.io.UTFDataFormatException;
20e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes
/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
 * 0xc0 0x80. This avoids the presence of zero bytes in the output. Every {@code char} of the
 * input is encoded on its own, so the encoded form only contains one-, two- and three-byte
 * sequences.
 *
 * @hide
 */
public class ModifiedUtf8 {

    /**
     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * <p>Additionally, if {@code shortLength} is true, throws a {@code UTFDataFormatException}
     * if the size cannot be represented in an (unsigned) java short.
     *
     * @param s the string to measure
     * @param shortLength whether the result must fit in an unsigned 16-bit value
     * @return the number of bytes needed to encode {@code s}
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size is
     *         larger than 0xffff
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                // U+0000 takes two bytes (0xc0 0x80) instead of one.
                if (c == '\u0000') {
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (as the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The caller must make sure the output buffer has enough space (see
     * {@link #countBytes(String, boolean)}); no bounds check is performed here, so writing
     * past the end of {@code dst} fails with an {@code ArrayIndexOutOfBoundsException}.
     *
     * @param dst the buffer receiving the encoded bytes
     * @param offset the index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // U+0000 is written as the two-byte sequence 0xc0 0x80 so that the
                    // output never contains a zero byte.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                // Two bytes: 110xxxxx 10xxxxxx.
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
     * (as a big endian short). A UTFDataFormatException is thrown if the encoded size cannot
     * be represented as a short.
     *
     * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return a newly allocated array holding the two length bytes followed by the encoded
     *         string
     * @throws UTFDataFormatException if the encoded size is larger than 0xffff
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes has already verified that size fits in 16 bits, so the int cast is safe.
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} utf-8 bytes from {@code in} starting at offset {@code offset} to
     * {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
     *
     * <p>If a '0' byte is encountered, it is converted to U+0000.
     *
     * @param in the buffer holding the encoded bytes
     * @param out scratch buffer receiving the decoded chars
     * @param offset the index in {@code in} of the first byte to decode
     * @param length the number of bytes to decode
     * @return a string made of the decoded chars
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative, or if
     *         {@code offset + length} overflows an int
     * @throws UTFDataFormatException if the input ends in the middle of a sequence, contains
     *         a bad continuation byte, or contains a byte that cannot start a one-, two- or
     *         three-byte sequence
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        // The last clause rejects ranges whose end index would wrap around to a negative
        // value, which would otherwise make the loop below silently decode nothing.
        if (offset < 0 || length < 0 || length > Integer.MAX_VALUE - offset) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                // Single byte: 0xxxxxxx.
                out[outputIndex] = (char) i;
                outputIndex++;
            } else if ((i & 0xe0) == 0xc0) {
                // Two bytes: 110xxxxx 10xxxxxx. This branch covers the case 0 = 0xc080.

                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                i = (i & 0x1f) << 6;
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if ((i & 0xf0) == 0xe0) {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. The lead byte pattern is tested
                // explicitly so that stray continuation bytes (0x80..0xbf) are not mistaken
                // for the start of a three-byte sequence.

                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                // + 6 l-s of next to next input byte.
                i = (i & 0x0f) << 12;
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte, with 6 bits of room
                // for the next byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                i = i | (in[offset] & 0x3f) << 6;
                offset++;
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // Either a continuation byte in lead position (0x80..0xbf) or the lead byte
                // of a sequence longer than three bytes (0xf0..0xff).
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}
179