1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16
17package java.nio.charset;
18
19import java.io.UTFDataFormatException;
20
/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which U+0000 is encoded as the two bytes
 * 0xc0 0x80. This avoids the presence of 0 bytes in the output. Every other {@code char} is
 * encoded on its own in one, two or three bytes, so each half of a surrogate pair is written
 * as a separate three-byte sequence.
 *
 * @hide
 */
public class ModifiedUtf8 {

    /**
     * Count the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * <p>Additionally, if {@code shortLength} is true, throw a {@code UTFDataFormatException} if
     * the size cannot be represented in an (unsigned) java short.
     *
     * @param s the string to measure
     * @param shortLength whether the result must fit in an unsigned 16-bit value
     * @return the number of bytes {@link #encode(byte[], int, String)} writes for {@code s}
     * @throws UTFDataFormatException if {@code shortLength} is true and the size exceeds 0xffff
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                // U+0000 takes two bytes (0xc0 0x80) instead of one.
                if (c == '\u0000') {
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (as the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encode {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The caller must guarantee that the output buffer has enough space; no bounds check is
     * done here beyond the array accesses themselves.
     *
     * @param dst the destination buffer
     * @param offset index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // U+0000 is written as the two-byte form so the output has no 0 byte.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                // 110xxxxx 10xxxxxx
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                // 1110xxxx 10xxxxxx 10xxxxxx
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
     * (as a big endian short). A UTFDataFormatException is thrown if the encoded size cannot be
     * represented as an unsigned short.
     *
     * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return a newly allocated array holding the two length bytes followed by the encoded data
     * @throws UTFDataFormatException if the encoded size is larger than 0xffff bytes
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes has already verified that size fits in 16 bits, so the int cast is safe.
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} modified utf-8 bytes from {@code in} starting at offset
     * {@code offset} to {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
     *
     * <p>If a 0 byte is encountered, it is converted to U+0000, just like the two-byte form
     * 0xc0 0x80.
     *
     * @param in the buffer holding the encoded bytes
     * @param out scratch buffer that receives the decoded chars
     * @param offset index in {@code in} of the first byte to decode
     * @param length number of bytes to decode
     * @return a string built from the chars written to {@code out}
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
     * @throws UTFDataFormatException if a sequence is cut short by the end of the input, if a
     *         second or third byte is not of the form 10xxxxxx, or if the first byte of a
     *         sequence is of the form 10xxxxxx or 1111xxxx
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        if (offset < 0 || length < 0) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                // Single byte, including a raw 0 byte.
                out[outputIndex] = (char) i;
                outputIndex++;
                continue;
            }
            if (0xc0 <= i && i < 0xe0) {
                // This branch covers the case 0 = 0xc080.

                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                i = (i & 0x1f) << 6;
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if (0xe0 <= i && i < 0xf0) {
                // The lower bound matters: without it a stray continuation byte (0x80..0xbf)
                // in lead position would be decoded as if it started a three-byte sequence.

                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                // + 6 l-s of next to next input byte.
                i = (i & 0x0f) << 12;
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte, with 6 bits of room
                // for the next byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                i = i | (in[offset] & 0x3f) << 6;
                offset++;
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // Lead byte is 10xxxxxx or 1111xxxx: neither can start a sequence.
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}
179