nio/charset/ModifiedUtf8.java

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */

package java.nio.charset;

import java.io.UTFDataFormatException;

/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which U+0000 is encoded as the two bytes
 * 0xc0 0x80. This avoids the presence of 0 bytes in the output. Every {@code char} is encoded
 * on its own as one, two or three bytes; no sequence longer than three bytes is produced or
 * accepted.
 *
 * @hide
 */
public class ModifiedUtf8 {

    /**
     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * <p>Additionally, if {@code shortLength} is true, throws a {@code UTFDataFormatException}
     * if the size cannot be represented in an (unsigned) java short.
     *
     * @param s the string to measure
     * @param shortLength whether the result must fit in an unsigned 16-bit value
     * @return the number of bytes needed to encode {@code s}
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size
     *         exceeds 0xffff
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                // U+0000 is written as the two-byte form 0xc0 0x80, so it costs one extra byte.
                if (c == 0) {
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (as the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The caller is responsible for providing an output buffer with enough space (see
     * {@link #countBytes}); a standard {@code ArrayIndexOutOfBoundsException} is thrown
     * otherwise.
     *
     * @param dst the destination buffer
     * @param offset the index in {@code dst} at which the first byte is written
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // U+0000 gets the two-byte form so that no 0 byte appears in the output.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                // Two bytes: 110xxxxx 10xxxxxx.
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
     * (as a big endian short). A UTFDataFormatException is thrown if the encoded size cannot be
     * represented as a short.
     *
     * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return a newly allocated array holding the two-byte length followed by the encoded bytes
     * @throws UTFDataFormatException if the encoded size exceeds 0xffff
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes(s, true) guarantees size <= 0xffff, so the int cast cannot truncate.
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} modified UTF-8 bytes from {@code in} starting at offset
     * {@code offset} to {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
     *
     * <p>If a 0 byte is encountered, it is converted to U+0000.
     *
     * <p>Only one-, two- and three-byte sequences are accepted. A sequence must start with a
     * byte below 0x80 or in the range 0xc0..0xef; a continuation byte (0x80..0xbf) in lead
     * position, or a byte of 0xf0 and above, is rejected.
     *
     * @param in the buffer holding the encoded bytes
     * @param out the scratch buffer the decoded chars are written to
     * @param offset the index in {@code in} of the first byte to decode
     * @param length the number of bytes to decode
     * @return a string made of the decoded chars
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
     * @throws UTFDataFormatException if the input contains an invalid lead byte, an invalid
     *         continuation byte, or ends in the middle of a sequence
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        if (offset < 0 || length < 0) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                // Single byte; a raw 0 byte is accepted and yields U+0000.
                out[outputIndex] = (char) i;
                outputIndex++;
            } else if (0xc0 <= i && i < 0xe0) {
                // This branch covers the case 0 = 0xc080.

                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                i = (i & 0x1f) << 6;
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if (0xe0 <= i && i < 0xf0) {
                // The lower bound matters: without it a stray continuation byte (0x80..0xbf)
                // would be decoded as the start of a three-byte sequence.

                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                // + 6 l-s of next to next input byte.
                i = (i & 0x0f) << 12;
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte, with 6 bits of room
                // for the next byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                i = i | (in[offset] & 0x3f) << 6;
                offset++;
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // Either a continuation byte in lead position (0x80..0xbf) or the start of a
                // sequence longer than three bytes (0xf0 and above).
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}