/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */

package java.nio.charset;

import java.io.UTFDataFormatException;

/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
 * 0xc0 0x80. This avoids the presence of 0 bytes in the output. Every {@code char} is encoded
 * on its own using one, two or three bytes; no four-byte sequences are produced or accepted.
 *
 * @hide
 */
public class ModifiedUtf8 {

    /**
     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * <p>Additionally, if {@code shortLength} is true, throws a {@code UTFDataFormatException}
     * if the size cannot be represented in an (unsigned) java short.
     *
     * @param s the string to measure
     * @param shortLength whether the result must fit in an unsigned 16-bit value
     * @return the number of bytes needed to encode {@code s}
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size is
     *         greater than 0xffff
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                if (c == '\u0000') {
                    // U+0000 is written as the two bytes 0xc0 0x80.
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (as the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The output buffer is assumed to have enough space; no bounds check is performed
     * beyond the one done by the array accesses themselves.
     *
     * @param dst the destination buffer
     * @param offset the index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // Never emit a 0 byte: U+0000 uses the two-byte form.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
     * (as a big endian short). A UTFDataFormatException is thrown if the encoded size cannot be
     * represented as a short.
     *
     * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return a new array holding the two-byte length followed by the encoded bytes
     * @throws UTFDataFormatException if the encoded size is greater than 0xffff
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} utf-8 bytes from {@code in} starting at offset {@code offset} to
     * {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
     *
     * <p>If a {@code 0} byte is encountered, it is converted to U+0000.
     *
     * @param in the buffer holding the encoded bytes
     * @param out the buffer that receives the decoded chars, filled from index 0
     * @param offset the index in {@code in} of the first byte to decode
     * @param length the number of bytes to decode
     * @return a string made of the decoded chars
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
     * @throws UTFDataFormatException if the input ends in the middle of a sequence, a
     *         continuation byte is malformed, or a byte that cannot start a one, two or
     *         three-byte sequence (10xxxxxx or 1111xxxx) is found in lead position
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        if (offset < 0 || length < 0) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                out[outputIndex] = (char) i;
                outputIndex++;
                continue;
            }
            if (0xc0 <= i && i < 0xe0) {
                // This branch covers the case 0 = 0xc080.

                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                i = (i & 0x1f) << 6;
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if (0xe0 <= i && i < 0xf0) {
                // Only 1110xxxx may start a three-byte sequence. Without the lower bound, stray
                // continuation bytes (0x80..0xbf) would be silently decoded as lead bytes.

                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                // + 6 l-s of next to next input byte.
                i = (i & 0x0f) << 12;
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte, with 6 bits of room
                // for the next byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                i = i | ((in[offset] & 0x3f) << 6);
                offset++;
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // 10xxxxxx (continuation byte in lead position) or 1111xxxx (not part of
                // modified UTF-8).
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + (int) i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}