/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */

import java.io.UTFDataFormatException;

/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
 * 0xc0 0x80. This avoids the presence of 0 bytes in the output. Every {@code char} is encoded
 * on its own, in one, two or three bytes; no four-byte sequences are produced or accepted.
 *
 * @hide
 */
public class ModifiedUtf8 {

    /**
     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * <p>Each {@code char} is sized independently: U+0001..U+007F take one byte, U+0000 and
     * U+0080..U+07FF take two bytes, and every other {@code char} takes three bytes.
     *
     * @param s the string to measure
     * @param shortLength if true, require the size to be representable as an unsigned
     *        16-bit value
     * @return the encoded size in bytes
     * @throws UTFDataFormatException if {@code shortLength} is true and the size is larger
     *         than 0xffff
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                if (c == '\u0000') {
                    // U+0000 is written as the two-byte form 0xc0 0x80.
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The output buffer is assumed to have enough space; no capacity check is made here,
     * so writing past the end of {@code dst} fails with the usual
     * {@code ArrayIndexOutOfBoundsException}.
     *
     * @param dst the destination buffer
     * @param offset index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // Two-byte form of U+0000, so the output never contains a 0 byte.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a newly allocated buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified UTF-8 output
     * (as a big-endian unsigned short);
     *
     * <p>- the remainder of the buffer contains the modified UTF-8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return the length-prefixed encoding of {@code s}
     * @throws UTFDataFormatException if the encoded size cannot be represented in two bytes
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes(s, true) guarantees size <= 0xffff, so the int cast is lossless.
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} modified UTF-8 bytes from {@code in}, starting at offset
     * {@code offset}, into {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
     *
     * <p>If a 0 byte is encountered, it is converted to U+0000. A byte of the form
     * {@code 10xxxxxx} or {@code 1111xxxx} in lead position is rejected. Overlong encodings
     * are not checked for.
     *
     * @param in the buffer holding the encoded bytes
     * @param out scratch buffer that receives the decoded chars
     * @param offset index in {@code in} of the first byte to decode
     * @param length number of bytes to decode
     * @return a string made of the chars written to {@code out}
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative, or
     *         if {@code offset + length} overflows an {@code int}
     * @throws UTFDataFormatException if the input is truncated in the middle of a sequence,
     *         or contains an invalid lead or continuation byte
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        // The last clause guards offset + length against int overflow; a wrapped (negative)
        // limit would otherwise skip the loop and silently return an empty string.
        if (offset < 0 || length < 0 || offset > Integer.MAX_VALUE - length) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                // Single byte; this includes a raw 0 byte, accepted as U+0000.
                out[outputIndex] = (char) i;
                outputIndex++;
            } else if ((i & 0xe0) == 0xc0) {
                // Two-byte sequence (110xxxxx 10xxxxxx). This branch covers the case
                // U+0000 = 0xc0 0x80.
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                // 5 least-significant bits of the lead byte + 6 l-s bits of the next byte.
                out[outputIndex] = (char) (((i & 0x1f) << 6) | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if ((i & 0xf0) == 0xe0) {
                // Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx).
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                // 4 least-significant bits of the lead byte + 6 l-s bits of the second byte,
                // leaving 6 bits of room for the third byte.
                int highBits = ((i & 0x0f) << 12) | ((in[offset] & 0x3f) << 6);
                offset++;
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (highBits | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // 10xxxxxx (continuation byte in lead position) or 1111xxxx.
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}