1e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes/* 2e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * Licensed to the Apache Software Foundation (ASF) under one or more 3e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * contributor license agreements. See the NOTICE file distributed with 4e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * this work for additional information regarding copyright ownership. 5e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * The ASF licenses this file to You under the Apache License, Version 2.0 6e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * (the "License"); you may not use this file except in compliance with 7e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * the License. You may obtain a copy of the License at 8e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * 9e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * http://www.apache.org/licenses/LICENSE-2.0 10e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * 11e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * Unless required by applicable law or agreed to in writing, software 12e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * distributed under the License is distributed on an "AS IS" BASIS, 13e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * See the License for the specific language governing permissions and 15e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * limitations under the License. 16e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes */ 17e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes 18e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughespackage java.nio.charset; 19e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes 20e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughesimport java.io.UTFDataFormatException; 219559e748729ef1deb6400f31d0407543cbff3566Elliott Hughesimport java.nio.ByteOrder; 22f934c3d2c8dd9e6bc5299cef41adace2a671637dElliott Hughesimport libcore.io.Memory; 239559e748729ef1deb6400f31d0407543cbff3566Elliott Hughesimport libcore.io.SizeOf; 24e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes 25e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes/** 26e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * @hide internal use only 27e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes */ 28e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughespublic class ModifiedUtf8 { 29e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes /** 30e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * Decodes a byte array containing <i>modified UTF-8</i> bytes into a string. 31e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * 32e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * <p>Note that although this method decodes the (supposedly impossible) zero byte to U+0000, 33e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes * that's what the RI does too. 34e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes */ 35e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes public static String decode(byte[] in, char[] out, int offset, int utfSize) throws UTFDataFormatException { 36e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes int count = 0, s = 0, a; 37e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes while (count < utfSize) { 38e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes if ((out[s] = (char) in[offset + count++]) < '\u0080') { 39e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes s++; 40e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } else if (((a = out[s]) & 0xe0) == 0xc0) { 41e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes if (count >= utfSize) { 42e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes throw new UTFDataFormatException("bad second byte at " + count); 43e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 44e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes int b = in[offset + count++]; 45e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes if ((b & 0xC0) != 0x80) { 46e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes throw new UTFDataFormatException("bad second byte at " + (count - 1)); 47e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 48e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes out[s++] = (char) (((a & 0x1F) << 6) | (b & 0x3F)); 49e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } else if ((a & 0xf0) == 0xe0) { 50e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes if (count + 1 >= utfSize) { 51e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes throw new UTFDataFormatException("bad third byte at " + (count + 1)); 52e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 53e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes int b = in[offset + count++]; 54e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes int c = in[offset + count++]; 55e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) { 56e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes throw new UTFDataFormatException("bad second or third byte at " + (count - 2)); 57e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 58e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes out[s++] = (char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F)); 59e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } else { 60e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes throw new UTFDataFormatException("bad byte at " + (count - 1)); 61e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 62e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 63e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes return new String(out, 0, s); 64e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 65e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes 669559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes /** 676aa068b481cc4cca7765ce90fdf32f3eb2b5a77cElliott Hughes * Returns the number of bytes the modified UTF-8 representation of 's' would take. Note 689559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * that this is just the space for the bytes representing the characters, not the length 699559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * which precedes those bytes, because different callers represent the length differently, 709559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * as two, four, or even eight bytes. If {@code shortLength} is true, we'll throw an 719559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * exception if the string is too long for its length to be represented by a short. 729559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes */ 739559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException { 749559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes long result = 0; 759559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes final int length = s.length(); 769559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes for (int i = 0; i < length; ++i) { 779559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes char ch = s.charAt(i); 789559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes if (ch != 0 && ch <= 127) { // U+0000 uses two bytes. 799559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes ++result; 809559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } else if (ch <= 2047) { 819559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes result += 2; 829559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } else { 839559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes result += 3; 849559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 859559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes if (shortLength && result > 65535) { 869559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes throw new UTFDataFormatException("String more than 65535 UTF bytes long"); 879559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 889559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 899559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes return result; 909559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 919559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes 929559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes /** 939559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * Encodes the <i>modified UTF-8</i> bytes corresponding to string {@code s} into the 949559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * byte array {@code dst}, starting at the given {@code offset}. 959559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes */ 969559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes public static void encode(byte[] dst, int offset, String s) { 979559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes final int length = s.length(); 989559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes for (int i = 0; i < length; i++) { 999559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes char ch = s.charAt(i); 1009559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes if (ch != 0 && ch <= 127) { // U+0000 uses two bytes. 1019559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes dst[offset++] = (byte) ch; 1029559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } else if (ch <= 2047) { 1039559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6))); 1049559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes dst[offset++] = (byte) (0x80 | (0x3f & ch)); 1059559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } else { 1069559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12))); 1079559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6))); 1089559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes dst[offset++] = (byte) (0x80 | (0x3f & ch)); 1099559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 1109559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 1119559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 1129559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes 1139559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes /** 1149559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * Returns an array containing the <i>modified UTF-8</i> form of {@code s}, using a 1159559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * big-endian 16-bit length. Throws UTFDataFormatException if {@code s} is too long 1169559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes * for a two-byte length. 1179559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes */ 1189559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes public static byte[] encode(String s) throws UTFDataFormatException { 1199559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes int utfCount = (int) ModifiedUtf8.countBytes(s, true); 1209559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes byte[] result = new byte[SizeOf.SHORT + utfCount]; 121f934c3d2c8dd9e6bc5299cef41adace2a671637dElliott Hughes Memory.pokeShort(result, 0, (short) utfCount, ByteOrder.BIG_ENDIAN); 1229559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes ModifiedUtf8.encode(result, SizeOf.SHORT, s); 1239559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes return result; 1249559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes } 1259559e748729ef1deb6400f31d0407543cbff3566Elliott Hughes 126e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes private ModifiedUtf8() { 127e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes } 128e810d3b49631329b11440aa5b7a54db181d42ed1Elliott Hughes} 129