/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package java.nio.charset;

import java.io.UTFDataFormatException;
import java.nio.ByteOrder;
import libcore.io.Memory;
import libcore.io.SizeOf;

/**
 * Static helpers for converting between strings and <i>modified UTF-8</i> bytes.
 *
 * @hide internal use only
 */
public class ModifiedUtf8 {
    /**
     * Decodes {@code utfSize} <i>modified UTF-8</i> bytes, read from {@code in} starting at
     * index {@code offset}, into a string.
     *
     * <p>Decoded chars are written into {@code out} starting at index 0, and the returned
     * string is built from that prefix of {@code out}. Only one-, two- and three-byte
     * sequences are accepted. Positions quoted in exception messages are relative to
     * {@code offset}.
     *
     * <p>Note that a zero byte (supposedly impossible in this encoding) is decoded to U+0000;
     * that's what the RI does too.
     *
     * @param in the array holding the encoded bytes
     * @param out scratch buffer that receives the decoded chars
     * @param offset index in {@code in} of the first encoded byte
     * @param utfSize number of encoded bytes to consume
     * @return the decoded string
     * @throws UTFDataFormatException if a lead byte is not valid, a continuation byte is
     *         malformed, or a multi-byte sequence is cut short by {@code utfSize}
     */
    public static String decode(byte[] in, char[] out, int offset, int utfSize)
            throws UTFDataFormatException {
        int consumed = 0;
        int written = 0;
        while (consumed < utfSize) {
            // The byte is sign-extended by the cast, so any byte with the high bit set
            // becomes a char >= 0xff80 and fails the single-byte test below.
            final char unit = (char) in[offset + consumed++];
            out[written] = unit;
            if (unit < '\u0080') {
                // Single-byte sequence: the stored char is already the decoded value.
                written++;
                continue;
            }

            final int lead = unit;
            if ((lead & 0xe0) == 0xc0) {
                // Two-byte sequence: 110xxxxx 10xxxxxx.
                if (consumed >= utfSize) {
                    throw new UTFDataFormatException("bad second byte at " + consumed);
                }
                final int second = in[offset + consumed++];
                if (!isContinuation(second)) {
                    throw new UTFDataFormatException("bad second byte at " + (consumed - 1));
                }
                out[written++] = (char) (((lead & 0x1F) << 6) | (second & 0x3F));
            } else if ((lead & 0xf0) == 0xe0) {
                // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
                if (consumed + 1 >= utfSize) {
                    throw new UTFDataFormatException("bad third byte at " + (consumed + 1));
                }
                final int second = in[offset + consumed++];
                final int third = in[offset + consumed++];
                if (!(isContinuation(second) && isContinuation(third))) {
                    throw new UTFDataFormatException(
                            "bad second or third byte at " + (consumed - 2));
                }
                out[written++] =
                        (char) (((lead & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F));
            } else {
                throw new UTFDataFormatException("bad byte at " + (consumed - 1));
            }
        }
        return new String(out, 0, written);
    }

    /**
     * Returns the number of bytes the modified UTF-8 representation of {@code s} would take.
     * This counts only the bytes representing the characters, not the length that precedes
     * them, because different callers represent the length differently (as two, four, or
     * even eight bytes).
     *
     * @param s the string to measure
     * @param shortLength if true, fail as soon as the running total exceeds 65535, the
     *        largest count a two-byte length can represent
     * @return the encoded size of {@code s} in bytes
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded form is
     *         longer than 65535 bytes
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        final int charCount = s.length();
        long total = 0;
        for (int index = 0; index < charCount; index++) {
            total += encodedLength(s.charAt(index));
            if (shortLength && total > 65535) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return total;
    }

    /**
     * Writes the <i>modified UTF-8</i> bytes for string {@code s} into {@code dst},
     * beginning at index {@code offset}. No length prefix is written.
     *
     * @param dst the destination array
     * @param offset index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        final int charCount = s.length();
        int pos = offset;
        for (int index = 0; index < charCount; index++) {
            final char ch = s.charAt(index);
            switch (encodedLength(ch)) {
                case 1:
                    dst[pos++] = (byte) ch;
                    break;
                case 2:
                    dst[pos++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
                    dst[pos++] = (byte) (0x80 | (0x3f & ch));
                    break;
                default:
                    dst[pos++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
                    dst[pos++] = (byte) (0x80 | (0x3f & (ch >> 6)));
                    dst[pos++] = (byte) (0x80 | (0x3f & ch));
                    break;
            }
        }
    }

    /**
     * Returns a new array holding a big-endian 16-bit byte count followed by the
     * <i>modified UTF-8</i> form of {@code s}.
     *
     * @param s the string to encode
     * @return the length-prefixed encoded bytes
     * @throws UTFDataFormatException if {@code s} is too long for a two-byte length
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        final int payloadLength = (int) countBytes(s, true);
        final byte[] encoded = new byte[SizeOf.SHORT + payloadLength];
        Memory.pokeShort(encoded, 0, (short) payloadLength, ByteOrder.BIG_ENDIAN);
        encode(encoded, SizeOf.SHORT, s);
        return encoded;
    }

    /** Returns true if the low eight bits of {@code b} have the form 10xxxxxx. */
    private static boolean isContinuation(int b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * Returns how many bytes (1, 2 or 3) this class uses to encode {@code ch}.
     * U+0000 uses two bytes.
     */
    private static int encodedLength(char ch) {
        if (ch != 0 && ch <= 127) {
            return 1;
        }
        if (ch <= 2047) {
            return 2;
        }
        return 3;
    }

    private ModifiedUtf8() {
    }
}