1/* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * As per the Apache license requirements, this file has been modified 19 * from its original state. 20 * 21 * Such modifications are Copyright (C) 2010 Ben Gruver, and are released 22 * under the original license 23 */ 24 25package org.jf.util; 26 27import javax.annotation.Nonnull; 28import javax.annotation.Nullable; 29 30/** 31 * Constants of type <code>CONSTANT_Utf8_info</code>. 32 */ 33public final class Utf8Utils { 34 /** 35 * Converts a string into its Java-style UTF-8 form. Java-style UTF-8 36 * differs from normal UTF-8 in the handling of character '\0' and 37 * surrogate pairs. 38 * 39 * @param string non-null; the string to convert 40 * @return non-null; the UTF-8 bytes for it 41 */ 42 public static byte[] stringToUtf8Bytes(String string) { 43 int len = string.length(); 44 byte[] bytes = new byte[len * 3]; // Avoid having to reallocate. 45 int outAt = 0; 46 47 for (int i = 0; i < len; i++) { 48 char c = string.charAt(i); 49 if ((c != 0) && (c < 0x80)) { 50 bytes[outAt] = (byte) c; 51 outAt++; 52 } else if (c < 0x800) { 53 bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0); 54 bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80); 55 outAt += 2; 56 } else { 57 bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0); 58 bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80); 59 bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80); 60 outAt += 3; 61 } 62 } 63 64 byte[] result = new byte[outAt]; 65 System.arraycopy(bytes, 0, result, 0, outAt); 66 return result; 67 } 68 69 private static final ThreadLocal<char[]> localBuffer = 70 new ThreadLocal<char[]> () { 71 @Override protected char[] initialValue() { 72 // A reasonably sized initial value 73 return new char[256]; 74 } 75 }; 76 77 /** 78 * Converts an array of UTF-8 bytes into a string. 79 * 80 * @param bytes non-null; the bytes to convert 81 * @param start the start index of the utf8 string to convert 82 * @param length the length of the utf8 string to convert, not including any null-terminator that might be present 83 * @return non-null; the converted string 84 */ 85 public static String utf8BytesToString(byte[] bytes, int start, int length) { 86 char[] chars = localBuffer.get(); 87 if (chars == null || chars.length < length) { 88 chars = new char[length]; 89 localBuffer.set(chars); 90 } 91 int outAt = 0; 92 93 for (int at = start; length > 0; /*at*/) { 94 int v0 = bytes[at] & 0xFF; 95 char out; 96 switch (v0 >> 4) { 97 case 0x00: case 0x01: case 0x02: case 0x03: 98 case 0x04: case 0x05: case 0x06: case 0x07: { 99 // 0XXXXXXX -- single-byte encoding 100 length--; 101 if (v0 == 0) { 102 // A single zero byte is illegal. 103 return throwBadUtf8(v0, at); 104 } 105 out = (char) v0; 106 at++; 107 break; 108 } 109 case 0x0c: case 0x0d: { 110 // 110XXXXX -- two-byte encoding 111 length -= 2; 112 if (length < 0) { 113 return throwBadUtf8(v0, at); 114 } 115 int v1 = bytes[at + 1] & 0xFF; 116 if ((v1 & 0xc0) != 0x80) { 117 return throwBadUtf8(v1, at + 1); 118 } 119 int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f); 120 if ((value != 0) && (value < 0x80)) { 121 /* 122 * This should have been represented with 123 * one-byte encoding. 124 */ 125 return throwBadUtf8(v1, at + 1); 126 } 127 out = (char) value; 128 at += 2; 129 break; 130 } 131 case 0x0e: { 132 // 1110XXXX -- three-byte encoding 133 length -= 3; 134 if (length < 0) { 135 return throwBadUtf8(v0, at); 136 } 137 int v1 = bytes[at + 1] & 0xFF; 138 if ((v1 & 0xc0) != 0x80) { 139 return throwBadUtf8(v1, at + 1); 140 } 141 int v2 = bytes[at + 2] & 0xFF; 142 if ((v2 & 0xc0) != 0x80) { 143 return throwBadUtf8(v2, at + 2); 144 } 145 int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) | 146 (v2 & 0x3f); 147 if (value < 0x800) { 148 /* 149 * This should have been represented with one- or 150 * two-byte encoding. 151 */ 152 return throwBadUtf8(v2, at + 2); 153 } 154 out = (char) value; 155 at += 3; 156 break; 157 } 158 default: { 159 // 10XXXXXX, 1111XXXX -- illegal 160 return throwBadUtf8(v0, at); 161 } 162 } 163 chars[outAt] = out; 164 outAt++; 165 } 166 167 return new String(chars, 0, outAt); 168 } 169 170 /** 171 * Converts an array of UTF-8 bytes into a string. 172 * 173 * @param bytes non-null; the bytes to convert 174 * @param start the start index of the utf8 string to convert 175 * @param utf16Length the number of utf16 characters in the string to decode 176 * @return non-null; the converted string 177 */ 178 public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length) { 179 return utf8BytesWithUtf16LengthToString(bytes, start, utf16Length, null); 180 } 181 182 /** 183 * Converts an array of UTF-8 bytes into a string. 184 * 185 * @param bytes non-null; the bytes to convert 186 * @param start the start index of the utf8 string to convert 187 * @param utf16Length the number of utf16 characters in the string to decode 188 * @param readLength If non-null, the first element will contain the number of bytes read after the method exits 189 * @return non-null; the converted string 190 */ 191 public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length, 192 @Nullable int[] readLength) { 193 char[] chars = localBuffer.get(); 194 if (chars == null || chars.length < utf16Length) { 195 chars = new char[utf16Length]; 196 localBuffer.set(chars); 197 } 198 int outAt = 0; 199 200 int at = 0; 201 for (at = start; utf16Length > 0; utf16Length--) { 202 int v0 = bytes[at] & 0xFF; 203 char out; 204 switch (v0 >> 4) { 205 case 0x00: case 0x01: case 0x02: case 0x03: 206 case 0x04: case 0x05: case 0x06: case 0x07: { 207 // 0XXXXXXX -- single-byte encoding 208 if (v0 == 0) { 209 // A single zero byte is illegal. 210 return throwBadUtf8(v0, at); 211 } 212 out = (char) v0; 213 at++; 214 break; 215 } 216 case 0x0c: case 0x0d: { 217 // 110XXXXX -- two-byte encoding 218 int v1 = bytes[at + 1] & 0xFF; 219 if ((v1 & 0xc0) != 0x80) { 220 return throwBadUtf8(v1, at + 1); 221 } 222 int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f); 223 if ((value != 0) && (value < 0x80)) { 224 /* 225 * This should have been represented with 226 * one-byte encoding. 227 */ 228 return throwBadUtf8(v1, at + 1); 229 } 230 out = (char) value; 231 at += 2; 232 break; 233 } 234 case 0x0e: { 235 // 1110XXXX -- three-byte encoding 236 int v1 = bytes[at + 1] & 0xFF; 237 if ((v1 & 0xc0) != 0x80) { 238 return throwBadUtf8(v1, at + 1); 239 } 240 int v2 = bytes[at + 2] & 0xFF; 241 if ((v2 & 0xc0) != 0x80) { 242 return throwBadUtf8(v2, at + 2); 243 } 244 int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) | 245 (v2 & 0x3f); 246 if (value < 0x800) { 247 /* 248 * This should have been represented with one- or 249 * two-byte encoding. 250 */ 251 return throwBadUtf8(v2, at + 2); 252 } 253 out = (char) value; 254 at += 3; 255 break; 256 } 257 default: { 258 // 10XXXXXX, 1111XXXX -- illegal 259 return throwBadUtf8(v0, at); 260 } 261 } 262 chars[outAt] = out; 263 outAt++; 264 } 265 266 if (readLength != null && readLength.length > 0) { 267 readLength[0] = at - start; 268 readLength[0] = at - start; 269 } 270 return new String(chars, 0, outAt); 271 } 272 273 /** 274 * Helper for {@link #utf8BytesToString}, which throws the right 275 * exception for a bogus utf-8 byte. 276 * 277 * @param value the byte value 278 * @param offset the file offset 279 * @return never 280 * @throws IllegalArgumentException always thrown 281 */ 282 private static String throwBadUtf8(int value, int offset) { 283 throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) + 284 " at offset " + Hex.u4(offset)); 285 } 286} 287