1/* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18package java.lang; 19 20import dalvik.annotation.optimization.FastNative; 21import java.io.Serializable; 22import java.io.UnsupportedEncodingException; 23import java.nio.ByteBuffer; 24import java.nio.CharBuffer; 25import java.nio.charset.Charset; 26import java.util.Arrays; 27import java.util.Comparator; 28import libcore.util.CharsetUtils; 29import libcore.util.EmptyArray; 30 31/** 32 * Class used to generate strings instead of calling String.<init>. 33 * 34 * @hide 35 */ 36public final class StringFactory { 37 38 // TODO: Remove once native methods are in place. 39 private static final char REPLACEMENT_CHAR = (char) 0xfffd; 40 41 public static String newEmptyString() { 42 return newStringFromChars(EmptyArray.CHAR, 0, 0); 43 } 44 45 public static String newStringFromBytes(byte[] data) { 46 return newStringFromBytes(data, 0, data.length); 47 } 48 49 public static String newStringFromBytes(byte[] data, int high) { 50 return newStringFromBytes(data, high, 0, data.length); 51 } 52 53 public static String newStringFromBytes(byte[] data, int offset, int byteCount) { 54 return newStringFromBytes(data, offset, byteCount, Charset.defaultCharset()); 55 } 56 57 @FastNative 58 public static native String newStringFromBytes(byte[] data, int high, int offset, int byteCount); 59 60 public static String newStringFromBytes(byte[] data, int offset, int byteCount, String charsetName) throws UnsupportedEncodingException { 61 return newStringFromBytes(data, offset, byteCount, Charset.forNameUEE(charsetName)); 62 } 63 64 public static String newStringFromBytes(byte[] data, String charsetName) throws UnsupportedEncodingException { 65 return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName)); 66 } 67 68 private static final int[] TABLE_UTF8_NEEDED = new int[] { 69 // 0 1 2 3 4 5 6 7 8 9 a b c d e f 70 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf 71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf 72 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef 73 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff 74 }; 75 76 // TODO: Implement this method natively. 77 public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) { 78 if ((offset | byteCount) < 0 || byteCount > data.length - offset) { 79 throw new StringIndexOutOfBoundsException(data.length, offset, byteCount); 80 } 81 82 char[] value; 83 int length; 84 85 // We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed. 86 String canonicalCharsetName = charset.name(); 87 if (canonicalCharsetName.equals("UTF-8")) { 88 /* 89 This code converts a UTF-8 byte sequence to a Java String (UTF-16). 90 It implements the W3C recommended UTF-8 decoder. 91 https://www.w3.org/TR/encoding/#utf-8-decoder 92 93 Unicode 3.2 Well-Formed UTF-8 Byte Sequences 94 Code Points First Second Third Fourth 95 U+0000..U+007F 00..7F 96 U+0080..U+07FF C2..DF 80..BF 97 U+0800..U+0FFF E0 A0..BF 80..BF 98 U+1000..U+CFFF E1..EC 80..BF 80..BF 99 U+D000..U+D7FF ED 80..9F 80..BF 100 U+E000..U+FFFF EE..EF 80..BF 80..BF 101 U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 102 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 103 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 104 105 Please refer to Unicode as the authority. 106 p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf 107 108 Handling Malformed Input 109 The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is 110 the longest code unit subsequence starting at an unconvertible offset that is either 111 1) the initial subsequence of a well-formed code unit sequence, or 112 2) a subsequence of length one: 113 One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix 114 of a valid sequence, and with the conversion to restart after the incomplete sequence. 115 116 For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are 117 "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80", 118 but "C0" can't be the initial subsequence of any well-formed code unit sequence. 119 Thus, the output should be "A\ufffd\ufffdA\ufffdA". 120 121 Please refer to section "Best Practices for Using U+FFFD." in 122 http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf 123 */ 124 byte[] d = data; 125 char[] v = new char[byteCount]; 126 127 int idx = offset; 128 int last = offset + byteCount; 129 int s = 0; 130 131 int codePoint = 0; 132 int utf8BytesSeen = 0; 133 int utf8BytesNeeded = 0; 134 int lowerBound = 0x80; 135 int upperBound = 0xbf; 136 137 while (idx < last) { 138 int b = d[idx++] & 0xff; 139 if (utf8BytesNeeded == 0) { 140 if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx 141 v[s++] = (char) b; 142 continue; 143 } 144 145 if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte 146 v[s++] = REPLACEMENT_CHAR; 147 continue; 148 } 149 150 // 11xxxxxx 151 int tableLookupIndex = b & 0x3f; 152 utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex]; 153 if (utf8BytesNeeded == 0) { 154 v[s++] = REPLACEMENT_CHAR; 155 continue; 156 } 157 158 // utf8BytesNeeded 159 // 1: b & 0x1f 160 // 2: b & 0x0f 161 // 3: b & 0x07 162 codePoint = b & (0x3f >> utf8BytesNeeded); 163 if (b == 0xe0) { 164 lowerBound = 0xa0; 165 } else if (b == 0xed) { 166 upperBound = 0x9f; 167 } else if (b == 0xf0) { 168 lowerBound = 0x90; 169 } else if (b == 0xf4) { 170 upperBound = 0x8f; 171 } 172 } else { 173 if (b < lowerBound || b > upperBound) { 174 // The bytes seen are ill-formed. Substitute them with U+FFFD 175 v[s++] = REPLACEMENT_CHAR; 176 codePoint = 0; 177 utf8BytesNeeded = 0; 178 utf8BytesSeen = 0; 179 lowerBound = 0x80; 180 upperBound = 0xbf; 181 /* 182 * According to the Unicode Standard, 183 * "a UTF-8 conversion process is required to never consume well-formed 184 * subsequences as part of its error handling for ill-formed subsequences" 185 * The current byte could be part of well-formed subsequences. Reduce the 186 * index by 1 to parse it in next loop. 187 */ 188 idx--; 189 continue; 190 } 191 192 lowerBound = 0x80; 193 upperBound = 0xbf; 194 codePoint = (codePoint << 6) | (b & 0x3f); 195 utf8BytesSeen++; 196 if (utf8BytesNeeded != utf8BytesSeen) { 197 continue; 198 } 199 200 // Encode chars from U+10000 up as surrogate pairs 201 if (codePoint < 0x10000) { 202 v[s++] = (char) codePoint; 203 } else { 204 v[s++] = (char) ((codePoint >> 10) + 0xd7c0); 205 v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00); 206 } 207 208 utf8BytesSeen = 0; 209 utf8BytesNeeded = 0; 210 codePoint = 0; 211 } 212 } 213 214 // The bytes seen are ill-formed. Substitute them by U+FFFD 215 if (utf8BytesNeeded != 0) { 216 v[s++] = REPLACEMENT_CHAR; 217 } 218 219 if (s == byteCount) { 220 // We guessed right, so we can use our temporary array as-is. 221 value = v; 222 length = s; 223 } else { 224 // Our temporary array was too big, so reallocate and copy. 225 value = new char[s]; 226 length = s; 227 System.arraycopy(v, 0, value, 0, s); 228 } 229 } else if (canonicalCharsetName.equals("ISO-8859-1")) { 230 value = new char[byteCount]; 231 length = byteCount; 232 CharsetUtils.isoLatin1BytesToChars(data, offset, byteCount, value); 233 } else if (canonicalCharsetName.equals("US-ASCII")) { 234 value = new char[byteCount]; 235 length = byteCount; 236 CharsetUtils.asciiBytesToChars(data, offset, byteCount, value); 237 } else { 238 CharBuffer cb = charset.decode(ByteBuffer.wrap(data, offset, byteCount)); 239 length = cb.length(); 240 if (length > 0) { 241 // We could use cb.array() directly, but that would mean we'd have to trust 242 // the CharsetDecoder doesn't hang on to the CharBuffer and mutate it later, 243 // which would break String's immutability guarantee. It would also tend to 244 // mean that we'd be wasting memory because CharsetDecoder doesn't trim the 245 // array. So we copy. 246 value = new char[length]; 247 System.arraycopy(cb.array(), 0, value, 0, length); 248 } else { 249 value = EmptyArray.CHAR; 250 } 251 } 252 return newStringFromChars(value, 0, length); 253 } 254 255 public static String newStringFromBytes(byte[] data, Charset charset) { 256 return newStringFromBytes(data, 0, data.length, charset); 257 } 258 259 public static String newStringFromChars(char[] data) { 260 return newStringFromChars(data, 0, data.length); 261 } 262 263 public static String newStringFromChars(char[] data, int offset, int charCount) { 264 if ((offset | charCount) < 0 || charCount > data.length - offset) { 265 throw new StringIndexOutOfBoundsException(data.length, offset, charCount); 266 } 267 return newStringFromChars(offset, charCount, data); 268 } 269 270 // The char array passed as {@code java_data} must not be a null reference. 271 @FastNative 272 static native String newStringFromChars(int offset, int charCount, char[] data); 273 274 @FastNative 275 public static native String newStringFromString(String toCopy); 276 277 public static String newStringFromStringBuffer(StringBuffer stringBuffer) { 278 synchronized (stringBuffer) { 279 return newStringFromChars(stringBuffer.getValue(), 0, stringBuffer.length()); 280 } 281 } 282 283 // TODO: Implement this method natively. 284 public static String newStringFromCodePoints(int[] codePoints, int offset, int count) { 285 if (codePoints == null) { 286 throw new NullPointerException("codePoints == null"); 287 } 288 if ((offset | count) < 0 || count > codePoints.length - offset) { 289 throw new StringIndexOutOfBoundsException(codePoints.length, offset, count); 290 } 291 char[] value = new char[count * 2]; 292 int end = offset + count; 293 int length = 0; 294 for (int i = offset; i < end; i++) { 295 length += Character.toChars(codePoints[i], value, length); 296 } 297 return newStringFromChars(value, 0, length); 298 } 299 300 public static String newStringFromStringBuilder(StringBuilder stringBuilder) { 301 return newStringFromChars(stringBuilder.getValue(), 0, stringBuilder.length()); 302 } 303} 304