1/*
2 *  Licensed to the Apache Software Foundation (ASF) under one or more
3 *  contributor license agreements.  See the NOTICE file distributed with
4 *  this work for additional information regarding copyright ownership.
5 *  The ASF licenses this file to You under the Apache License, Version 2.0
6 *  (the "License"); you may not use this file except in compliance with
7 *  the License.  You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *  Unless required by applicable law or agreed to in writing, software
12 *  distributed under the License is distributed on an "AS IS" BASIS,
13 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *  See the License for the specific language governing permissions and
15 *  limitations under the License.
16 */
17
18package java.lang;
19
20import dalvik.annotation.optimization.FastNative;
21import java.io.Serializable;
22import java.io.UnsupportedEncodingException;
23import java.nio.ByteBuffer;
24import java.nio.CharBuffer;
25import java.nio.charset.Charset;
26import java.util.Arrays;
27import java.util.Comparator;
28import libcore.util.CharsetUtils;
29import libcore.util.EmptyArray;
30
31/**
 * Class used to generate strings instead of calling {@code String.<init>}.
33 *
34 * @hide
35 */
36public final class StringFactory {
37
    // TODO: Remove once native methods are in place.
    // U+FFFD REPLACEMENT CHARACTER. The inlined UTF-8 decoder in this class writes it to the
    // output wherever the input bytes are ill-formed.
    private static final char REPLACEMENT_CHAR = (char) 0xfffd;
40
41    public static String newEmptyString() {
42        return newStringFromChars(EmptyArray.CHAR, 0, 0);
43    }
44
45    public static String newStringFromBytes(byte[] data) {
46        return newStringFromBytes(data, 0, data.length);
47    }
48
49    public static String newStringFromBytes(byte[] data, int high) {
50        return newStringFromBytes(data, high, 0, data.length);
51    }
52
53    public static String newStringFromBytes(byte[] data, int offset, int byteCount) {
54        return newStringFromBytes(data, offset, byteCount, Charset.defaultCharset());
55    }
56
    /**
     * Native factory that builds a string from {@code byteCount} bytes of {@code data}
     * starting at {@code offset}, combined with the {@code high} value.
     *
     * NOTE(review): the implementation lives in native code that is not visible here.
     * The signature suggests it mirrors the deprecated
     * {@code String(byte[] ascii, int hibyte, int offset, int count)} constructor, with
     * {@code high} supplying the upper byte of every char. Confirm this, and the
     * argument validation it performs, against the native source.
     */
    @FastNative
    public static native String newStringFromBytes(byte[] data, int high, int offset, int byteCount);
59
60    public static String newStringFromBytes(byte[] data, int offset, int byteCount, String charsetName) throws UnsupportedEncodingException {
61        return newStringFromBytes(data, offset, byteCount, Charset.forNameUEE(charsetName));
62    }
63
64    public static String newStringFromBytes(byte[] data, String charsetName) throws UnsupportedEncodingException {
65        return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName));
66    }
67
    // Number of continuation bytes that must follow a UTF-8 lead byte in the range
    // 0xc0 - 0xff. The decoder indexes this table with the lead byte's low six bits
    // (b & 0x3f). An entry of 0 marks a byte that can never start a well-formed sequence
    // (0xc0, 0xc1 and 0xf5 - 0xff); the decoder emits U+FFFD for such a byte.
    private static final int[] TABLE_UTF8_NEEDED = new int[] {
    //      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
            3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
    };
75
76    // TODO: Implement this method natively.
77    public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) {
78        if ((offset | byteCount) < 0 || byteCount > data.length - offset) {
79            throw new StringIndexOutOfBoundsException(data.length, offset, byteCount);
80        }
81
82        char[] value;
83        int length;
84
85        // We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed.
86        String canonicalCharsetName = charset.name();
87        if (canonicalCharsetName.equals("UTF-8")) {
88            /*
89            This code converts a UTF-8 byte sequence to a Java String (UTF-16).
90            It implements the W3C recommended UTF-8 decoder.
91            https://www.w3.org/TR/encoding/#utf-8-decoder
92
93            Unicode 3.2 Well-Formed UTF-8 Byte Sequences
94            Code Points        First  Second Third Fourth
95            U+0000..U+007F     00..7F
96            U+0080..U+07FF     C2..DF 80..BF
97            U+0800..U+0FFF     E0     A0..BF 80..BF
98            U+1000..U+CFFF     E1..EC 80..BF 80..BF
99            U+D000..U+D7FF     ED     80..9F 80..BF
100            U+E000..U+FFFF     EE..EF 80..BF 80..BF
101            U+10000..U+3FFFF   F0     90..BF 80..BF 80..BF
102            U+40000..U+FFFFF   F1..F3 80..BF 80..BF 80..BF
103            U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
104
105            Please refer to Unicode as the authority.
106            p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
107
108            Handling Malformed Input
109            The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is
110            the longest code unit subsequence starting at an unconvertible offset that is either
111            1) the initial subsequence of a well-formed code unit sequence, or
112            2) a subsequence of length one:
113            One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix
114            of a valid sequence, and with the conversion to restart after the incomplete sequence.
115
116            For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are
117            "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80",
118            but "C0" can't be the initial subsequence of any well-formed code unit sequence.
119            Thus, the output should be "A\ufffd\ufffdA\ufffdA".
120
121            Please refer to section "Best Practices for Using U+FFFD." in
122            http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
123            */
124            byte[] d = data;
125            char[] v = new char[byteCount];
126
127            int idx = offset;
128            int last = offset + byteCount;
129            int s = 0;
130
131            int codePoint = 0;
132            int utf8BytesSeen = 0;
133            int utf8BytesNeeded = 0;
134            int lowerBound = 0x80;
135            int upperBound = 0xbf;
136
137            while (idx < last) {
138                int b = d[idx++] & 0xff;
139                if (utf8BytesNeeded == 0) {
140                    if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
141                        v[s++] = (char) b;
142                        continue;
143                    }
144
145                    if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
146                        v[s++] = REPLACEMENT_CHAR;
147                        continue;
148                    }
149
150                    // 11xxxxxx
151                    int tableLookupIndex = b & 0x3f;
152                    utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex];
153                    if (utf8BytesNeeded == 0) {
154                        v[s++] = REPLACEMENT_CHAR;
155                        continue;
156                    }
157
158                    // utf8BytesNeeded
159                    // 1: b & 0x1f
160                    // 2: b & 0x0f
161                    // 3: b & 0x07
162                    codePoint = b & (0x3f >> utf8BytesNeeded);
163                    if (b == 0xe0) {
164                        lowerBound = 0xa0;
165                    } else if (b == 0xed) {
166                        upperBound = 0x9f;
167                    } else if (b == 0xf0) {
168                        lowerBound = 0x90;
169                    } else if (b == 0xf4) {
170                        upperBound = 0x8f;
171                    }
172                } else {
173                    if (b < lowerBound || b > upperBound) {
174                        // The bytes seen are ill-formed. Substitute them with U+FFFD
175                        v[s++] = REPLACEMENT_CHAR;
176                        codePoint = 0;
177                        utf8BytesNeeded = 0;
178                        utf8BytesSeen = 0;
179                        lowerBound = 0x80;
180                        upperBound = 0xbf;
181                        /*
182                         * According to the Unicode Standard,
183                         * "a UTF-8 conversion process is required to never consume well-formed
184                         * subsequences as part of its error handling for ill-formed subsequences"
185                         * The current byte could be part of well-formed subsequences. Reduce the
186                         * index by 1 to parse it in next loop.
187                         */
188                        idx--;
189                        continue;
190                    }
191
192                    lowerBound = 0x80;
193                    upperBound = 0xbf;
194                    codePoint = (codePoint << 6) | (b & 0x3f);
195                    utf8BytesSeen++;
196                    if (utf8BytesNeeded != utf8BytesSeen) {
197                        continue;
198                    }
199
200                    // Encode chars from U+10000 up as surrogate pairs
201                    if (codePoint < 0x10000) {
202                        v[s++] = (char) codePoint;
203                    } else {
204                        v[s++] = (char) ((codePoint >> 10) + 0xd7c0);
205                        v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00);
206                    }
207
208                    utf8BytesSeen = 0;
209                    utf8BytesNeeded = 0;
210                    codePoint = 0;
211                }
212            }
213
214            // The bytes seen are ill-formed. Substitute them by U+FFFD
215            if (utf8BytesNeeded != 0) {
216                v[s++] = REPLACEMENT_CHAR;
217            }
218
219            if (s == byteCount) {
220                // We guessed right, so we can use our temporary array as-is.
221                value = v;
222                length = s;
223            } else {
224                // Our temporary array was too big, so reallocate and copy.
225                value = new char[s];
226                length = s;
227                System.arraycopy(v, 0, value, 0, s);
228            }
229        } else if (canonicalCharsetName.equals("ISO-8859-1")) {
230            value = new char[byteCount];
231            length = byteCount;
232            CharsetUtils.isoLatin1BytesToChars(data, offset, byteCount, value);
233        } else if (canonicalCharsetName.equals("US-ASCII")) {
234            value = new char[byteCount];
235            length = byteCount;
236            CharsetUtils.asciiBytesToChars(data, offset, byteCount, value);
237        } else {
238            CharBuffer cb = charset.decode(ByteBuffer.wrap(data, offset, byteCount));
239            length = cb.length();
240            if (length > 0) {
241                // We could use cb.array() directly, but that would mean we'd have to trust
242                // the CharsetDecoder doesn't hang on to the CharBuffer and mutate it later,
243                // which would break String's immutability guarantee. It would also tend to
244                // mean that we'd be wasting memory because CharsetDecoder doesn't trim the
245                // array. So we copy.
246                value = new char[length];
247                System.arraycopy(cb.array(), 0, value, 0, length);
248            } else {
249                value = EmptyArray.CHAR;
250            }
251        }
252        return newStringFromChars(value, 0, length);
253    }
254
255    public static String newStringFromBytes(byte[] data, Charset charset) {
256        return newStringFromBytes(data, 0, data.length, charset);
257    }
258
259    public static String newStringFromChars(char[] data) {
260        return newStringFromChars(data, 0, data.length);
261    }
262
263    public static String newStringFromChars(char[] data, int offset, int charCount) {
264        if ((offset | charCount) < 0 || charCount > data.length - offset) {
265            throw new StringIndexOutOfBoundsException(data.length, offset, charCount);
266        }
267        return newStringFromChars(offset, charCount, data);
268    }
269
    // Native string factory. Note that its argument order (offset, charCount, data) differs
    // from the public newStringFromChars(char[], int, int) overload, which validates the
    // range and then delegates here.
    // The char array passed as {@code data} must not be a null reference.
    @FastNative
    static native String newStringFromChars(int offset, int charCount, char[] data);
273
    /**
     * Native factory that creates a string from the existing string {@code toCopy}.
     *
     * NOTE(review): the implementation is native and not visible here. Presumably the
     * result is a new string with the same contents as {@code toCopy}. Confirm this, and
     * how a {@code null} argument is handled, against the native source.
     */
    @FastNative
    public static native String newStringFromString(String toCopy);
276
277    public static String newStringFromStringBuffer(StringBuffer stringBuffer) {
278        synchronized (stringBuffer) {
279            return newStringFromChars(stringBuffer.getValue(), 0, stringBuffer.length());
280        }
281    }
282
283    // TODO: Implement this method natively.
284    public static String newStringFromCodePoints(int[] codePoints, int offset, int count) {
285        if (codePoints == null) {
286            throw new NullPointerException("codePoints == null");
287        }
288        if ((offset | count) < 0 || count > codePoints.length - offset) {
289            throw new StringIndexOutOfBoundsException(codePoints.length, offset, count);
290        }
291        char[] value = new char[count * 2];
292        int end = offset + count;
293        int length = 0;
294        for (int i = offset; i < end; i++) {
295            length += Character.toChars(codePoints[i], value, length);
296        }
297        return newStringFromChars(value, 0, length);
298    }
299
300    public static String newStringFromStringBuilder(StringBuilder stringBuilder) {
301        return newStringFromChars(stringBuilder.getValue(), 0, stringBuilder.length());
302    }
303}
304