1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16
17package libcore.net;
18
19import java.io.ByteArrayOutputStream;
20import java.net.URISyntaxException;
21import java.nio.ByteBuffer;
22import java.nio.CharBuffer;
23import java.nio.charset.CharacterCodingException;
24import java.nio.charset.Charset;
25import java.nio.charset.CharsetDecoder;
26import java.nio.charset.CharsetEncoder;
27import java.nio.charset.CoderResult;
28import java.nio.charset.CodingErrorAction;
29import java.nio.charset.StandardCharsets;
30
/**
 * Encodes and decodes "application/x-www-form-urlencoded" content.
 *
 * Subclasses define {@code isRetained}, which decides which chars need to be escaped and which
 * don't. Output is encoded as UTF-8 by default, i.e. each character (or surrogate pair) is
 * converted to its equivalent UTF-8 encoded byte sequence, which is then converted to its
 * escaped form, e.g. a 4 byte sequence might look like "%c6%ef%e0%e8".
 */
public abstract class UriCodec {
    /**
     * Character to be output when there's an error decoding an input.
     */
    private static final char INVALID_INPUT_CHARACTER = '\ufffd';

    /**
     * Returns true iff. {@code c} does not need to be escaped.
     * 'a' - 'z', 'A' - 'Z' and '0' - '9' are always considered valid (i.e. they never need to be
     * escaped), regardless of what this method returns. This set is referred to as the
     * "whitelist".
     */
    protected abstract boolean isRetained(char c);

    /** Returns true iff. {@code c} is an ASCII letter or an ASCII digit. */
    private static boolean isWhitelisted(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9');
    }

    /** Returns true iff. {@code c} can be emitted without escaping by this codec. */
    private boolean isWhitelistedOrRetained(char c) {
        return isWhitelisted(c) || isRetained(c);
    }

    /**
     * Throws URISyntaxException if any of the characters in the range [start, end) are not valid
     * according to this codec.
     *  - If a char is in the whitelist or retained, it is valid both escaped and unescaped.
     *  - All escaped octets appearing in the input must be structurally valid hex, i.e. a '%'
     *    followed by two hex digits that both lie inside [start, end).
     *
     * @param uri the string to check
     * @param start index of the first char to check (inclusive)
     * @param end index after the last char to check (exclusive)
     * @param name not used, except to generate debugging info; may be null
     * @return the substring [start, end) on success
     * @throws URISyntaxException if an illegal character or a malformed/truncated escape is found
     */
    public final String validate(String uri, int start, int end, String name)
            throws URISyntaxException {
        int i = start;
        while (i < end) {
            char c = uri.charAt(i++);
            if (isWhitelistedOrRetained(c)) {
                continue;
            }
            // c is either '%' or character not allowed in a uri.
            if (c != '%') {
                throw unexpectedCharacterException(uri, name, c, i - 1);
            }
            // Expect two characters representing a number in hex.
            for (int j = 0; j < 2; j++) {
                c = getNextCharacter(uri, i++, end, name);
                if (hexCharToValue(c) < 0) {
                    throw unexpectedCharacterException(uri, name, c, i - 1);
                }
            }
        }
        return uri.substring(start, end);
    }

    /**
     * Interprets a char as a hex digit, returning a number from 0 to 15 ('f' or 'F'), or -1 if
     * {@code c} is not a hex digit.
     */
    private static int hexCharToValue(char c) {
        if ('0' <= c && c <= '9') {
            return c - '0';
        }
        if ('a' <= c && c <= 'f') {
            return 10 + c - 'a';
        }
        if ('A' <= c && c <= 'F') {
            return 10 + c - 'A';
        }
        return -1;
    }

    /** Builds the optional " in [name]" fragment shared by all error messages. */
    private static String locationSuffix(String name) {
        return (name == null) ? "" : " in [" + name + "]";
    }

    /** Creates (does not throw) the exception reporting an illegal char at {@code index}. */
    private static URISyntaxException unexpectedCharacterException(
            String uri, String name, char unexpected, int index) {
        return new URISyntaxException(
                uri, "Unexpected character" + locationSuffix(name) + ": " + unexpected, index);
    }

    /** Creates (does not throw) the exception reporting input that ended at {@code index}. */
    private static URISyntaxException unexpectedEndException(String uri, String name, int index) {
        return new URISyntaxException(
                uri, "Unexpected end of string" + locationSuffix(name), index);
    }

    /**
     * Returns the char at {@code index}, or throws URISyntaxException if {@code index} is not
     * below {@code end}.
     */
    private static char getNextCharacter(String uri, int index, int end, String name)
            throws URISyntaxException {
        if (index >= end) {
            throw unexpectedEndException(uri, name, index);
        }
        return uri.charAt(index);
    }

    /**
     * Throws {@link URISyntaxException} if any character in {@code uri} is neither whitelisted nor
     * in {@code legal}.
     */
    public static void validateSimple(String uri, String legal) throws URISyntaxException {
        for (int i = 0; i < uri.length(); i++) {
            char c = uri.charAt(i);
            if (!isWhitelisted(c) && legal.indexOf(c) < 0) {
                throw unexpectedCharacterException(uri, null /* name */, c, i);
            }
        }
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment),
     * converting escaped characters to bytes using {@code charset}.
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final String encode(String s, Charset charset) {
        StringBuilder builder = new StringBuilder(s.length());
        appendEncoded(builder, s, charset, false);
        return builder.toString();
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment).
     *
     * Encoded output is appended to {@code builder}. This uses the default output encoding (UTF-8).
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final void appendEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, StandardCharsets.UTF_8, false);
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment).
     *
     * Encoded output is appended to {@code builder}. This uses the default output encoding (UTF-8).
     * This method produces partially encoded output: every '%' in the input is copied through
     * unmodified, so octets that are already escaped are not escaped a second time. Consider an
     * encoder operating on the global whitelist dealing with the string "foo%25bar". With this
     * method, the output will be "foo%25bar", but with appendEncoded, it will be double encoded
     * into "foo%2525bar".
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final void appendPartiallyEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, StandardCharsets.UTF_8, true);
    }

    /**
     * Shared implementation of the encode methods. Chars that need escaping are queued so that
     * consecutive ones (e.g. both halves of a surrogate pair) are handed to the charset encoder
     * together; the queue is flushed whenever a char is emitted verbatim.
     */
    private void appendEncoded(
            StringBuilder builder, String s, Charset charset, boolean partiallyEncoded) {
        CharsetEncoder encoder = charset.newEncoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        CharBuffer cBuffer = CharBuffer.allocate(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '%' && partiallyEncoded) {
                // In case there are characters waiting to be encoded.
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append('%');
                continue;
            }

            // A retained space is written in its form-encoded shape.
            if (c == ' ' && isRetained(' ')) {
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append('+');
                continue;
            }

            if (isWhitelistedOrRetained(c)) {
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append(c);
                continue;
            }

            // Put the character in the queue for encoding.
            cBuffer.put(c);
        }
        flushEncodingCharBuffer(builder, encoder, cBuffer);
    }

    /**
     * Encodes every char queued in {@code cBuffer}, appends each resulting byte to
     * {@code builder} as "%XX" (upper-case hex) and leaves {@code cBuffer} empty and ready to be
     * written to again. Does nothing if the queue is empty.
     *
     * @throws IllegalArgumentException if encoding or flushing the encoder does not finish
     *         with an underflow result, or input is left unconsumed.
     */
    private static void flushEncodingCharBuffer(
            StringBuilder builder,
            CharsetEncoder encoder,
            CharBuffer cBuffer) {
        if (cBuffer.position() == 0) {
            return;
        }
        // We are reading from the buffer now.
        cBuffer.flip();
        ByteBuffer byteBuffer = ByteBuffer.allocate(
                cBuffer.remaining() * (int) Math.ceil(encoder.maxBytesPerChar()));
        CoderResult result = encoder.encode(cBuffer, byteBuffer, true /* endOfInput */);
        // According to the {@code CharsetEncoder#encode} spec, the method returns underflow
        // and leaves an empty input when all chars were processed correctly.
        if (!result.isUnderflow()) {
            throw new IllegalArgumentException(
                    "Error encoding, unexpected result ["
                            + result.toString()
                            + "] using encoder for ["
                            + encoder.charset().name()
                            + "]");
        }
        if (cBuffer.hasRemaining()) {
            throw new IllegalArgumentException(
                    "Encoder for [" + encoder.charset().name() + "] failed with underflow with "
                            + "remaining input [" + cBuffer + "]");
        }
        // Need to flush in case the encoder saves internal state. The result of the flush itself
        // must be inspected; re-checking the result of encode() would never detect a failure.
        result = encoder.flush(byteBuffer);
        if (!result.isUnderflow()) {
            throw new IllegalArgumentException(
                    "Error encoding, unexpected result ["
                            + result.toString()
                            + "] flushing encoder for ["
                            + encoder.charset().name()
                            + "]");
        }
        encoder.reset();

        byteBuffer.flip();
        // Write the encoded bytes.
        while (byteBuffer.hasRemaining()) {
            byte b = byteBuffer.get();
            builder.append('%');
            builder.append(intToHexDigit((b & 0xf0) >>> 4));
            builder.append(intToHexDigit(b & 0x0f));
        }
        // Use the character buffer to write again.
        cBuffer.clear();
    }

    /** Converts a value in [0, 15] to its upper-case hex digit. */
    private static char intToHexDigit(int b) {
        if (b < 10) {
            return (char) ('0' + b);
        } else {
            return (char) ('A' + b - 10);
        }
    }

    /**
     * Decode a string according to the rules of this decoder.
     *
     * - if {@code convertPlus == true} all '+' chars in the input are converted to ' '
     *   (white space)
     * - if {@code throwOnFailure == true}, an {@link IllegalArgumentException} is thrown for
     *   truncated or non-hex escapes and for bytes the charset reports as unmappable. Else,
     *   U+FFFD is emitted to the output in place of the invalid input.
     * - byte sequences that are malformed for {@code charset} are replaced by U+FFFD in both
     *   modes.
     */
    public static String decode(
            String s, boolean convertPlus, Charset charset, boolean throwOnFailure) {
        StringBuilder builder = new StringBuilder(s.length());
        appendDecoded(builder, s, convertPlus, charset, throwOnFailure);
        return builder.toString();
    }

    /**
     * Implementation of {@link #decode(String, boolean, Charset, boolean)}: appends the decoded
     * form of {@code s} to {@code builder}. Consecutive escaped octets are accumulated and decoded
     * together so that multi-byte sequences are handled as a unit.
     */
    private static void appendDecoded(
            StringBuilder builder,
            String s,
            boolean convertPlus,
            Charset charset,
            boolean throwOnFailure) {
        CharsetDecoder decoder = charset.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .replaceWith(String.valueOf(INVALID_INPUT_CHARACTER))
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        // Holds the bytes corresponding to the escaped chars being read (empty if the last char
        // wasn't a escaped char).
        ByteBuffer byteBuffer = ByteBuffer.allocate(s.length());
        int i = 0;
        while (i < s.length()) {
            char c = s.charAt(i);
            i++;
            switch (c) {
                case '+':
                    flushDecodingByteAccumulator(
                            builder, decoder, byteBuffer, throwOnFailure);
                    builder.append(convertPlus ? ' ' : '+');
                    break;
                case '%': {
                    // Expect two characters representing a number in hex.
                    byte hexValue = 0;
                    boolean validEscape = true;
                    for (int j = 0; j < 2; j++) {
                        if (i >= s.length()) {
                            // Unexpected end of input.
                            if (throwOnFailure) {
                                throw new IllegalArgumentException(
                                        unexpectedEndException(s, null /* name */, i));
                            }
                            flushDecodingByteAccumulator(
                                    builder, decoder, byteBuffer, throwOnFailure);
                            builder.append(INVALID_INPUT_CHARACTER);
                            return;
                        }
                        c = s.charAt(i);
                        i++;
                        int newDigit = hexCharToValue(c);
                        if (newDigit < 0) {
                            if (throwOnFailure) {
                                throw new IllegalArgumentException(
                                        unexpectedCharacterException(s, null /* name */, c, i - 1));
                            }
                            flushDecodingByteAccumulator(
                                    builder, decoder, byteBuffer, throwOnFailure);
                            builder.append(INVALID_INPUT_CHARACTER);
                            // The escape is abandoned (the offending char is consumed); remember
                            // that so the half-built byte is not queued below.
                            validEscape = false;
                            break;
                        }
                        hexValue = (byte) (hexValue * 0x10 + newDigit);
                    }
                    if (validEscape) {
                        byteBuffer.put(hexValue);
                    }
                    break;
                }
                default:
                    flushDecodingByteAccumulator(builder, decoder, byteBuffer, throwOnFailure);
                    builder.append(c);
            }
        }
        flushDecodingByteAccumulator(builder, decoder, byteBuffer, throwOnFailure);
    }

    /**
     * Decodes the bytes accumulated in {@code byteBuffer}, appends the result to {@code builder}
     * and leaves {@code byteBuffer} empty and ready to be written to again. Does nothing if no
     * bytes are pending. If decoding fails, either throws {@link IllegalArgumentException}
     * ({@code throwOnFailure == true}) or appends a single U+FFFD for the whole pending run.
     */
    private static void flushDecodingByteAccumulator(
            StringBuilder builder,
            CharsetDecoder decoder,
            ByteBuffer byteBuffer,
            boolean throwOnFailure) {
        if (byteBuffer.position() == 0) {
            return;
        }
        byteBuffer.flip();
        try {
            builder.append(decoder.decode(byteBuffer));
        } catch (CharacterCodingException e) {
            if (throwOnFailure) {
                throw new IllegalArgumentException(e);
            } else {
                builder.append(INVALID_INPUT_CHARACTER);
            }
        } finally {
            // Use the byte buffer to write again.
            byteBuffer.clear();
        }
    }

    /**
     * Equivalent to {@code decode(s, false, UTF_8, true)}
     */
    public static String decode(String s) {
        return decode(
                s, false /* convertPlus */, StandardCharsets.UTF_8, true /* throwOnFailure */);
    }
}