libcore/net/UriCodec.java

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */

package libcore.net;

import java.io.ByteArrayOutputStream;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

/**
 * Encodes and decodes "application/x-www-form-urlencoded" content.
 *
 * Subclasses define {@link #isRetained(char)}, which decides which chars need to be escaped and
 * which don't. Output is encoded as UTF-8 by default, i.e. each run of characters that must be
 * escaped (including surrogate pairs) is converted to its equivalent UTF-8 encoded byte sequence,
 * and every resulting byte is then written in its escaped form: a '%' followed by two upper-case
 * hex digits. E.g. U+20AC is written as "%E2%82%AC".
 */
public abstract class UriCodec {
    /**
     * Character to be output when there's an error decoding an input.
     */
    private static final char INVALID_INPUT_CHARACTER = '\ufffd';

    /**
     * Returns true iff {@code c} does not need to be escaped.
     * 'a' - 'z', 'A' - 'Z' and '0' - '9' are always considered valid (i.e. don't need to be
     * escaped) regardless of what this method returns. That set is referred to as the
     * "whitelist".
     */
    protected abstract boolean isRetained(char c);

    /** Returns true iff {@code c} is an ASCII letter or an ASCII digit. */
    private static boolean isWhitelisted(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9');
    }

    /** Returns true iff {@code c} may appear unescaped according to this codec. */
    private boolean isWhitelistedOrRetained(char c) {
        return isWhitelisted(c) || isRetained(c);
    }

    /**
     * Throws {@link URISyntaxException} if any of the characters in the range [start, end) are
     * not valid according to this codec.
     *  - A char that is in the whitelist or retained is valid.
     *  - A '%' is valid only when it is followed, inside the range, by two hex digits.
     *  - Every other char is invalid.
     *
     * @param uri the string containing the range to check.
     * @param start index of the first char to check (inclusive).
     * @param end index after the last char to check (exclusive).
     * @param name not used, except to generate debugging info; may be null.
     * @return the substring [start, end) on success.
     * @throws URISyntaxException on the first invalid char, or if an escape sequence is cut short
     *         by {@code end}.
     */
    public final String validate(String uri, int start, int end, String name)
            throws URISyntaxException {
        int i = start;
        while (i < end) {
            char c = uri.charAt(i++);
            if (isWhitelistedOrRetained(c)) {
                continue;
            }
            // c is either '%' or a character not allowed in a uri.
            if (c != '%') {
                throw unexpectedCharacterException(uri, name, c, i - 1);
            }
            // Expect two characters representing a number in hex.
            for (int j = 0; j < 2; j++) {
                c = getNextCharacter(uri, i++, end, name);
                if (hexCharToValue(c) < 0) {
                    throw unexpectedCharacterException(uri, name, c, i - 1);
                }
            }
        }
        return uri.substring(start, end);
    }

    /**
     * Interprets a char as a hex digit, returning a number from 0 to 15 ('f' or 'F'), or -1 if
     * the char is not a hex digit.
     */
    private static int hexCharToValue(char c) {
        if ('0' <= c && c <= '9') {
            return c - '0';
        }
        if ('a' <= c && c <= 'f') {
            return 10 + c - 'a';
        }
        if ('A' <= c && c <= 'F') {
            return 10 + c - 'A';
        }
        return -1;
    }

    /**
     * Returns the fragment appended to error messages to identify the part being processed, or
     * the empty string when no name was supplied.
     */
    private static String nameSuffix(String name) {
        return (name == null) ? "" : " in [" + name + "]";
    }

    /** Builds (but does not throw) the exception reporting {@code unexpected} at {@code index}. */
    private static URISyntaxException unexpectedCharacterException(
            String uri, String name, char unexpected, int index) {
        return new URISyntaxException(
                uri, "Unexpected character" + nameSuffix(name) + ": " + unexpected, index);
    }

    /**
     * Returns {@code uri.charAt(index)}.
     *
     * @throws URISyntaxException if {@code index} is not below {@code end}.
     */
    private static char getNextCharacter(String uri, int index, int end, String name)
            throws URISyntaxException {
        if (index >= end) {
            throw new URISyntaxException(
                    uri, "Unexpected end of string" + nameSuffix(name), index);
        }
        return uri.charAt(index);
    }

    /**
     * Throws {@link URISyntaxException} if any character in {@code uri} is neither whitelisted nor
     * in {@code legal}.
     */
    public static void validateSimple(String uri, String legal) throws URISyntaxException {
        for (int i = 0; i < uri.length(); i++) {
            char c = uri.charAt(i);
            if (!isWhitelisted(c) && legal.indexOf(c) < 0) {
                throw unexpectedCharacterException(uri, null /* name */, c, i);
            }
        }
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment),
     * using {@code charset} to turn escaped chars into bytes.
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final String encode(String s, Charset charset) {
        StringBuilder builder = new StringBuilder(s.length());
        appendEncoded(builder, s, charset, false);
        return builder.toString();
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment).
     *
     * Encoded output is appended to {@code builder}. This uses the default output encoding (UTF-8).
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final void appendEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, StandardCharsets.UTF_8, false);
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment).
     *
     * Encoded output is appended to {@code builder}. This uses the default output encoding (UTF-8).
     * This method produces partially encoded output. What this means is that every '%' in the
     * input is passed through unmodified, so already encoded octets are not double escaped.
     * Consider an encoder operating on the global whitelist dealing with the string "foo%25bar".
     * With this method the output will be "foo%25bar", but with {@code appendEncoded} it will be
     * double encoded into "foo%2525bar".
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final void appendPartiallyEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, StandardCharsets.UTF_8, true);
    }

    /**
     * Shared implementation of the encoding entry points.
     *
     * Chars that have to be escaped are queued in a buffer so that consecutive ones (e.g. both
     * halves of a surrogate pair) are handed to the charset encoder together. The queue is
     * flushed right before anything is appended verbatim, which keeps the output in input order.
     *
     * @param partiallyEncoded if true, '%' is copied through instead of being escaped.
     */
    private void appendEncoded(
            StringBuilder builder, String s, Charset charset, boolean partiallyEncoded) {
        CharsetEncoder encoder = charset.newEncoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        CharBuffer cBuffer = CharBuffer.allocate(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '%' && partiallyEncoded) {
                // In case there are characters waiting to be encoded.
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append('%');
                continue;
            }

            // A retained space is not copied verbatim: it is written as '+'.
            if (c == ' ' && isRetained(' ')) {
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append('+');
                continue;
            }

            if (isWhitelistedOrRetained(c)) {
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append(c);
                continue;
            }

            // Put the character in the queue for encoding.
            cBuffer.put(c);
        }
        flushEncodingCharBuffer(builder, encoder, cBuffer);
    }

    /**
     * Encodes the chars queued in {@code cBuffer}, appends every resulting byte to
     * {@code builder} as '%' plus two upper-case hex digits, and leaves both {@code encoder} and
     * {@code cBuffer} ready for the next run of chars. Does nothing if the queue is empty.
     *
     * @throws IllegalArgumentException if encoding or flushing the encoder does not complete
     *         cleanly.
     */
    private static void flushEncodingCharBuffer(
            StringBuilder builder,
            CharsetEncoder encoder,
            CharBuffer cBuffer) {
        if (cBuffer.position() == 0) {
            return;
        }
        // We are reading from the buffer now.
        cBuffer.flip();
        ByteBuffer byteBuffer = ByteBuffer.allocate(
                cBuffer.remaining() * (int) Math.ceil(encoder.maxBytesPerChar()));
        CoderResult result = encoder.encode(cBuffer, byteBuffer, true /* endOfInput */);
        // According to the {@code CharsetEncoder#encode} spec, the method returns underflow
        // and leaves no unread input when all chars were processed correctly.
        if (!result.isUnderflow()) {
            throw new IllegalArgumentException(
                    "Error encoding, unexpected result ["
                            + result.toString()
                            + "] using encoder for ["
                            + encoder.charset().name()
                            + "]");
        }
        if (cBuffer.hasRemaining()) {
            throw new IllegalArgumentException(
                    "Encoder for [" + encoder.charset().name() + "] failed with underflow with "
                            + "remaining input [" + cBuffer + "]");
        }
        // Need to flush in case the encoder saves internal state. The outcome of the flush itself
        // has to be checked: it may report that the pending bytes did not fit, in which case the
        // output collected so far would be incomplete.
        result = encoder.flush(byteBuffer);
        if (!result.isUnderflow()) {
            throw new IllegalArgumentException(
                    "Error encoding, unexpected result ["
                            + result.toString()
                            + "] flushing encoder for ["
                            + encoder.charset().name()
                            + "]");
        }
        encoder.reset();

        byteBuffer.flip();
        // Write the encoded bytes.
        while (byteBuffer.hasRemaining()) {
            byte b = byteBuffer.get();
            builder.append('%');
            builder.append(intToHexDigit((b & 0xf0) >>> 4));
            builder.append(intToHexDigit(b & 0x0f));
        }
        // Use the character buffer to write again.
        cBuffer.clear();
    }

    /** Returns the upper-case hex digit for {@code b}, which is expected to be in [0, 15]. */
    private static char intToHexDigit(int b) {
        if (b < 10) {
            return (char) ('0' + b);
        } else {
            return (char) ('A' + b - 10);
        }
    }

    /**
     * Decodes a string according to the rules of this decoder.
     *
     * - if {@code convertPlus == true} all '+' chars in the input are converted to ' '
     *   (white space) in the output.
     * - if {@code throwOnFailure == true}, an {@link IllegalArgumentException} is thrown for
     *   truncated or non-hex escape sequences and for bytes the charset decoder reports as
     *   unmappable. Else, U+FFFD is emitted to the output in their place.
     * - byte sequences that are malformed for {@code charset} are always replaced by U+FFFD,
     *   whatever the value of {@code throwOnFailure}.
     */
    public static String decode(
            String s, boolean convertPlus, Charset charset, boolean throwOnFailure) {
        StringBuilder builder = new StringBuilder(s.length());
        appendDecoded(builder, s, convertPlus, charset, throwOnFailure);
        return builder.toString();
    }

    /**
     * Decodes {@code s} into {@code builder}; see
     * {@link #decode(String, boolean, Charset, boolean)} for the contract.
     *
     * Consecutive escaped octets are accumulated and decoded together, so that multi-byte
     * sequences are seen as a whole by the charset decoder.
     */
    private static void appendDecoded(
            StringBuilder builder,
            String s,
            boolean convertPlus,
            Charset charset,
            boolean throwOnFailure) {
        CharsetDecoder decoder = charset.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .replaceWith(String.valueOf(INVALID_INPUT_CHARACTER))
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        // Holds the bytes corresponding to the escaped chars being read (empty if the last char
        // wasn't an escaped char).
        ByteBuffer byteBuffer = ByteBuffer.allocate(s.length());
        int i = 0;
        while (i < s.length()) {
            char c = s.charAt(i);
            i++;
            switch (c) {
                case '+':
                    flushDecodingByteAccumulator(
                            builder, decoder, byteBuffer, throwOnFailure);
                    builder.append(convertPlus ? ' ' : '+');
                    break;
                case '%':
                    // Expect two characters representing a number in hex.
                    byte hexValue = 0;
                    for (int j = 0; j < 2; j++) {
                        try {
                            c = getNextCharacter(s, i, s.length(), null /* name */);
                        } catch (URISyntaxException e) {
                            // Unexpected end of input.
                            if (throwOnFailure) {
                                throw new IllegalArgumentException(e);
                            } else {
                                flushDecodingByteAccumulator(
                                        builder, decoder, byteBuffer, throwOnFailure);
                                builder.append(INVALID_INPUT_CHARACTER);
                                return;
                            }
                        }
                        i++;
                        int newDigit = hexCharToValue(c);
                        if (newDigit < 0) {
                            if (throwOnFailure) {
                                throw new IllegalArgumentException(
                                        unexpectedCharacterException(s, null /* name */, c, i - 1));
                            } else {
                                flushDecodingByteAccumulator(
                                        builder, decoder, byteBuffer, throwOnFailure);
                                builder.append(INVALID_INPUT_CHARACTER);
                                // NOTE(review): this only leaves the digit loop, so the value
                                // accumulated so far (0 if the first digit was the bad one) is
                                // still added to the byte buffer below and decoded later, e.g.
                                // "%p" yields U+FFFD followed by U+0000. This looks unintended,
                                // but it is observable output and is therefore preserved here;
                                // confirm with callers before changing it.
                                break;
                            }
                        }
                        hexValue = (byte) (hexValue * 0x10 + newDigit);
                    }
                    byteBuffer.put(hexValue);
                    break;
                default:
                    flushDecodingByteAccumulator(builder, decoder, byteBuffer, throwOnFailure);
                    builder.append(c);
            }
        }
        flushDecodingByteAccumulator(builder, decoder, byteBuffer, throwOnFailure);
    }

    /**
     * Decodes the bytes accumulated in {@code byteBuffer}, appends the resulting chars to
     * {@code builder} and empties the buffer. Does nothing if no bytes are pending.
     *
     * If the decoder reports an error, an {@link IllegalArgumentException} is thrown when
     * {@code throwOnFailure} is true; otherwise a single U+FFFD is appended instead.
     */
    private static void flushDecodingByteAccumulator(
            StringBuilder builder,
            CharsetDecoder decoder,
            ByteBuffer byteBuffer,
            boolean throwOnFailure) {
        if (byteBuffer.position() == 0) {
            return;
        }
        byteBuffer.flip();
        try {
            builder.append(decoder.decode(byteBuffer));
        } catch (CharacterCodingException e) {
            if (throwOnFailure) {
                throw new IllegalArgumentException(e);
            } else {
                builder.append(INVALID_INPUT_CHARACTER);
            }
        } finally {
            // Use the byte buffer to write again.
            byteBuffer.clear();
        }
    }

    /**
     * Equivalent to {@code decode(s, false, UTF_8, true)}
     */
    public static String decode(String s) {
        return decode(
                s, false /* convertPlus */, StandardCharsets.UTF_8, true /* throwOnFailure */);
    }
}