UriCodec.java revision 32559028b14b9b321b10eede050afd554a376569
156099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson/*
256099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  Licensed to the Apache Software Foundation (ASF) under one or more
356099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  contributor license agreements.  See the NOTICE file distributed with
456099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  this work for additional information regarding copyright ownership.
556099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  The ASF licenses this file to You under the Apache License, Version 2.0
656099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  (the "License"); you may not use this file except in compliance with
756099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  the License.  You may obtain a copy of the License at
856099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *
956099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *     http://www.apache.org/licenses/LICENSE-2.0
1056099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *
1156099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  Unless required by applicable law or agreed to in writing, software
1256099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  distributed under the License is distributed on an "AS IS" BASIS,
1356099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1456099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  See the License for the specific language governing permissions and
1556099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson *  limitations under the License.
1656099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson */
1756099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson
1856099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilsonpackage libcore.net;
1956099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson
2056099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilsonimport java.io.ByteArrayOutputStream;
2156099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilsonimport java.net.URISyntaxException;
2256099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilsonimport java.nio.charset.Charset;
2356099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilsonimport java.nio.charset.Charsets;
2456099d23fcb002b164bff8fb7f14d6ec0453509eJesse Wilson
/**
 * Encodes and decodes {@code application/x-www-form-urlencoded} content.
 * Subclasses define exactly which characters are legal.
 *
 * <p>ASCII letters and digits are always passed through unchanged. Every other
 * character is passed through only if {@link #isRetained} accepts it;
 * otherwise it is converted to bytes and written as a series of {@code %XX}
 * escapes using upper-case hex digits.
 *
 * <p>By default, UTF-8 is used to encode escaped characters. A single input
 * character like "\u0080" may be encoded to multiple octets like %C2%80.
 */
public abstract class UriCodec {

    /**
     * Default charset for escaping and unescaping. Resolved once; UTF-8 is one
     * of the charsets every Java platform is required to support.
     */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Digits used when writing {@code %XX} escapes. */
    private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Returns true if {@code c} does not need to be escaped.
     *
     * <p>This is only consulted for characters that are not ASCII letters or
     * digits. A retained space is written as {@code '+'} by the encoding
     * methods.
     */
    protected abstract boolean isRetained(char c);

    /**
     * Throws if {@code s} is invalid according to this encoder.
     *
     * <p>A string is valid if it consists solely of ASCII letters and digits,
     * characters accepted by {@link #isRetained}, and complete {@code %XX}
     * escapes whose two trailing characters are ASCII hex digits.
     *
     * @param s the string to check; must not be null.
     * @throws URISyntaxException if {@code s} contains an illegal character or
     *     an incomplete or malformed {@code %} sequence. The exception's index
     *     is the offset of the offending character.
     */
    public void validate(String s) throws URISyntaxException {
        for (int i = 0; i < s.length();) {
            char ch = s.charAt(i);
            if (isAsciiAlphanumeric(ch) || isRetained(ch)) {
                i++;
            } else if (ch == '%') {
                // A complete escape needs two more characters after the '%'.
                if (i + 2 >= s.length()) {
                    throw new URISyntaxException(s, "Incomplete % sequence", i);
                }
                int d1 = hexToInt(s.charAt(i + 1));
                int d2 = hexToInt(s.charAt(i + 2));
                if (d1 == -1 || d2 == -1) {
                    throw new URISyntaxException(s, "Invalid % sequence: " +
                            s.substring(i, i + 3), i);
                }
                i += 3;
            } else {
                throw new URISyntaxException(s, "Illegal character", i);
            }
        }
    }

    /**
     * Throws if {@code s} contains characters that are not letters, digits or
     * in {@code legal}.
     *
     * @param s the string to check; must not be null.
     * @param legal the additional characters to accept; must not be null.
     * @throws URISyntaxException if an unacceptable character is found. The
     *     exception's index is the offset of that character.
     */
    public static void validateSimple(String s, String legal) throws URISyntaxException {
        for (int i = 0; i < s.length(); i++) {
            char ch = s.charAt(i);
            if (!isAsciiAlphanumeric(ch) && legal.indexOf(ch) == -1) {
                throw new URISyntaxException(s, "Illegal character", i);
            }
        }
    }

    /**
     * Encodes {@code s} and appends the result to {@code builder}.
     *
     * @param builder receives the encoded output.
     * @param s the text to encode; must not be null.
     * @param charset converts characters that need escaping into bytes.
     * @param isPartiallyEncoded true to fix input that has already been
     *     partially or fully encoded. For example, input of "hello%20world" is
     *     unchanged with isPartiallyEncoded=true but would be double-escaped to
     *     "hello%2520world" otherwise. In this mode a {@code '%'} and the (up
     *     to) two characters following it are copied through verbatim; they
     *     are not checked for being hex digits.
     * @throws NullPointerException if {@code s} is null.
     */
    private void appendEncoded(StringBuilder builder, String s, Charset charset,
            boolean isPartiallyEncoded) {
        if (s == null) {
            throw new NullPointerException("s == null");
        }

        // Start of the current run of characters that must be escaped, or -1.
        // Runs are escaped as a whole so that multi-char sequences (such as
        // surrogate pairs) reach the charset encoder together.
        int escapeStart = -1;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            boolean isEscapeSequence = isPartiallyEncoded && c == '%';
            if (isEscapeSequence || isAsciiAlphanumeric(c) || isRetained(c)) {
                // Flush the pending run first so that an existing escape is
                // never swallowed into it and escaped a second time.
                if (escapeStart != -1) {
                    appendHex(builder, s.substring(escapeStart, i), charset);
                    escapeStart = -1;
                }
                if (isEscapeSequence) {
                    // This is a 3-character sequence like "%20"; copy it as-is.
                    // It may be shorter if the input ends mid-sequence.
                    int end = Math.min(i + 3, s.length());
                    builder.append(s, i, end);
                    i = end - 1;
                } else if (c == ' ') {
                    builder.append('+');
                } else {
                    builder.append(c);
                }
            } else if (escapeStart == -1) {
                escapeStart = i;
            }
        }
        if (escapeStart != -1) {
            appendHex(builder, s.substring(escapeStart, s.length()), charset);
        }
    }

    /**
     * Returns the encoded form of {@code s}. Existing {@code '%'} characters
     * are treated as literal text and escaped unless the subclass retains
     * them.
     *
     * @param s the text to encode; must not be null.
     * @param charset converts characters that need escaping into bytes.
     * @throws NullPointerException if {@code s} is null.
     */
    public String encode(String s, Charset charset) {
        // Guess a bit larger for encoded form
        StringBuilder builder = new StringBuilder(s.length() + 16);
        appendEncoded(builder, s, charset, false);
        return builder.toString();
    }

    /**
     * Encodes {@code s} using UTF-8 and appends the result to {@code builder}.
     * Existing {@code '%'} characters are treated as literal text.
     *
     * @throws NullPointerException if {@code s} is null.
     */
    public void appendEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, UTF_8, false);
    }

    /**
     * Encodes {@code s} using UTF-8 and appends the result to {@code builder},
     * leaving any {@code %XX} sequences already present in {@code s} as they
     * are.
     *
     * @throws NullPointerException if {@code s} is null.
     */
    public void appendPartiallyEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, UTF_8, true);
    }

    /**
     * Decodes the {@code %XX} escapes in {@code s}.
     *
     * <p>Adjacent escapes are collected and converted to characters together,
     * so a character that was encoded as several octets is restored as one.
     *
     * @param s the text to decode; must not be null.
     * @param convertPlus true to convert '+' to ' '.
     * @param charset converts the unescaped bytes back into characters.
     * @return the decoded text, or {@code s} itself if it contains nothing to
     *     decode.
     * @throws IllegalArgumentException if {@code s} contains an incomplete or
     *     malformed {@code %} sequence.
     */
    public static String decode(String s, boolean convertPlus, Charset charset) {
        if (s.indexOf('%') == -1 && (!convertPlus || s.indexOf('+') == -1)) {
            return s;
        }

        StringBuilder result = new StringBuilder(s.length());
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        for (int i = 0; i < s.length();) {
            char c = s.charAt(i);
            if (c == '%') {
                // Gather the whole run of consecutive escapes before decoding.
                do {
                    if (i + 2 >= s.length()) {
                        throw new IllegalArgumentException("Incomplete % sequence at: " + i);
                    }
                    int d1 = hexToInt(s.charAt(i + 1));
                    int d2 = hexToInt(s.charAt(i + 2));
                    if (d1 == -1 || d2 == -1) {
                        throw new IllegalArgumentException("Invalid % sequence " +
                                s.substring(i, i + 3) + " at " + i);
                    }
                    out.write((byte) ((d1 << 4) + d2));
                    i += 3;
                } while (i < s.length() && s.charAt(i) == '%');
                result.append(new String(out.toByteArray(), charset));
                out.reset();
            } else {
                if (convertPlus && c == '+') {
                    c = ' ';
                }
                result.append(c);
                i++;
            }
        }
        return result.toString();
    }

    /**
     * Like {@link Character#digit}, but without support for non-ASCII
     * characters.
     *
     * @return the value of the hex digit {@code c}, or -1 if {@code c} is not
     *     one of 0-9, a-f or A-F.
     */
    private static int hexToInt(char c) {
        if ('0' <= c && c <= '9') {
            return c - '0';
        } else if ('a' <= c && c <= 'f') {
            return 10 + (c - 'a');
        } else if ('A' <= c && c <= 'F') {
            return 10 + (c - 'A');
        } else {
            return -1;
        }
    }

    /**
     * Decodes the {@code %XX} escapes in {@code s} using UTF-8. A '+' is left
     * unchanged.
     *
     * @throws IllegalArgumentException if {@code s} contains an incomplete or
     *     malformed {@code %} sequence.
     */
    public static String decode(String s) {
        return decode(s, false, UTF_8);
    }

    /**
     * Returns true if {@code c} is one of a-z, A-Z or 0-9. These characters
     * are never escaped, whatever the subclass retains.
     */
    private static boolean isAsciiAlphanumeric(char c) {
        return (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9');
    }

    /**
     * Converts {@code s} to bytes with {@code charset} and appends each byte
     * to {@code builder} as a {@code %XX} escape.
     */
    private static void appendHex(StringBuilder builder, String s, Charset charset) {
        for (byte b : s.getBytes(charset)) {
            appendHex(builder, b);
        }
    }

    /**
     * Appends {@code b} to {@code sb} as {@code '%'} followed by exactly two
     * upper-case hex digits.
     */
    private static void appendHex(StringBuilder sb, byte b) {
        sb.append('%');
        sb.append(UPPER_HEX_DIGITS[(b >> 4) & 0xf]);
        sb.append(UPPER_HEX_DIGITS[b & 0xf]);
    }
}
209