1/*
2 * Copyright (c) 1998, 2007, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package sun.net.www;
27
28import java.util.BitSet;
29import java.io.UnsupportedEncodingException;
30import java.io.File;
31import java.net.URL;
32import java.net.MalformedURLException;
33import java.net.URI;
34import java.net.URISyntaxException;
35import java.nio.ByteBuffer;
36import java.nio.CharBuffer;
37import java.nio.charset.CharacterCodingException;
38import sun.nio.cs.ThreadLocalCoders;
39import java.nio.charset.CharsetDecoder;
40import java.nio.charset.CoderResult;
41import java.nio.charset.CodingErrorAction;
42
43/**
44 * A class that contains useful routines common to sun.net.www
45 * @author  Mike McCloskey
46 */
47
48public class ParseUtil {
49    static BitSet encodedInPath;
50
51    static {
52        encodedInPath = new BitSet(256);
53
54        // Set the bits corresponding to characters that are encoded in the
55        // path component of a URI.
56
57        // These characters are reserved in the path segment as described in
58        // RFC2396 section 3.3.
59        encodedInPath.set('=');
60        encodedInPath.set(';');
61        encodedInPath.set('?');
62        encodedInPath.set('/');
63
64        // These characters are defined as excluded in RFC2396 section 2.4.3
65        // and must be escaped if they occur in the data part of a URI.
66        encodedInPath.set('#');
67        encodedInPath.set(' ');
68        encodedInPath.set('<');
69        encodedInPath.set('>');
70        encodedInPath.set('%');
71        encodedInPath.set('"');
72        encodedInPath.set('{');
73        encodedInPath.set('}');
74        encodedInPath.set('|');
75        encodedInPath.set('\\');
76        encodedInPath.set('^');
77        encodedInPath.set('[');
78        encodedInPath.set(']');
79        encodedInPath.set('`');
80
81        // US ASCII control characters 00-1F and 7F.
82        for (int i=0; i<32; i++)
83            encodedInPath.set(i);
84        encodedInPath.set(127);
85    }
86
87    /**
88     * Constructs an encoded version of the specified path string suitable
89     * for use in the construction of a URL.
90     *
91     * A path separator is replaced by a forward slash. The string is UTF8
92     * encoded. The % escape sequence is used for characters that are above
93     * 0x7F or those defined in RFC2396 as reserved or excluded in the path
94     * component of a URL.
95     */
96    public static String encodePath(String path) {
97        return encodePath(path, true);
98    }
99    /*
100     * flag indicates whether path uses platform dependent
101     * File.separatorChar or not. True indicates path uses platform
102     * dependent File.separatorChar.
103     */
104    public static String encodePath(String path, boolean flag) {
105        char[] retCC = new char[path.length() * 2 + 16];
106        int    retLen = 0;
107        char[] pathCC = path.toCharArray();
108
109        int n = path.length();
110        for (int i=0; i<n; i++) {
111            char c = pathCC[i];
112            if ((!flag && c == '/') || (flag && c == File.separatorChar))
113                retCC[retLen++] = '/';
114            else {
115                if (c <= 0x007F) {
116                    if (c >= 'a' && c <= 'z' ||
117                        c >= 'A' && c <= 'Z' ||
118                        c >= '0' && c <= '9') {
119                        retCC[retLen++] = c;
120                    } else
121                    if (encodedInPath.get(c))
122                        retLen = escape(retCC, c, retLen);
123                    else
124                        retCC[retLen++] = c;
125                } else if (c > 0x07FF) {
126                    retLen = escape(retCC, (char)(0xE0 | ((c >> 12) & 0x0F)), retLen);
127                    retLen = escape(retCC, (char)(0x80 | ((c >>  6) & 0x3F)), retLen);
128                    retLen = escape(retCC, (char)(0x80 | ((c >>  0) & 0x3F)), retLen);
129                } else {
130                    retLen = escape(retCC, (char)(0xC0 | ((c >>  6) & 0x1F)), retLen);
131                    retLen = escape(retCC, (char)(0x80 | ((c >>  0) & 0x3F)), retLen);
132                }
133            }
134            //worst case scenario for character [0x7ff-] every single
135            //character will be encoded into 9 characters.
136            if (retLen + 9 > retCC.length) {
137                int newLen = retCC.length * 2 + 16;
138                if (newLen < 0) {
139                    newLen = Integer.MAX_VALUE;
140                }
141                char[] buf = new char[newLen];
142                System.arraycopy(retCC, 0, buf, 0, retLen);
143                retCC = buf;
144            }
145        }
146        return new String(retCC, 0, retLen);
147    }
148
149    /**
150     * Appends the URL escape sequence for the specified char to the
151     * specified StringBuffer.
152     */
153    private static int escape(char[] cc, char c, int index) {
154        cc[index++] = '%';
155        cc[index++] = Character.forDigit((c >> 4) & 0xF, 16);
156        cc[index++] = Character.forDigit(c & 0xF, 16);
157        return index;
158    }
159
160    /**
161     * Un-escape and return the character at position i in string s.
162     */
163    private static byte unescape(String s, int i) {
164        return (byte) Integer.parseInt(s.substring(i+1,i+3),16);
165    }
166
167
168    /**
169     * Returns a new String constructed from the specified String by replacing
170     * the URL escape sequences and UTF8 encoding with the characters they
171     * represent.
172     */
173    public static String decode(String s) {
174        int n = s.length();
175        if ((n == 0) || (s.indexOf('%') < 0))
176            return s;
177
178        StringBuilder sb = new StringBuilder(n);
179        ByteBuffer bb = ByteBuffer.allocate(n);
180        CharBuffer cb = CharBuffer.allocate(n);
181        CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
182            .onMalformedInput(CodingErrorAction.REPORT)
183            .onUnmappableCharacter(CodingErrorAction.REPORT);
184
185        char c = s.charAt(0);
186        for (int i = 0; i < n;) {
187            assert c == s.charAt(i);
188            if (c != '%') {
189                sb.append(c);
190                if (++i >= n)
191                    break;
192                c = s.charAt(i);
193                continue;
194            }
195            bb.clear();
196            int ui = i;
197            for (;;) {
198                assert (n - i >= 2);
199                try {
200                    bb.put(unescape(s, i));
201                } catch (NumberFormatException e) {
202                    throw new IllegalArgumentException();
203                }
204                i += 3;
205                if (i >= n)
206                    break;
207                c = s.charAt(i);
208                if (c != '%')
209                    break;
210            }
211            bb.flip();
212            cb.clear();
213            dec.reset();
214            CoderResult cr = dec.decode(bb, cb, true);
215            if (cr.isError())
216                throw new IllegalArgumentException("Error decoding percent encoded characters");
217            cr = dec.flush(cb);
218            if (cr.isError())
219                throw new IllegalArgumentException("Error decoding percent encoded characters");
220            sb.append(cb.flip().toString());
221        }
222
223        return sb.toString();
224    }
225
226    /**
227     * Returns a canonical version of the specified string.
228     */
229    public String canonizeString(String file) {
230        int i = 0;
231        int lim = file.length();
232
233        // Remove embedded /../
234        while ((i = file.indexOf("/../")) >= 0) {
235            if ((lim = file.lastIndexOf('/', i - 1)) >= 0) {
236                file = file.substring(0, lim) + file.substring(i + 3);
237            } else {
238                file = file.substring(i + 3);
239            }
240        }
241        // Remove embedded /./
242        while ((i = file.indexOf("/./")) >= 0) {
243            file = file.substring(0, i) + file.substring(i + 2);
244        }
245        // Remove trailing ..
246        while (file.endsWith("/..")) {
247            i = file.indexOf("/..");
248            if ((lim = file.lastIndexOf('/', i - 1)) >= 0) {
249                file = file.substring(0, lim+1);
250            } else {
251                file = file.substring(0, i);
252            }
253        }
254        // Remove trailing .
255        if (file.endsWith("/."))
256            file = file.substring(0, file.length() -1);
257
258        return file;
259    }
260
261    public static URL fileToEncodedURL(File file)
262        throws MalformedURLException
263    {
264        String path = file.getAbsolutePath();
265        path = ParseUtil.encodePath(path);
266        if (!path.startsWith("/")) {
267            path = "/" + path;
268        }
269        if (!path.endsWith("/") && file.isDirectory()) {
270            path = path + "/";
271        }
272        return new URL("file", "", path);
273    }
274
275    public static java.net.URI toURI(URL url) {
276        String protocol = url.getProtocol();
277        String auth = url.getAuthority();
278        String path = url.getPath();
279        String query = url.getQuery();
280        String ref = url.getRef();
281        if (path != null && !(path.startsWith("/")))
282            path = "/" + path;
283
284        //
285        // In java.net.URI class, a port number of -1 implies the default
286        // port number. So get it stripped off before creating URI instance.
287        //
288        if (auth != null && auth.endsWith(":-1"))
289            auth = auth.substring(0, auth.length() - 3);
290
291        java.net.URI uri;
292        try {
293            uri = createURI(protocol, auth, path, query, ref);
294        } catch (java.net.URISyntaxException e) {
295            uri = null;
296        }
297        return uri;
298    }
299
300    //
301    // createURI() and its auxiliary code are cloned from java.net.URI.
302    // Most of the code are just copy and paste, except that quote()
303    // has been modified to avoid double-escape.
304    //
305    // Usually it is unacceptable, but we're forced to do it because
306    // otherwise we need to change public API, namely java.net.URI's
307    // multi-argument constructors. It turns out that the changes cause
308    // incompatibilities so can't be done.
309    //
310    private static URI createURI(String scheme,
311                                 String authority,
312                                 String path,
313                                 String query,
314                                 String fragment) throws URISyntaxException
315    {
316        String s = toString(scheme, null,
317                            authority, null, null, -1,
318                            path, query, fragment);
319        checkPath(s, scheme, path);
320        return new URI(s);
321    }
322
323    private static String toString(String scheme,
324                            String opaquePart,
325                            String authority,
326                            String userInfo,
327                            String host,
328                            int port,
329                            String path,
330                            String query,
331                            String fragment)
332    {
333        StringBuffer sb = new StringBuffer();
334        if (scheme != null) {
335            sb.append(scheme);
336            sb.append(':');
337        }
338        appendSchemeSpecificPart(sb, opaquePart,
339                                 authority, userInfo, host, port,
340                                 path, query);
341        appendFragment(sb, fragment);
342        return sb.toString();
343    }
344
345    private static void appendSchemeSpecificPart(StringBuffer sb,
346                                          String opaquePart,
347                                          String authority,
348                                          String userInfo,
349                                          String host,
350                                          int port,
351                                          String path,
352                                          String query)
353    {
354        if (opaquePart != null) {
355            /* check if SSP begins with an IPv6 address
356             * because we must not quote a literal IPv6 address
357             */
358            if (opaquePart.startsWith("//[")) {
359                int end =  opaquePart.indexOf("]");
360                if (end != -1 && opaquePart.indexOf(":")!=-1) {
361                    String doquote, dontquote;
362                    if (end == opaquePart.length()) {
363                        dontquote = opaquePart;
364                        doquote = "";
365                    } else {
366                        dontquote = opaquePart.substring(0,end+1);
367                        doquote = opaquePart.substring(end+1);
368                    }
369                    sb.append (dontquote);
370                    sb.append(quote(doquote, L_URIC, H_URIC));
371                }
372            } else {
373                sb.append(quote(opaquePart, L_URIC, H_URIC));
374            }
375        } else {
376            appendAuthority(sb, authority, userInfo, host, port);
377            if (path != null)
378                sb.append(quote(path, L_PATH, H_PATH));
379            if (query != null) {
380                sb.append('?');
381                sb.append(quote(query, L_URIC, H_URIC));
382            }
383        }
384    }
385
386    private static void appendAuthority(StringBuffer sb,
387                                 String authority,
388                                 String userInfo,
389                                 String host,
390                                 int port)
391    {
392        if (host != null) {
393            sb.append("//");
394            if (userInfo != null) {
395                sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
396                sb.append('@');
397            }
398            boolean needBrackets = ((host.indexOf(':') >= 0)
399                                    && !host.startsWith("[")
400                                    && !host.endsWith("]"));
401            if (needBrackets) sb.append('[');
402            sb.append(host);
403            if (needBrackets) sb.append(']');
404            if (port != -1) {
405                sb.append(':');
406                sb.append(port);
407            }
408        } else if (authority != null) {
409            sb.append("//");
410            if (authority.startsWith("[")) {
411                int end = authority.indexOf("]");
412                if (end != -1 && authority.indexOf(":")!=-1) {
413                    String doquote, dontquote;
414                    if (end == authority.length()) {
415                        dontquote = authority;
416                        doquote = "";
417                    } else {
418                        dontquote = authority.substring(0,end+1);
419                        doquote = authority.substring(end+1);
420                    }
421                    sb.append (dontquote);
422                    sb.append(quote(doquote,
423                            L_REG_NAME | L_SERVER,
424                            H_REG_NAME | H_SERVER));
425                }
426            } else {
427                sb.append(quote(authority,
428                            L_REG_NAME | L_SERVER,
429                            H_REG_NAME | H_SERVER));
430            }
431        }
432    }
433
434    private static void appendFragment(StringBuffer sb, String fragment) {
435        if (fragment != null) {
436            sb.append('#');
437            sb.append(quote(fragment, L_URIC, H_URIC));
438        }
439    }
440
441    // Quote any characters in s that are not permitted
442    // by the given mask pair
443    //
444    private static String quote(String s, long lowMask, long highMask) {
445        int n = s.length();
446        StringBuffer sb = null;
447        boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
448        for (int i = 0; i < s.length(); i++) {
449            char c = s.charAt(i);
450            if (c < '\u0080') {
451                if (!match(c, lowMask, highMask) && !isEscaped(s, i)) {
452                    if (sb == null) {
453                        sb = new StringBuffer();
454                        sb.append(s.substring(0, i));
455                    }
456                    appendEscape(sb, (byte)c);
457                } else {
458                    if (sb != null)
459                        sb.append(c);
460                }
461            } else if (allowNonASCII
462                       && (Character.isSpaceChar(c)
463                           || Character.isISOControl(c))) {
464                if (sb == null) {
465                    sb = new StringBuffer();
466                    sb.append(s.substring(0, i));
467                }
468                appendEncoded(sb, c);
469            } else {
470                if (sb != null)
471                    sb.append(c);
472            }
473        }
474        return (sb == null) ? s : sb.toString();
475    }
476
477    //
478    // To check if the given string has an escaped triplet
479    // at the given position
480    //
481    private static boolean isEscaped(String s, int pos) {
482        if (s == null || (s.length() <= (pos + 2)))
483            return false;
484
485        return s.charAt(pos) == '%'
486               && match(s.charAt(pos + 1), L_HEX, H_HEX)
487               && match(s.charAt(pos + 2), L_HEX, H_HEX);
488    }
489
490    private static void appendEncoded(StringBuffer sb, char c) {
491        ByteBuffer bb = null;
492        try {
493            bb = ThreadLocalCoders.encoderFor("UTF-8")
494                .encode(CharBuffer.wrap("" + c));
495        } catch (CharacterCodingException x) {
496            assert false;
497        }
498        while (bb.hasRemaining()) {
499            int b = bb.get() & 0xff;
500            if (b >= 0x80)
501                appendEscape(sb, (byte)b);
502            else
503                sb.append((char)b);
504        }
505    }
506
507    private final static char[] hexDigits = {
508        '0', '1', '2', '3', '4', '5', '6', '7',
509        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
510    };
511
512    private static void appendEscape(StringBuffer sb, byte b) {
513        sb.append('%');
514        sb.append(hexDigits[(b >> 4) & 0x0f]);
515        sb.append(hexDigits[(b >> 0) & 0x0f]);
516    }
517
518    // Tell whether the given character is permitted by the given mask pair
519    private static boolean match(char c, long lowMask, long highMask) {
520        if (c < 64)
521            return ((1L << c) & lowMask) != 0;
522        if (c < 128)
523            return ((1L << (c - 64)) & highMask) != 0;
524        return false;
525    }
526
527    // If a scheme is given then the path, if given, must be absolute
528    //
529    private static void checkPath(String s, String scheme, String path)
530        throws URISyntaxException
531    {
532        if (scheme != null) {
533            if ((path != null)
534                && ((path.length() > 0) && (path.charAt(0) != '/')))
535                throw new URISyntaxException(s,
536                                             "Relative path in absolute URI");
537        }
538    }
539
540
541    // -- Character classes for parsing --
542
543    // Compute a low-order mask for the characters
544    // between first and last, inclusive
545    private static long lowMask(char first, char last) {
546        long m = 0;
547        int f = Math.max(Math.min(first, 63), 0);
548        int l = Math.max(Math.min(last, 63), 0);
549        for (int i = f; i <= l; i++)
550            m |= 1L << i;
551        return m;
552    }
553
554    // Compute the low-order mask for the characters in the given string
555    private static long lowMask(String chars) {
556        int n = chars.length();
557        long m = 0;
558        for (int i = 0; i < n; i++) {
559            char c = chars.charAt(i);
560            if (c < 64)
561                m |= (1L << c);
562        }
563        return m;
564    }
565
566    // Compute a high-order mask for the characters
567    // between first and last, inclusive
568    private static long highMask(char first, char last) {
569        long m = 0;
570        int f = Math.max(Math.min(first, 127), 64) - 64;
571        int l = Math.max(Math.min(last, 127), 64) - 64;
572        for (int i = f; i <= l; i++)
573            m |= 1L << i;
574        return m;
575    }
576
577    // Compute the high-order mask for the characters in the given string
578    private static long highMask(String chars) {
579        int n = chars.length();
580        long m = 0;
581        for (int i = 0; i < n; i++) {
582            char c = chars.charAt(i);
583            if ((c >= 64) && (c < 128))
584                m |= (1L << (c - 64));
585        }
586        return m;
587    }
588
589
590    // Character-class masks
591
592    // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
593    //            "8" | "9"
594    private static final long L_DIGIT = lowMask('0', '9');
595    private static final long H_DIGIT = 0L;
596
597    // hex           =  digit | "A" | "B" | "C" | "D" | "E" | "F" |
598    //                          "a" | "b" | "c" | "d" | "e" | "f"
599    private static final long L_HEX = L_DIGIT;
600    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
601
602    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
603    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
604    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
605    private static final long L_UPALPHA = 0L;
606    private static final long H_UPALPHA = highMask('A', 'Z');
607
608    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
609    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
610    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
611    private static final long L_LOWALPHA = 0L;
612    private static final long H_LOWALPHA = highMask('a', 'z');
613
614    // alpha         = lowalpha | upalpha
615    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
616    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
617
618    // alphanum      = alpha | digit
619    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
620    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
621
622    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
623    //                 "(" | ")"
624    private static final long L_MARK = lowMask("-_.!~*'()");
625    private static final long H_MARK = highMask("-_.!~*'()");
626
627    // unreserved    = alphanum | mark
628    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
629    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
630
631    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
632    //                 "$" | "," | "[" | "]"
633    // Added per RFC2732: "[", "]"
634    private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
635    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
636
637    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
638    // characters are allowed; this is handled by the scanEscape method below.
639    private static final long L_ESCAPED = 1L;
640    private static final long H_ESCAPED = 0L;
641
642    // Dash, for use in domainlabel and toplabel
643    private static final long L_DASH = lowMask("-");
644    private static final long H_DASH = highMask("-");
645
646    // uric          = reserved | unreserved | escaped
647    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
648    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
649
650    // pchar         = unreserved | escaped |
651    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
652    private static final long L_PCHAR
653        = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
654    private static final long H_PCHAR
655        = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
656
657    // All valid path characters
658    private static final long L_PATH = L_PCHAR | lowMask(";/");
659    private static final long H_PATH = H_PCHAR | highMask(";/");
660
661    // userinfo      = *( unreserved | escaped |
662    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
663    private static final long L_USERINFO
664        = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
665    private static final long H_USERINFO
666        = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
667
668    // reg_name      = 1*( unreserved | escaped | "$" | "," |
669    //                     ";" | ":" | "@" | "&" | "=" | "+" )
670    private static final long L_REG_NAME
671        = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
672    private static final long H_REG_NAME
673        = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
674
675    // All valid characters for server-based authorities
676    private static final long L_SERVER
677        = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
678    private static final long H_SERVER
679        = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
680}
681