// JSONTokener.java revision 661054f5a2f7f8f5f3ceffb97e803211b546e7fc
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.json;

// Note: this class was written without inspecting the non-free org.json source code.

21/**
22 * Parses a JSON (<a href="http://www.ietf.org/rfc/rfc4627.txt">RFC 4627</a>)
23 * encoded string into the corresponding object. Most clients of
24 * this class will use only need the {@link #JSONTokener(String) constructor}
25 * and {@link #nextValue} method. Example usage: <pre>
26 * String json = "{"
27 *         + "  \"query\": \"Pizza\", "
28 *         + "  \"locations\": [ 94043, 90210 ] "
29 *         + "}";
30 *
31 * JSONObject object = (JSONObject) new JSONTokener(json).nextValue();
32 * String query = object.getString("query");
33 * JSONArray locations = object.getJSONArray("locations");</pre>
34 *
35 * <p>For best interoperability and performance use JSON that complies with
36 * RFC 4627, such as that generated by {@link JSONStringer}. For legacy reasons
37 * this parser is lenient, so a successful parse does not indicate that the
38 * input string was valid JSON. All of the following syntax errors will be
39 * ignored:
40 * <ul>
41 *   <li>End of line comments starting with {@code //} or {@code #} and ending
42 *       with a newline character.
43 *   <li>C-style comments starting with {@code /*} and ending with
44 *       {@code *}{@code /}. Such comments may not be nested.
45 *   <li>Strings that are unquoted or {@code 'single quoted'}.
46 *   <li>Hexadecimal integers prefixed with {@code 0x} or {@code 0X}.
47 *   <li>Octal integers prefixed with {@code 0}.
48 *   <li>Array elements separated by {@code ;}.
49 *   <li>Unnecessary array separators. These are interpreted as if null was the
50 *       omitted value.
51 *   <li>Key-value pairs separated by {@code =} or {@code =>}.
52 *   <li>Key-value pairs separated by {@code ;}.
53 * </ul>
54 *
55 * <p>Each tokener may be used to parse a single JSON string. Instances of this
56 * class are not thread safe. Although this class is nonfinal, it was not
57 * designed for inheritance and should not be subclassed. In particular,
58 * self-use by overrideable methods is not specified. See <i>Effective Java</i>
59 * Item 17, "Design and Document or inheritance or else prohibit it" for further
60 * information.
61 */
62public class JSONTokener {
63
64    /** The input JSON. */
65    private final String in;
66
67    /**
68     * The index of the next character to be returned by {@link #next}. When
69     * the input is exhausted, this equals the input's length.
70     */
71    private int pos;
72
73    /**
74     * @param in JSON encoded string. Null is not permitted and will yield a
75     *     tokener that throws {@code NullPointerExceptions} when methods are
76     *     called.
77     */
78    public JSONTokener(String in) {
79        this.in = in;
80    }
81
82    /**
83     * Returns the next value from the input.
84     *
85     * @return a {@link JSONObject}, {@link JSONArray}, String, Boolean,
86     *     Integer, Long, Double or {@link JSONObject#NULL}.
87     * @throws JSONException if the input is malformed.
88     */
89    public Object nextValue() throws JSONException {
90        int c = nextCleanInternal();
91        switch (c) {
92            case -1:
93                throw syntaxError("End of input");
94
95            case '{':
96                return readObject();
97
98            case '[':
99                return readArray();
100
101            case '\'':
102            case '"':
103                return nextString((char) c);
104
105            default:
106                pos--;
107                return readLiteral();
108        }
109    }
110
111    private int nextCleanInternal() throws JSONException {
112        while (pos < in.length()) {
113            int c = in.charAt(pos++);
114            switch (c) {
115                case '\t':
116                case ' ':
117                case '\n':
118                case '\r':
119                    continue;
120
121                case '/':
122                    if (pos == in.length()) {
123                        return c;
124                    }
125
126                    char peek = in.charAt(pos);
127                    switch (peek) {
128                        case '*':
129                            // skip a /* c-style comment */
130                            pos++;
131                            int commentEnd = in.indexOf("*/", pos);
132                            if (commentEnd == -1) {
133                                throw syntaxError("Unterminated comment");
134                            }
135                            pos = commentEnd + 2;
136                            continue;
137
138                        case '/':
139                            // skip a // end-of-line comment
140                            pos++;
141                            skipToEndOfLine();
142                            continue;
143
144                        default:
145                            return c;
146                    }
147
148                case '#':
149                    /*
150                     * Skip a # hash end-of-line comment. The JSON RFC doesn't
151                     * specify this behavior, but it's required to parse
152                     * existing documents. See http://b/2571423.
153                     */
154                    skipToEndOfLine();
155                    continue;
156
157                default:
158                    return c;
159            }
160        }
161
162        return -1;
163    }
164
165    /**
166     * Advances the position until after the next newline character. If the line
167     * is terminated by "\r\n", the '\n' must be consumed as whitespace by the
168     * caller.
169     */
170    private void skipToEndOfLine() {
171        for (; pos < in.length(); pos++) {
172            char c = in.charAt(pos);
173            if (c == '\r' || c == '\n') {
174                pos++;
175                break;
176            }
177        }
178    }
179
180    /**
181     * Returns the string up to but not including {@code quote}, unescaping any
182     * character escape sequences encountered along the way. The opening quote
183     * should have already been read. This consumes the closing quote, but does
184     * not include it in the returned string.
185     *
186     * @param quote either ' or ".
187     * @throws NumberFormatException if any unicode escape sequences are
188     *     malformed.
189     */
190    public String nextString(char quote) throws JSONException {
191        /*
192         * For strings that are free of escape sequences, we can just extract
193         * the result as a substring of the input. But if we encounter an escape
194         * sequence, we need to use a StringBuilder to compose the result.
195         */
196        StringBuilder builder = null;
197
198        /* the index of the first character not yet appended to the builder. */
199        int start = pos;
200
201        while (pos < in.length()) {
202            int c = in.charAt(pos++);
203            if (c == quote) {
204                if (builder == null) {
205                    // a new string avoids leaking memory
206                    return new String(in.substring(start, pos - 1));
207                } else {
208                    builder.append(in, start, pos - 1);
209                    return builder.toString();
210                }
211            }
212
213            if (c == '\\') {
214                if (pos == in.length()) {
215                    throw syntaxError("Unterminated escape sequence");
216                }
217                if (builder == null) {
218                    builder = new StringBuilder();
219                }
220                builder.append(in, start, pos - 1);
221                builder.append(readEscapeCharacter());
222                start = pos;
223            }
224        }
225
226        throw syntaxError("Unterminated string");
227    }
228
229    /**
230     * Unescapes the character identified by the character or characters that
231     * immediately follow a backslash. The backslash '\' should have already
232     * been read. This supports both unicode escapes "u000A" and two-character
233     * escapes "\n".
234     *
235     * @throws NumberFormatException if any unicode escape sequences are
236     *     malformed.
237     */
238    private char readEscapeCharacter() throws JSONException {
239        char escaped = in.charAt(pos++);
240        switch (escaped) {
241            case 'u':
242                if (pos + 4 > in.length()) {
243                    throw syntaxError("Unterminated escape sequence");
244                }
245                String hex = in.substring(pos, pos + 4);
246                pos += 4;
247                return (char) Integer.parseInt(hex, 16);
248
249            case 't':
250                return '\t';
251
252            case 'b':
253                return '\b';
254
255            case 'n':
256                return '\n';
257
258            case 'r':
259                return '\r';
260
261            case 'f':
262                return '\f';
263
264            case '\'':
265            case '"':
266            case '\\':
267            default:
268                return escaped;
269        }
270    }
271
272    /**
273     * Reads a null, boolean, numeric or unquoted string literal value. Numeric
274     * values will be returned as an Integer, Long, or Double, in that order of
275     * preference.
276     */
277    private Object readLiteral() throws JSONException {
278        String literal = nextToInternal("{}[]/\\:,=;# \t\f");
279
280        if (literal.length() == 0) {
281            throw syntaxError("Expected literal value");
282        } else if ("null".equalsIgnoreCase(literal)) {
283            return JSONObject.NULL;
284        } else if ("true".equalsIgnoreCase(literal)) {
285            return Boolean.TRUE;
286        } else if ("false".equalsIgnoreCase(literal)) {
287            return Boolean.FALSE;
288        }
289
290        /* try to parse as an integral type... */
291        if (literal.indexOf('.') == -1) {
292            int base = 10;
293            String number = literal;
294            if (number.startsWith("0x") || number.startsWith("0X")) {
295                number = number.substring(2);
296                base = 16;
297            } else if (number.startsWith("0") && number.length() > 1) {
298                number = number.substring(1);
299                base = 8;
300            }
301            try {
302                long longValue = Long.parseLong(number, base);
303                if (longValue <= Integer.MAX_VALUE && longValue >= Integer.MIN_VALUE) {
304                    return (int) longValue;
305                } else {
306                    return longValue;
307                }
308            } catch (NumberFormatException e) {
309                /*
310                 * This only happens for integral numbers greater than
311                 * Long.MAX_VALUE, numbers in exponential form (5e-10) and
312                 * unquoted strings. Fall through to try floating point.
313                 */
314            }
315        }
316
317        /* ...next try to parse as a floating point... */
318        try {
319            return Double.valueOf(literal);
320        } catch (NumberFormatException ignored) {
321        }
322
323        /* ... finally give up. We have an unquoted string */
324        return new String(literal); // a new string avoids leaking memory
325    }
326
327    /**
328     * Returns the string up to but not including any of the given characters or
329     * a newline character. This does not consume the excluded character.
330     */
331    private String nextToInternal(String excluded) {
332        int start = pos;
333        for (; pos < in.length(); pos++) {
334            char c = in.charAt(pos);
335            if (c == '\r' || c == '\n' || excluded.indexOf(c) != -1) {
336                return in.substring(start, pos);
337            }
338        }
339        return in.substring(start);
340    }
341
342    /**
343     * Reads a sequence of key/value pairs and the trailing closing brace '}' of
344     * an object. The opening brace '{' should have already been read.
345     */
346    private JSONObject readObject() throws JSONException {
347        JSONObject result = new JSONObject();
348
349        /* Peek to see if this is the empty object. */
350        int first = nextCleanInternal();
351        if (first == '}') {
352            return result;
353        } else if (first != -1) {
354            pos--;
355        }
356
357        while (true) {
358            Object name = nextValue();
359            if (!(name instanceof String)) {
360                if (name == null) {
361                    throw syntaxError("Names cannot be null");
362                } else {
363                    throw syntaxError("Names must be strings, but " + name
364                            + " is of type " + name.getClass().getName());
365                }
366            }
367
368            /*
369             * Expect the name/value separator to be either a colon ':', an
370             * equals sign '=', or an arrow "=>". The last two are bogus but we
371             * include them because that's what the original implementation did.
372             */
373            int separator = nextCleanInternal();
374            if (separator != ':' && separator != '=') {
375                throw syntaxError("Expected ':' after " + name);
376            }
377            if (pos < in.length() && in.charAt(pos) == '>') {
378                pos++;
379            }
380
381            result.put((String) name, nextValue());
382
383            switch (nextCleanInternal()) {
384                case '}':
385                    return result;
386                case ';':
387                case ',':
388                    continue;
389                default:
390                    throw syntaxError("Unterminated object");
391            }
392        }
393    }
394
395    /**
396     * Reads a sequence of values and the trailing closing brace ']' of an
397     * array. The opening brace '[' should have already been read. Note that
398     * "[]" yields an empty array, but "[,]" returns a two-element array
399     * equivalent to "[null,null]".
400     */
401    private JSONArray readArray() throws JSONException {
402        JSONArray result = new JSONArray();
403
404        /* to cover input that ends with ",]". */
405        boolean hasTrailingSeparator = false;
406
407        while (true) {
408            switch (nextCleanInternal()) {
409                case -1:
410                    throw syntaxError("Unterminated array");
411                case ']':
412                    if (hasTrailingSeparator) {
413                        result.put(null);
414                    }
415                    return result;
416                case ',':
417                case ';':
418                    /* A separator without a value first means "null". */
419                    result.put(null);
420                    hasTrailingSeparator = true;
421                    continue;
422                default:
423                    pos--;
424            }
425
426            result.put(nextValue());
427
428            switch (nextCleanInternal()) {
429                case ']':
430                    return result;
431                case ',':
432                case ';':
433                    hasTrailingSeparator = true;
434                    continue;
435                default:
436                    throw syntaxError("Unterminated array");
437            }
438        }
439    }
440
441    /**
442     * Returns an exception containing the given message plus the current
443     * position and the entire input string.
444     */
445    public JSONException syntaxError(String message) {
446        return new JSONException(message + this);
447    }
448
449    /**
450     * Returns the current position and the entire input string.
451     */
452    @Override public String toString() {
453        // consistent with the original implementation
454        return " at character " + pos + " of " + in;
455    }
456
457    /*
458     * Legacy APIs.
459     *
460     * None of the methods below are on the critical path of parsing JSON
461     * documents. They exist only because they were exposed by the original
462     * implementation and may be used by some clients.
463     */
464
465    /**
466     * Returns true until the input has been exhausted.
467     */
468    public boolean more() {
469        return pos < in.length();
470    }
471
472    /**
473     * Returns the next available character, or the null character '\0' if all
474     * input has been exhausted. The return value of this method is ambiguous
475     * for JSON strings that contain the character '\0'.
476     */
477    public char next() {
478        return pos < in.length() ? in.charAt(pos++) : '\0';
479    }
480
481    /**
482     * Returns the next available character if it equals {@code c}. Otherwise an
483     * exception is thrown.
484     */
485    public char next(char c) throws JSONException {
486        char result = next();
487        if (result != c) {
488            throw syntaxError("Expected " + c + " but was " + result);
489        }
490        return result;
491    }
492
493    /**
494     * Returns the next character that is not whitespace and does not belong to
495     * a comment. If the input is exhausted before such a character can be
496     * found, the null character '\0' is returned. The return value of this
497     * method is ambiguous for JSON strings that contain the character '\0'.
498     */
499    public char nextClean() throws JSONException {
500        int nextCleanInt = nextCleanInternal();
501        return nextCleanInt == -1 ? '\0' : (char) nextCleanInt;
502    }
503
504    /**
505     * Returns the next {@code length} characters of the input.
506     *
507     * <p>The returned string shares its backing character array with this
508     * tokener's input string. If a reference to the returned string may be held
509     * indefinitely, you should use {@code new String(result)} to copy it first
510     * to avoid memory leaks.
511     *
512     * @throws JSONException if the remaining input is not long enough to
513     *     satisfy this request.
514     */
515    public String next(int length) throws JSONException {
516        if (pos + length > in.length()) {
517            throw syntaxError(length + " is out of bounds");
518        }
519        String result = in.substring(pos, pos + length);
520        pos += length;
521        return result;
522    }
523
524    /**
525     * Returns the {@link String#trim trimmed} string holding the characters up
526     * to but not including the first of:
527     * <ul>
528     *   <li>any character in {@code excluded}
529     *   <li>a newline character '\n'
530     *   <li>a carriage return '\r'
531     * </ul>
532     *
533     * <p>The returned string shares its backing character array with this
534     * tokener's input string. If a reference to the returned string may be held
535     * indefinitely, you should use {@code new String(result)} to copy it first
536     * to avoid memory leaks.
537     *
538     * @return a possibly-empty string
539     */
540    public String nextTo(String excluded) {
541        if (excluded == null) {
542            throw new NullPointerException();
543        }
544        return nextToInternal(excluded).trim();
545    }
546
547    /**
548     * Equivalent to {@code nextTo(String.valueOf(excluded))}.
549     */
550    public String nextTo(char excluded) {
551        return nextToInternal(String.valueOf(excluded)).trim();
552    }
553
554    /**
555     * Advances past all input up to and including the next occurrence of
556     * {@code thru}. If the remaining input doesn't contain {@code thru}, the
557     * input is exhausted.
558     */
559    public void skipPast(String thru) {
560        int thruStart = in.indexOf(thru, pos);
561        pos = thruStart == -1 ? in.length() : (thruStart + thru.length());
562    }
563
564    /**
565     * Advances past all input up to but not including the next occurrence of
566     * {@code to}. If the remaining input doesn't contain {@code to}, the input
567     * is unchanged.
568     */
569    public char skipTo(char to) {
570        int index = in.indexOf(to, pos);
571        if (index != -1) {
572            pos = index;
573            return to;
574        } else {
575            return '\0';
576        }
577    }
578
579    /**
580     * Unreads the most recent character of input. If no input characters have
581     * been read, the input is unchanged.
582     */
583    public void back() {
584        if (--pos == -1) {
585            pos = 0;
586        }
587    }
588
589    /**
590     * Returns the integer [0..15] value for the given hex character, or -1
591     * for non-hex input.
592     *
593     * @param hex a character in the ranges [0-9], [A-F] or [a-f]. Any other
594     *     character will yield a -1 result.
595     */
596    public static int dehexchar(char hex) {
597        if (hex >= '0' && hex <= '9') {
598            return hex - '0';
599        } else if (hex >= 'A' && hex <= 'F') {
600            return hex - 'A' + 10;
601        } else if (hex >= 'a' && hex <= 'f') {
602            return hex - 'a' + 10;
603        } else {
604            return -1;
605        }
606    }
607}
608