JSONTokener.java revision d79a9eede731ac48cfeb152ca59f8dd574ae1284
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package org.json;
18
// Note: this class was written without inspecting the non-free org.json source code.
20
21/**
22 * Parses a JSON (<a href="http://www.ietf.org/rfc/rfc4627.txt">RFC 4627</a>)
23 * encoded string into the corresponding object. Most clients of
24 * this class will use only need the {@link #JSONTokener(String) constructor}
25 * and {@link #nextValue} method. Example usage: <pre>
26 * String json = "{"
27 *         + "  \"query\": \"Pizza\", "
28 *         + "  \"locations\": [ 94043, 90210 ] "
29 *         + "}";
30 *
31 * JSONObject object = (JSONObject) new JSONTokener(json).nextValue();
32 * String query = object.getString("query");
33 * JSONArray locations = object.getJSONArray("locations");</pre>
34 *
35 * <p>This parser is lenient. A successful parse does not necessarily indicate
36 * that the input string is valid JSON.
37 *
38 * <p>Each tokener may be used to parse a single JSON string. Instances of this
39 * class are not thread safe. Although this class is nonfinal, it was not
40 * designed for inheritance and should not be subclassed. In particular,
41 * self-use by overridable methods is not specified. See <i>Effective Java</i>
42 * Item 17, "Design and Document or inheritance or else prohibit it" for further
43 * information.
44 */
45public class JSONTokener {
46
47    /** The input JSON. */
48    private final String in;
49
50    /**
51     * The index of the next character to be returned by {@link #next()}. When
52     * the input is exhausted, this equals the input's length.
53     */
54    private int pos;
55
56    /**
57     * @param in JSON encoded string. Null is not permitted and will yield a
58     *     tokener that throws {@code NullPointerExceptions} when methods are
59     *     called.
60     */
61    public JSONTokener(String in) {
62        this.in = in;
63    }
64
65    /**
66     * Returns the next value from the input.
67     *
68     * @return a {@link JSONObject}, {@link JSONArray}, String, Boolean,
69     *     Integer, Long, Double or {@link JSONObject#NULL}.
70     * @throws JSONException if the input is malformed.
71     */
72    public Object nextValue() throws JSONException {
73        int c = nextCleanInternal();
74        switch (c) {
75            case -1:
76                throw syntaxError("End of input");
77
78            case '{':
79                return readObject();
80
81            case '[':
82                return readArray();
83
84            case '\'':
85            case '"':
86                return nextString((char) c);
87
88            default:
89                pos--;
90                return readLiteral();
91        }
92    }
93
94    private int nextCleanInternal() throws JSONException {
95        while (pos < in.length()) {
96            int c = in.charAt(pos++);
97            switch (c) {
98                case '\t':
99                case ' ':
100                case '\n':
101                case '\r':
102                    continue;
103
104                case '/':
105                    if (pos == in.length()) {
106                        return c;
107                    }
108
109                    char peek = in.charAt(pos);
110                    if (peek != '*' && peek != '/') {
111                        return c;
112                    }
113
114                    skipComment();
115                    continue;
116
117                default:
118                    return c;
119            }
120        }
121
122        return -1;
123    }
124
125    /**
126     * Advances the position until it is beyond the current comment. The opening
127     * slash '/' should have already been read, and character at the current
128     * position be an asterisk '*' for a C-style comment or a slash '/' for an
129     * end-of-line comment.
130     *
131     * @throws JSONException if a C-style comment was not terminated.
132     */
133    private void skipComment() throws JSONException {
134        if (in.charAt(pos++) == '*') {
135            int commentEnd = in.indexOf("*/", pos);
136            if (commentEnd == -1) {
137                throw syntaxError("Unterminated comment");
138            }
139            pos = commentEnd + 2;
140
141        } else {
142            /*
143             * Skip to the next newline character. If the line is terminated by
144             * "\r\n", the '\n' will be consumed as whitespace by the caller.
145             */
146            for (; pos < in.length(); pos++) {
147                char c = in.charAt(pos);
148                if (c == '\r' || c == '\n') {
149                    pos++;
150                    break;
151                }
152            }
153        }
154    }
155
156    /**
157     * Returns the string up to but not including {@code quote}, unescaping any
158     * character escape sequences encountered along the way. The opening quote
159     * should have already been read. This consumes the closing quote, but does
160     * not include it in the returned string.
161     *
162     * @param quote either ' or ".
163     * @throws NumberFormatException if any unicode escape sequences are
164     *     malformed.
165     */
166    public String nextString(char quote) throws JSONException {
167        /*
168         * For strings that are free of escape sequences, we can just extract
169         * the result as a substring of the input. But if we encounter an escape
170         * sequence, we need to use a StringBuilder to compose the result.
171         */
172        StringBuilder builder = null;
173
174        /* the index of the first character not yet appended to the builder. */
175        int start = pos;
176
177        while (pos < in.length()) {
178            int c = in.charAt(pos++);
179            if (c == quote) {
180                if (builder == null) {
181                    // a new string avoids leaking memory
182                    return new String(in.substring(start, pos - 1));
183                } else {
184                    builder.append(in, start, pos - 1);
185                    return builder.toString();
186                }
187            }
188
189            if (c == '\\') {
190                if (pos == in.length()) {
191                    throw syntaxError("Unterminated escape sequence");
192                }
193                if (builder == null) {
194                    builder = new StringBuilder();
195                }
196                builder.append(in, start, pos - 1);
197                builder.append(readEscapeCharacter());
198                start = pos;
199            }
200        }
201
202        throw syntaxError("Unterminated string");
203    }
204
205    /**
206     * Unescapes the character identified by the character or characters that
207     * immediately follow a backslash. The backslash '\' should have already
208     * been read. This supports both unicode escapes "u000A" and two-character
209     * escapes "\n".
210     *
211     * @throws NumberFormatException if any unicode escape sequences are
212     *     malformed.
213     */
214    private char readEscapeCharacter() throws JSONException {
215        char escaped = in.charAt(pos++);
216        switch (escaped) {
217            case 'u':
218                if (pos + 4 > in.length()) {
219                    throw syntaxError("Unterminated escape sequence");
220                }
221                String hex = in.substring(pos, pos + 4);
222                pos += 4;
223                return (char) Integer.parseInt(hex, 16);
224
225            case 't':
226                return '\t';
227
228            case 'b':
229                return '\b';
230
231            case 'n':
232                return '\n';
233
234            case 'r':
235                return '\r';
236
237            case 'f':
238                return '\f';
239
240            case '\'':
241            case '"':
242            case '\\':
243            default:
244                return escaped;
245        }
246    }
247
248    /**
249     * Reads a null, boolean, numeric or unquoted string literal value. Numeric
250     * values will be returned as an Integer, Long, or Double, in that order of
251     * preference.
252     */
253    private Object readLiteral() throws JSONException {
254        String literal = nextToInternal("{}[]/\\:,=;# \t\f");
255
256        if (literal.length() == 0) {
257            throw syntaxError("Expected literal value");
258        } else if ("null".equalsIgnoreCase(literal)) {
259            return JSONObject.NULL;
260        } else if ("true".equalsIgnoreCase(literal)) {
261            return Boolean.TRUE;
262        } else if ("false".equalsIgnoreCase(literal)) {
263            return Boolean.FALSE;
264        }
265
266        /* try to parse as an integral type... */
267        if (literal.indexOf('.') == -1) {
268            int base = 10;
269            String number = literal;
270            if (number.startsWith("0x") || number.startsWith("0X")) {
271                number = number.substring(2);
272                base = 16;
273            } else if (number.startsWith("0") && number.length() > 1) {
274                number = number.substring(1);
275                base = 8;
276            }
277            try {
278                long longValue = Long.parseLong(number, base);
279                if (longValue <= Integer.MAX_VALUE && longValue >= Integer.MIN_VALUE) {
280                    return (int) longValue;
281                } else {
282                    return longValue;
283                }
284            } catch (NumberFormatException e) {
285                /*
286                 * This only happens for integral numbers greater than
287                 * Long.MAX_VALUE, numbers in exponential form (5e-10) and
288                 * unquoted strings. Fall through to try floating point.
289                 */
290            }
291        }
292
293        /* ...next try to parse as a floating point... */
294        try {
295            return Double.valueOf(literal);
296        } catch (NumberFormatException e) {
297        }
298
299        /* ... finally give up. We have an unquoted string */
300        return new String(literal); // a new string avoids leaking memory
301    }
302
303    /**
304     * Returns the string up to but not including any of the given characters or
305     * a newline character. This does not consume the excluded character.
306     */
307    private String nextToInternal(String excluded) {
308        int start = pos;
309        for (; pos < in.length(); pos++) {
310            char c = in.charAt(pos);
311            if (c == '\r' || c == '\n' || excluded.indexOf(c) != -1) {
312                return in.substring(start, pos);
313            }
314        }
315        return in.substring(start);
316    }
317
318    /**
319     * Reads a sequence of key/value pairs and the trailing closing brace '}' of
320     * an object. The opening brace '{' should have already been read.
321     */
322    private JSONObject readObject() throws JSONException {
323        JSONObject result = new JSONObject();
324
325        /* Peek to see if this is the empty object. */
326        int first = nextCleanInternal();
327        if (first == '}') {
328            return result;
329        } else if (first != -1) {
330            pos--;
331        }
332
333        while (true) {
334            Object name = nextValue();
335            if (!(name instanceof String)) {
336                if (name == null) {
337                    throw syntaxError("Names cannot be null");
338                } else {
339                    throw syntaxError("Names must be strings, but " + name
340                            + " is of type " + name.getClass().getName());
341                }
342            }
343
344            /*
345             * Expect the name/value separator to be either a colon ':', an
346             * equals sign '=', or an arrow "=>". The last two are bogus but we
347             * include them because that's what the original implementation did.
348             */
349            int separator = nextCleanInternal();
350            if (separator != ':' && separator != '=') {
351                throw syntaxError("Expected ':' after " + name);
352            }
353            if (pos < in.length() && in.charAt(pos) == '>') {
354                pos++;
355            }
356
357            result.put((String) name, nextValue());
358
359            switch (nextCleanInternal()) {
360                case '}':
361                    return result;
362                case ';':
363                case ',':
364                    continue;
365                default:
366                    throw syntaxError("Unterminated object");
367            }
368        }
369    }
370
371    /**
372     * Reads a sequence of values and the trailing closing brace ']' of an
373     * array. The opening brace '[' should have already been read. Note that
374     * "[]" yields an empty array, but "[,]" returns a two-element array
375     * equivalent to "[null,null]".
376     */
377    private JSONArray readArray() throws JSONException {
378        JSONArray result = new JSONArray();
379
380        /* to cover input that ends with ",]". */
381        boolean hasTrailingSeparator = false;
382
383        while (true) {
384            switch (nextCleanInternal()) {
385                case -1:
386                    throw syntaxError("Unterminated array");
387                case ']':
388                    if (hasTrailingSeparator) {
389                        result.put(null);
390                    }
391                    return result;
392                case ',':
393                case ';':
394                    /* A separator without a value first means "null". */
395                    result.put(null);
396                    hasTrailingSeparator = true;
397                    continue;
398                default:
399                    pos--;
400            }
401
402            result.put(nextValue());
403
404            switch (nextCleanInternal()) {
405                case ']':
406                    return result;
407                case ',':
408                case ';':
409                    hasTrailingSeparator = true;
410                    continue;
411                default:
412                    throw syntaxError("Unterminated array");
413            }
414        }
415    }
416
417    /**
418     * Returns an exception containing the given message plus the current
419     * position and the entire input string.
420     */
421    public JSONException syntaxError(String message) {
422        return new JSONException(message + this);
423    }
424
425    /**
426     * Returns the current position and the entire input string.
427     */
428    @Override public String toString() {
429        // consistent with the original implementation
430        return " at character " + pos + " of " + in;
431    }
432
433    /*
434     * Legacy APIs.
435     *
436     * None of the methods below are on the critical path of parsing JSON
437     * documents. They exist only because they were exposed by the original
438     * implementation and may be used by some clients.
439     */
440
441    /**
442     * Returns true until the input has been exhausted.
443     */
444    public boolean more() {
445        return pos < in.length();
446    }
447
448    /**
449     * Returns the next available character, or the null character '\0' if all
450     * input has been exhausted. The return value of this method is ambiguous
451     * for JSON strings that contain the character '\0'.
452     */
453    public char next() {
454        return pos < in.length() ? in.charAt(pos++) : '\0';
455    }
456
457    /**
458     * Returns the next available character if it equals {@code c}. Otherwise an
459     * exception is thrown.
460     */
461    public char next(char c) throws JSONException {
462        char result = next();
463        if (result != c) {
464            throw syntaxError("Expected " + c + " but was " + result);
465        }
466        return result;
467    }
468
469    /**
470     * Returns the next character that is not whitespace and does not belong to
471     * a comment. If the input is exhausted before such a character can be
472     * found, the null character '\0' is returned. The return value of this
473     * method is ambiguous for JSON strings that contain the character '\0'.
474     */
475    public char nextClean() throws JSONException {
476        int nextCleanInt = nextCleanInternal();
477        return nextCleanInt == -1 ? '\0' : (char) nextCleanInt;
478    }
479
480    /**
481     * Returns the next {@code length} characters of the input.
482     *
483     * <p>The returned string shares its backing character array with this
484     * tokener's input string. If a reference to the returned string may be held
485     * indefinitely, you should use {@code new String(result)} to copy it first
486     * to avoid memory leaks.
487     *
488     * @throws JSONException if the remaining input is not long enough to
489     *     satisfy this request.
490     */
491    public String next(int length) throws JSONException {
492        if (pos + length > in.length()) {
493            throw syntaxError(length + " is out of bounds");
494        }
495        String result = in.substring(pos, pos + length);
496        pos += length;
497        return result;
498    }
499
500    /**
501     * Returns the {@link String#trim trimmed} string holding the characters up
502     * to but not including the first of:
503     * <ul>
504     *   <li>any character in {@code excluded}
505     *   <li>a newline character '\n'
506     *   <li>a carriage return '\r'
507     * </ul>
508     *
509     * <p>The returned string shares its backing character array with this
510     * tokener's input string. If a reference to the returned string may be held
511     * indefinitely, you should use {@code new String(result)} to copy it first
512     * to avoid memory leaks.
513     *
514     * @return a possibly-empty string
515     */
516    public String nextTo(String excluded) {
517        if (excluded == null) {
518            throw new NullPointerException();
519        }
520        return nextToInternal(excluded).trim();
521    }
522
523    /**
524     * Equivalent to {@code nextTo(String.valueOf(excluded))}.
525     */
526    public String nextTo(char excluded) {
527        return nextToInternal(String.valueOf(excluded)).trim();
528    }
529
530    /**
531     * Advances past all input up to and including the next occurrence of
532     * {@code thru}. If the remaining input doesn't contain {@code thru}, the
533     * input is exhausted.
534     */
535    public void skipPast(String thru) {
536        int thruStart = in.indexOf(thru, pos);
537        pos = thruStart == -1 ? in.length() : (thruStart + thru.length());
538    }
539
540    /**
541     * Advances past all input up to but not including the next occurrence of
542     * {@code to}. If the remaining input doesn't contain {@code to}, the input
543     * is unchanged.
544     */
545    public char skipTo(char to) {
546        int index = in.indexOf(to, pos);
547        if (index != -1) {
548            pos = index;
549            return to;
550        } else {
551            return '\0';
552        }
553    }
554
555    /**
556     * Unreads the most recent character of input. If no input characters have
557     * been read, the input is unchanged.
558     */
559    public void back() {
560        if (--pos == -1) {
561            pos = 0;
562        }
563    }
564
565    /**
566     * Returns the integer [0..15] value for the given hex character, or -1
567     * for non-hex input.
568     *
569     * @param hex a character in the ranges [0-9], [A-F] or [a-f]. Any other
570     *     character will yield a -1 result.
571     */
572    public static int dehexchar(char hex) {
573        if (hex >= '0' && hex <= '9') {
574            return hex - '0';
575        } else if (hex >= 'A' && hex <= 'F') {
576            return hex - 'A' + 10;
577        } else if (hex >= 'a' && hex <= 'f') {
578            return hex - 'a' + 10;
579        } else {
580            return -1;
581        }
582    }
583}
584