decoder.py revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
1"""Implementation of JSONDecoder
2"""
3import re
4import sys
5import struct
6
7from simplejson.scanner import make_scanner
def _import_c_scanstring():
    """Return the C ``scanstring`` from ``simplejson._speedups``.

    Returns None when the extension module (or the name) cannot be imported,
    so callers can fall back to the pure-Python implementation.
    """
    try:
        from simplejson._speedups import scanstring as accelerated
    except ImportError:
        accelerated = None
    return accelerated

c_scanstring = _import_c_scanstring()
15
16__all__ = ['JSONDecoder']
17
# Regex flags shared by every pattern compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
19
def _floatconstants():
    """Return the float triple ``(nan, inf, -inf)``.

    The values are built from their IEEE 754 double bit patterns rather than
    from ``float('nan')`` / ``float('inf')``.
    """
    # unhexlify is used instead of str.decode('hex'): the 'hex' codec on str
    # exists only on Python 2, while unhexlify yields the same 16 bytes on
    # every supported interpreter.
    from binascii import unhexlify
    # Two big-endian doubles: a quiet NaN followed by +Infinity.
    _BYTES = unhexlify('7FF80000000000007FF0000000000000')
    # The struct module in Python 2.4 would get frexp() out of range here
    # when an endian is specified in the format string. Fixed in Python 2.5+
    # So unpack in native order and byte-swap each double by hand instead.
    if sys.byteorder != 'big':
        _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
    nan, inf = struct.unpack('dd', _BYTES)
    return nan, inf, -inf

NaN, PosInf, NegInf = _floatconstants()
30
31
class JSONDecodeError(ValueError):
    """Subclass of ValueError with the following additional properties:

    msg: The unformatted error message
    doc: The JSON document being parsed
    pos: The start index of doc where parsing failed
    end: The end index of doc where parsing failed (may be None)
    lineno: The line corresponding to pos
    colno: The column corresponding to pos
    endlineno: The line corresponding to end (may be None)
    endcolno: The column corresponding to end (may be None)

    The exception's message (``str(exc)``) is the formatted text produced by
    ``errmsg(msg, doc, pos, end)``; line and column values come from
    ``linecol``.
    """
    def __init__(self, msg, doc, pos, end=None):
        ValueError.__init__(self, errmsg(msg, doc, pos, end=end))
        self.msg = msg
        self.doc = doc
        self.pos = pos
        self.end = end
        self.lineno, self.colno = linecol(doc, pos)
        if end is not None:
            self.endlineno, self.endcolno = linecol(doc, end)
        else:
            self.endlineno, self.endcolno = None, None

    def __reduce__(self):
        # The inherited __reduce__ replays ``self.args`` (only the formatted
        # message) into __init__, which requires msg, doc and pos, so
        # pickling or copying the exception would fail. Rebuild it from the
        # original constructor arguments instead.
        return self.__class__, (self.msg, self.doc, self.pos, self.end)
56
57
def linecol(doc, pos):
    """Return the ``(lineno, colno)`` of index *pos* within *doc*.

    Both values are 1-based: the first character of the document is line 1,
    column 1, and the first character after a newline is column 1 of the
    next line.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind returns -1 when no newline precedes pos, which makes the column
    # pos + 1 on the first line -- the same 1-based convention that later
    # lines get from subtracting the index of the preceding newline.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
65
66
def errmsg(msg, doc, pos, end=None):
    """Format *msg* with the location of a parse failure in *doc*.

    *pos* is the index where the failure starts. When *end* is given, the
    message describes the range from *pos* to *end*; otherwise only the
    single position is reported. Line and column numbers are obtained from
    ``linecol``.
    """
    # Note that this function is called from _speedups
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '%s: line %d column %d - line %d column %d (char %d - %d)'
        return template % (
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    return '%s: line %d column %d (char %d)' % (
        msg, start_line, start_col, pos)
80
81
# Spellings of the non-finite floats accepted by the decoder, mapped to the
# corresponding Python float values.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}

# Matches the shortest run of characters (group 1) up to and including the
# next double quote, backslash or control character (group 2).
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Character following a backslash -> the character it stands for.
# The \uXXXX form is handled separately by the string scanner.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding assumed for byte strings when the caller does not specify one.
DEFAULT_ENCODING = "utf-8"
95
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def _hex4(esc):
    """Return the integer value of *esc* if it is exactly four hexadecimal
    digits, otherwise None."""
    # int(esc, 16) on its own is too permissive: it also accepts a '0x'
    # prefix, a sign and surrounding whitespace, none of which are valid in
    # a JSON \uXXXX escape.
    if len(esc) != 4 or not _HEX_DIGITS.issuperset(esc):
        return None
    return int(esc, 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises
    JSONDecodeError (a ValueError subclass) on attempt to decode an invalid
    string. If strict is False then literal control characters are allowed
    in the string.

    encoding is used to decode chunks that are not already unicode; it
    defaults to DEFAULT_ENCODING when None.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used when reporting an unterminated string
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                raise JSONDecodeError(msg, s, end)
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise JSONDecodeError(msg, s, end)
            end += 1
        else:
            # Unicode escape sequence: exactly four hex digits must follow
            uni = _hex4(s[end + 1:end + 5])
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise JSONDecodeError(msg, s, end)
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise JSONDecodeError(msg, s, end)
                uni2 = _hex4(s[end + 7:end + 11])
                # The second escape must be a well-formed low surrogate;
                # anything else would combine into a bogus code point.
                if uni2 is None or not (0xdc00 <= uni2 <= 0xdfff):
                    raise JSONDecodeError(msg, s, end)
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
172
173
# Prefer the C string scanner when the extension was importable, otherwise
# use the pure-Python implementation.
if c_scanstring:
    scanstring = c_scanstring
else:
    scanstring = py_scanstring

# The whitespace characters skipped between JSON tokens, as a plain string
# for membership tests and as a pattern for skipping whole runs.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
179
180def JSONObject((s, end), encoding, strict, scan_once, object_hook,
181        object_pairs_hook, memo=None,
182        _w=WHITESPACE.match, _ws=WHITESPACE_STR):
183    # Backwards compatibility
184    if memo is None:
185        memo = {}
186    memo_get = memo.setdefault
187    pairs = []
188    # Use a slice to prevent IndexError from being raised, the following
189    # check will raise a more specific ValueError if the string is empty
190    nextchar = s[end:end + 1]
191    # Normally we expect nextchar == '"'
192    if nextchar != '"':
193        if nextchar in _ws:
194            end = _w(s, end).end()
195            nextchar = s[end:end + 1]
196        # Trivial empty object
197        if nextchar == '}':
198            if object_pairs_hook is not None:
199                result = object_pairs_hook(pairs)
200                return result, end + 1
201            pairs = {}
202            if object_hook is not None:
203                pairs = object_hook(pairs)
204            return pairs, end + 1
205        elif nextchar != '"':
206            raise JSONDecodeError(
207                "Expecting property name enclosed in double quotes",
208                s, end)
209    end += 1
210    while True:
211        key, end = scanstring(s, end, encoding, strict)
212        key = memo_get(key, key)
213
214        # To skip some function call overhead we optimize the fast paths where
215        # the JSON key separator is ": " or just ":".
216        if s[end:end + 1] != ':':
217            end = _w(s, end).end()
218            if s[end:end + 1] != ':':
219                raise JSONDecodeError("Expecting ':' delimiter", s, end)
220
221        end += 1
222
223        try:
224            if s[end] in _ws:
225                end += 1
226                if s[end] in _ws:
227                    end = _w(s, end + 1).end()
228        except IndexError:
229            pass
230
231        try:
232            value, end = scan_once(s, end)
233        except StopIteration:
234            raise JSONDecodeError("Expecting object", s, end)
235        pairs.append((key, value))
236
237        try:
238            nextchar = s[end]
239            if nextchar in _ws:
240                end = _w(s, end + 1).end()
241                nextchar = s[end]
242        except IndexError:
243            nextchar = ''
244        end += 1
245
246        if nextchar == '}':
247            break
248        elif nextchar != ',':
249            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
250
251        try:
252            nextchar = s[end]
253            if nextchar in _ws:
254                end += 1
255                nextchar = s[end]
256                if nextchar in _ws:
257                    end = _w(s, end + 1).end()
258                    nextchar = s[end]
259        except IndexError:
260            nextchar = ''
261
262        end += 1
263        if nextchar != '"':
264            raise JSONDecodeError(
265                "Expecting property name enclosed in double quotes",
266                s, end - 1)
267
268    if object_pairs_hook is not None:
269        result = object_pairs_hook(pairs)
270        return result, end
271    pairs = dict(pairs)
272    if object_hook is not None:
273        pairs = object_hook(pairs)
274    return pairs, end
275
276def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
277    values = []
278    nextchar = s[end:end + 1]
279    if nextchar in _ws:
280        end = _w(s, end + 1).end()
281        nextchar = s[end:end + 1]
282    # Look-ahead for trivial empty array
283    if nextchar == ']':
284        return values, end + 1
285    _append = values.append
286    while True:
287        try:
288            value, end = scan_once(s, end)
289        except StopIteration:
290            raise JSONDecodeError("Expecting object", s, end)
291        _append(value)
292        nextchar = s[end:end + 1]
293        if nextchar in _ws:
294            end = _w(s, end + 1).end()
295            nextchar = s[end:end + 1]
296        end += 1
297        if nextchar == ']':
298            break
299        elif nextchar != ',':
300            raise JSONDecodeError("Expecting ',' delimiter", s, end)
301
302        try:
303            if s[end] in _ws:
304                end += 1
305                if s[end] in _ws:
306                    end = _w(s, end + 1).end()
307        except IndexError:
308            pass
309
310    return values, end
311
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Unless customised through the constructor arguments, JSON values are
    decoded to the following Python types:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    As an extension to the JSON spec, ``NaN``, ``Infinity`` and
    ``-Infinity`` are also accepted and decoded to the corresponding
    ``float`` values.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """
        *encoding* is the encoding used to interpret any :class:`str`
        objects decoded by this instance (``'utf-8'`` by default).  It has
        no effect when decoding :class:`unicode` objects.

        Note that currently only encodings that are a superset of ASCII work,
        strings of other encodings should be passed in as :class:`unicode`.

        *object_hook*, if specified, is called with the :class:`dict`
        produced for every decoded JSON object, and its return value is used
        in place of that dict.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        *object_pairs_hook*, if specified, is called with the ordered list
        of ``(key, value)`` pairs of every decoded object literal, and its
        return value is used instead of the :class:`dict`.  This makes it
        possible to write decoders that depend on the order in which the
        pairs appear (for example, :func:`collections.OrderedDict` will
        remember the order of insertion).  When *object_hook* is given as
        well, *object_pairs_hook* takes priority.

        *parse_float*, if specified, is called with the string of every
        JSON float to be decoded; the default is equivalent to
        ``float(num_str)``.  Use it to substitute another datatype or parser
        for JSON floats (e.g. :class:`decimal.Decimal`).

        *parse_int*, if specified, is called with the string of every JSON
        int to be decoded; the default is equivalent to ``int(num_str)``.
        Use it to substitute another datatype or parser for JSON integers
        (e.g. :class:`float`).

        *parse_constant*, if specified, is called with one of the strings
        ``'-Infinity'``, ``'Infinity'`` or ``'NaN'``.  This can be used to
        raise an exception if invalid JSON numbers are encountered.

        *strict* controls what happens when an unescaped control character
        appears inside a string: with the default ``True`` it is a parse
        error, with ``False`` such characters are allowed.

        """
        # Caller-supplied configuration, stored as given
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Converters fall back to the built-in behaviour when not supplied
        self.parse_int = parse_int or int
        self.parse_float = parse_float or float
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        # Parsers for the composite and string token types
        self.parse_string = scanstring
        self.parse_array = JSONArray
        self.parse_object = JSONObject
        self.memo = {}
        # Built last: make_scanner is handed this instance, presumably to
        # read the attributes assigned above
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or ``unicode``
        instance containing a JSON document)

        Whitespace after the document is ignored; any other trailing
        content raises JSONDecodeError ("Extra data").

        """
        result, stop = self.raw_decode(s)
        # Trailing whitespace is allowed, anything else is an error
        stop = _w(s, stop).end()
        total = len(s)
        if stop == total:
            return result
        raise JSONDecodeError("Extra data", s, stop, total)

    def raw_decode(self, s, idx=0, _w=WHITESPACE.match):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.
        Optionally, ``idx`` can be used to specify an offset in ``s`` where
        the JSON document begins; whitespace at that offset is skipped.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Raises JSONDecodeError when no document can be decoded.

        """
        try:
            document, stop = self.scan_once(s, idx=_w(s, idx).end())
        except StopIteration:
            raise JSONDecodeError("No JSON object could be decoded", s, idx)
        return document, stop
428