encoder.py revision 5821806d5e7f356e8fa4b058a389a808ea183019
1"""
2Implementation of JSONEncoder
3"""
4import re
5try:
6    from simplejson import _speedups
7except ImportError:
8    _speedups = None
9
# Characters that must be escaped inside a JSON string when non-ASCII output
# is allowed: every control character U+0000-U+001F, the backslash and the
# double quote.  The range has to end at \x1f (not \x19): JSON forbids all
# 32 control characters unescaped, and ESCAPE_DCT below is populated for
# exactly range(0x20).
ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
# Characters that must be escaped when the output has to be pure ASCII:
# backslash, double quote, forward slash, and anything outside ' '..'~'.
ESCAPE_ASCII = re.compile(r'([\\"/]|[^\ -~])')
# Maps a single character to its JSON escape sequence.
ESCAPE_DCT = {
    # escape all forward slashes to prevent </script> attack
    # (only ESCAPE_ASCII matches '/'; ESCAPE does not)
    '/': '\\/',
    '\\': '\\\\',
    '"': '\\"',
    '\b': '\\b',
    '\f': '\\f',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
}
# Every remaining control character gets the generic \uXXXX form; setdefault
# keeps the short escapes (\n, \t, ...) defined above.
for i in range(0x20):
    ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
25
# assume this produces an infinity on all machines (probably not guaranteed)
INFINITY = float('1e66666')


def floatstr(o, allow_nan=True):
    """
    Return the text used to serialize the float ``o``.

    Ordinary values are rendered with ``repr(o)``.  NaN and the two
    infinities are rendered as ``NaN``, ``Infinity`` and ``-Infinity``
    when ``allow_nan`` is true; otherwise a ``ValueError`` is raised for
    them.
    """
    # Detect the special values with plain comparisons only, so the check
    # does not depend on the platform's float representation: NaN is the
    # only value that is not equal to itself.
    if o != o:
        special = 'NaN'
    elif o == INFINITY:
        special = 'Infinity'
    elif o == -INFINITY:
        special = '-Infinity'
    else:
        special = None

    if special is None:
        return repr(o)
    if allow_nan:
        return special
    raise ValueError("Out of range float values are not JSON compliant: %r"
        % (o,))
47
48
def encode_basestring(s):
    """
    Return a JSON representation of a Python string

    Each character matched by ``ESCAPE`` is replaced by its entry in
    ``ESCAPE_DCT``; everything else is copied through unchanged and the
    result is wrapped in double quotes.
    """
    escaped = ESCAPE.sub(lambda match: ESCAPE_DCT[match.group(0)], s)
    return '"' + escaped + '"'
56
def encode_basestring_ascii(s):
    """
    Return an ASCII-only JSON representation of a Python string

    Each character matched by ``ESCAPE_ASCII`` is replaced either by its
    entry in ``ESCAPE_DCT`` or, when it has none, by a ``\\uXXXX`` escape
    (a surrogate pair of two such escapes for code points of 0x10000 and
    above).  The result is wrapped in double quotes.
    """
    def escape_char(match):
        char = match.group(0)
        if char in ESCAPE_DCT:
            return ESCAPE_DCT[char]
        codepoint = ord(char)
        if codepoint < 0x10000:
            return '\\u%04x' % (codepoint,)
        # surrogate pair
        codepoint -= 0x10000
        high = 0xd800 | ((codepoint >> 10) & 0x3ff)
        low = 0xdc00 | (codepoint & 0x3ff)
        return '\\u%04x\\u%04x' % (high, low)

    escaped = ESCAPE_ASCII.sub(escape_char, s)
    return '"' + str(escaped) + '"'
73
# Replace the pure-Python encode_basestring_ascii defined above with the
# version from the optional _speedups extension when it is available.  If
# _speedups is None (its import failed) or has no such attribute, the lookup
# raises AttributeError and the pure-Python function stays in place.
try:
    encode_basestring_ascii = _speedups.encode_basestring_ascii
    # When set, JSONEncoder skips decoding str input whose encoding is
    # 'utf-8' before handing it to the encoder function.
    # NOTE(review): presumably the C encoder decodes UTF-8 itself -- confirm.
    _need_utf8 = True
except AttributeError:
    _need_utf8 = False
79
class JSONEncoder(object):
    """
    Extensible JSON <http://json.org> encoder for Python data structures.

    Supports the following objects and types by default:

    +-------------------+---------------+
    | Python            | JSON          |
    +===================+===============+
    | dict              | object        |
    +-------------------+---------------+
    | list, tuple       | array         |
    +-------------------+---------------+
    | str, unicode      | string        |
    +-------------------+---------------+
    | int, long, float  | number        |
    +-------------------+---------------+
    | True              | true          |
    +-------------------+---------------+
    | False             | false         |
    +-------------------+---------------+
    | None              | null          |
    +-------------------+---------------+

    To extend this to recognize other objects, subclass and implement a
    ``.default()`` method with another method that returns a serializable
    object for ``o`` if possible, otherwise it should call the superclass
    implementation (to raise ``TypeError``).
    """
    __all__ = ['__init__', 'default', 'encode', 'iterencode']
    # Default separators; __init__ overrides them per instance when the
    # ``separators`` argument is given.
    item_separator = ', '
    key_separator = ': '

    def __init__(self, skipkeys=False, ensure_ascii=True,
            check_circular=True, allow_nan=True, sort_keys=False,
            indent=None, separators=None, encoding='utf-8'):
        """
        Constructor for JSONEncoder, with sensible defaults.

        If skipkeys is False, then it is a TypeError to attempt
        encoding of keys that are not str, int, long, float, bool or None.
        If skipkeys is True, such items are simply skipped.

        If ensure_ascii is True, the output is guaranteed to be str
        objects with all incoming unicode characters escaped.  If
        ensure_ascii is false, the output will be unicode object.

        If check_circular is True, then lists, dicts, and custom encoded
        objects will be checked for circular references during encoding to
        prevent an infinite recursion (which would cause an OverflowError).
        Otherwise, no such check takes place.

        If allow_nan is True, then NaN, Infinity, and -Infinity will be
        encoded as such.  This behavior is not JSON specification compliant,
        but is consistent with most JavaScript based encoders and decoders.
        Otherwise, it will be a ValueError to encode such floats.

        If sort_keys is True, then the output of dictionaries will be
        sorted by key; this is useful for regression tests to ensure
        that JSON serializations can be compared on a day-to-day basis.

        If indent is a non-negative integer, then JSON array
        elements and object members will be pretty-printed with that
        indent level.  An indent level of 0 will only insert newlines.
        None is the most compact representation.

        If specified, separators should be a (item_separator, key_separator)
        tuple. The default is (', ', ': '). To get the most compact JSON
        representation you should specify (',', ':') to eliminate whitespace.

        If encoding is not None, then all input strings will be
        transformed into unicode using that encoding prior to JSON-encoding.
        The default is UTF-8.
        """

        self.skipkeys = skipkeys
        self.ensure_ascii = ensure_ascii
        self.check_circular = check_circular
        self.allow_nan = allow_nan
        self.sort_keys = sort_keys
        self.indent = indent
        # Nesting depth of the container currently being pretty-printed;
        # incremented/decremented by _iterencode_list and _iterencode_dict.
        self.current_indent_level = 0
        if separators is not None:
            self.item_separator, self.key_separator = separators
        self.encoding = encoding

    def _newline_indent(self):
        """
        Return a newline followed by the indentation for the current
        nesting level (``indent`` spaces per level).
        """
        return '\n' + (' ' * (self.indent * self.current_indent_level))

    def _iterencode_list(self, lst, markers=None):
        """
        Yield the JSON array encoding of ``lst`` chunk by chunk.

        ``markers`` is the dict of ids of containers currently being
        encoded, or None when circular-reference checking is disabled.
        Raises ValueError when ``lst`` is already present in ``markers``.
        """
        if not lst:
            yield '[]'
            return
        if markers is not None:
            markerid = id(lst)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = lst
        yield '['
        if self.indent is not None:
            self.current_indent_level += 1
            newline_indent = self._newline_indent()
            separator = self.item_separator + newline_indent
            yield newline_indent
        else:
            newline_indent = None
            separator = self.item_separator
        first = True
        for value in lst:
            if first:
                first = False
            else:
                yield separator
            for chunk in self._iterencode(value, markers):
                yield chunk
        if newline_indent is not None:
            # Step back out one level before emitting the closing bracket.
            self.current_indent_level -= 1
            yield self._newline_indent()
        yield ']'
        if markers is not None:
            del markers[markerid]

    def _iterencode_dict(self, dct, markers=None):
        """
        Yield the JSON object encoding of ``dct`` chunk by chunk.

        Keys that are str/unicode are used as-is (str keys are decoded with
        ``self.encoding`` when required); float, int, long, True, False and
        None keys are converted to their JSON text and emitted as strings.
        Any other key is skipped when ``self.skipkeys`` is true and raises
        TypeError otherwise.

        ``markers`` is the dict of ids of containers currently being
        encoded, or None when circular-reference checking is disabled.
        Raises ValueError when ``dct`` is already present in ``markers``.
        """
        if not dct:
            yield '{}'
            return
        if markers is not None:
            markerid = id(dct)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = dct
        yield '{'
        key_separator = self.key_separator
        if self.indent is not None:
            self.current_indent_level += 1
            newline_indent = self._newline_indent()
            item_separator = self.item_separator + newline_indent
            yield newline_indent
        else:
            newline_indent = None
            item_separator = self.item_separator
        first = True
        if self.ensure_ascii:
            encoder = encode_basestring_ascii
        else:
            encoder = encode_basestring
        allow_nan = self.allow_nan
        if self.sort_keys:
            keys = dct.keys()
            keys.sort()
            items = [(k, dct[k]) for k in keys]
        else:
            items = dct.iteritems()
        _encoding = self.encoding
        # str keys are left undecoded when the active encoder function
        # handles UTF-8 str input itself (_need_utf8) and the encoding is
        # UTF-8.
        _do_decode = (_encoding is not None
            and not (_need_utf8 and _encoding == 'utf-8'))
        for key, value in items:
            if isinstance(key, str):
                if _do_decode:
                    key = key.decode(_encoding)
            elif isinstance(key, basestring):
                pass
            # JavaScript is weakly typed for these, so it makes sense to
            # also allow them.  Many encoders seem to do something like this.
            elif isinstance(key, float):
                key = floatstr(key, allow_nan)
            # The boolean checks must come before the integer check: bool is
            # a subclass of int, so testing int first would turn True/False
            # into "True"/"False" instead of the JSON spellings.
            elif key is True:
                key = 'true'
            elif key is False:
                key = 'false'
            elif key is None:
                key = 'null'
            elif isinstance(key, (int, long)):
                key = str(key)
            elif self.skipkeys:
                continue
            else:
                raise TypeError("key %r is not a string" % (key,))
            if first:
                first = False
            else:
                yield item_separator
            yield encoder(key)
            yield key_separator
            for chunk in self._iterencode(value, markers):
                yield chunk
        if newline_indent is not None:
            # Step back out one level before emitting the closing brace.
            self.current_indent_level -= 1
            yield self._newline_indent()
        yield '}'
        if markers is not None:
            del markers[markerid]

    def _iterencode(self, o, markers=None):
        """
        Yield the JSON encoding of an arbitrary object ``o`` chunk by chunk,
        dispatching on its type.  Objects of unsupported types are passed
        through ``default()`` and the result is encoded in their place.
        """
        if isinstance(o, basestring):
            if self.ensure_ascii:
                encoder = encode_basestring_ascii
            else:
                encoder = encode_basestring
            _encoding = self.encoding
            if (_encoding is not None and isinstance(o, str)
                    and not (_need_utf8 and _encoding == 'utf-8')):
                o = o.decode(_encoding)
            yield encoder(o)
        elif o is None:
            yield 'null'
        # True/False are tested by identity before the int check because
        # bool is a subclass of int.
        elif o is True:
            yield 'true'
        elif o is False:
            yield 'false'
        elif isinstance(o, (int, long)):
            yield str(o)
        elif isinstance(o, float):
            yield floatstr(o, self.allow_nan)
        elif isinstance(o, (list, tuple)):
            for chunk in self._iterencode_list(o, markers):
                yield chunk
        elif isinstance(o, dict):
            for chunk in self._iterencode_dict(o, markers):
                yield chunk
        else:
            # Track custom objects too, so a default() that keeps returning
            # an object already being encoded is reported as a cycle.
            if markers is not None:
                markerid = id(o)
                if markerid in markers:
                    raise ValueError("Circular reference detected")
                markers[markerid] = o
            for chunk in self._iterencode_default(o, markers):
                yield chunk
            if markers is not None:
                del markers[markerid]

    def _iterencode_default(self, o, markers=None):
        """
        Convert ``o`` with ``default()`` and return an iterator over the
        JSON encoding of the converted object.
        """
        newobj = self.default(o)
        return self._iterencode(newobj, markers)

    def default(self, o):
        """
        Implement this method in a subclass such that it returns
        a serializable object for ``o``, or calls the base implementation
        (to raise a ``TypeError``).

        For example, to support arbitrary iterators, you could
        implement default like this::

            def default(self, o):
                try:
                    iterable = iter(o)
                except TypeError:
                    pass
                else:
                    return list(iterable)
                return JSONEncoder.default(self, o)
        """
        raise TypeError("%r is not JSON serializable" % (o,))

    def encode(self, o):
        """
        Return a JSON string representation of a Python data structure.

        >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
        '{"foo": ["bar", "baz"]}'
        """
        # This is for extremely simple cases and benchmarks...
        if isinstance(o, basestring):
            if isinstance(o, str):
                _encoding = self.encoding
                if (_encoding is not None
                        and not (_encoding == 'utf-8' and _need_utf8)):
                    o = o.decode(_encoding)
            # Honor ensure_ascii here as well, so that a top-level string is
            # encoded exactly like one reached through iterencode().
            if self.ensure_ascii:
                return encode_basestring_ascii(o)
            return encode_basestring(o)
        # This doesn't pass the iterator directly to ''.join() because it
        # sucks at reporting exceptions.  It's going to do this internally
        # anyway because it uses PySequence_Fast or similar.
        chunks = list(self.iterencode(o))
        return ''.join(chunks)

    def iterencode(self, o):
        """
        Encode the given object and yield each string
        representation as available.

        For example::

            for chunk in JSONEncoder().iterencode(bigobject):
                mysocket.write(chunk)
        """
        # A fresh markers dict per call enables circular-reference
        # detection; None disables it.
        if self.check_circular:
            markers = {}
        else:
            markers = None
        return self._iterencode(o, markers)
370
# Public API of this module: only the encoder class is exported by
# ``from ... import *``.
__all__ = ['JSONEncoder']
372