1"""Implementation of JSONDecoder
2"""
3import re
4import sys
5import struct
6
7from json import scanner
8try:
9    from _json import scanstring as c_scanstring
10except ImportError:
11    c_scanstring = None
12
# Public API of this module: only the decoder class is exported.
__all__ = ['JSONDecoder']

# Regex flags shared by the patterns compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
16
17def _floatconstants():
18    nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
19    inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
20    return nan, inf, -inf
21
22NaN, PosInf, NegInf = _floatconstants()
23
24
def linecol(doc, pos):
    """Return the 1-based ``(line, column)`` of character offset ``pos``.

    Lines are delimited by ``'\\n'`` characters found in ``doc`` before
    ``pos``.
    """
    newlines_before = doc.count('\n', 0, pos)
    if newlines_before == 0:
        # Still on the first line: the column is simply the offset plus one.
        return 1, pos + 1
    # Otherwise the column is the distance from the closest preceding newline.
    return newlines_before + 1, pos - doc.rindex('\n', 0, pos)
32
33
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with the location of an error in ``doc``.

    ``pos`` is the character offset of the error.  When ``end`` is given,
    the message describes the range ``pos`` - ``end``; otherwise only the
    single position is reported.  Line and column numbers come from
    ``linecol``.
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is not None:
        endlineno, endcolno = linecol(doc, end)
        template = (
            '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})')
        return template.format(
            msg, lineno, colno, endlineno, endcolno, pos, end)
    return '{0}: line {1} column {2} (char {3})'.format(
        msg, lineno, colno, pos)
47
48
# Maps the non-standard literals accepted by the decoder to the module's
# float constants.  JSONDecoder uses this table's ``__getitem__`` as the
# default ``parse_constant`` callback.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}
54
# Matches one chunk of a JSON string body: group 1 is a (possibly empty,
# non-greedy) run of characters, group 2 is the terminator that ends the
# run -- a double quote, a backslash, or a control character in the range
# 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes: the character following a backslash mapped to
# the character it stands for.  ``\u`` escapes are handled separately by
# py_scanstring.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding py_scanstring falls back to when called with ``encoding=None``.
DEFAULT_ENCODING = "utf-8"
62
63def _decode_uXXXX(s, pos):
64    esc = s[pos + 1:pos + 5]
65    if len(esc) == 4 and esc[1] not in 'xX':
66        try:
67            return int(esc, 16)
68        except ValueError:
69            pass
70    msg = "Invalid \\uXXXX escape"
71    raise ValueError(errmsg(msg, s, pos))
72
def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Decode the JSON string literal found in ``s``.

    ``end`` is the index of the first character after the opening quote.
    Every valid JSON escape sequence is unescaped; a ValueError is raised
    for an unterminated string, an invalid escape, or (when ``strict`` is
    true) a literal control character.  With ``strict`` false, literal
    control characters are kept as-is.

    ``encoding`` is used to decode non-unicode chunks and defaults to
    DEFAULT_ENCODING.

    Returns a ``(decoded_string, index)`` tuple where ``index`` is the
    position in ``s`` just past the closing quote.
    """
    if encoding is None:
        encoding = DEFAULT_ENCODING
    pieces = []
    add_piece = pieces.append
    # Index of the opening quote, reported when the string never terminates.
    start = end - 1
    while True:
        match = _m(s, end)
        if match is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, start))
        end = match.end()
        literal, stop = match.groups()
        # ``literal`` holds zero or more unescaped string characters.
        if literal:
            if not isinstance(literal, unicode):
                literal = unicode(literal, encoding)
            add_piece(literal)
        # ``stop`` is the closing quote, a literal control character, or a
        # backslash announcing an escape sequence.
        if stop == '"':
            break
        if stop != '\\':
            if not strict:
                add_piece(stop)
                continue
            msg = "Invalid control character {0!r} at".format(stop)
            raise ValueError(errmsg(msg, s, end))
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, start))
        if esc == 'u':
            # Unicode escape sequence
            code = _decode_uXXXX(s, end)
            end += 5
            # On builds with a wide unichr range, merge a high surrogate
            # with an immediately following low-surrogate escape.
            if (sys.maxunicode > 65535 and 0xd800 <= code <= 0xdbff
                    and s[end:end + 2] == '\\u'):
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            char = unichr(code)
        else:
            # Anything other than \u must be in the single-character table.
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        # Append the unescaped character
        add_piece(char)
    return u''.join(pieces), end
140
141
# Use speedup if available: c_scanstring is None when the _json import
# failed, in which case the pure-Python implementation is used instead.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of space, tab, newline and carriage-return
# characters; the parsers in this module use it to skip whitespace.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same whitespace characters as a plain string, used for
# single-character membership tests before falling back to the regex.
WHITESPACE_STR = ' \t\n\r'
147
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object and return ``(result, index)``.

    ``s_and_end`` is a ``(s, end)`` pair where ``end`` is the index just
    past the opening ``{``.  Keys are read with ``scanstring`` (using
    ``encoding`` and ``strict``) and values with ``scan_once``.

    If ``object_pairs_hook`` is given, it receives the list of
    ``(key, value)`` pairs and its return value is the result.  Otherwise a
    dict is built and, if ``object_hook`` is given, passed through it.

    ``index`` is the position just past the closing ``}``.  Raises
    ValueError on malformed input.
    """
    s, pos = s_and_end
    items = []
    add_item = items.append
    # Slicing instead of indexing yields '' at the end of the input rather
    # than raising IndexError; the checks below then raise a ValueError.
    ch = s[pos:pos + 1]
    # Fast path: the first key's opening quote follows the brace directly.
    if ch != '"':
        if ch in _ws:
            pos = _w(s, pos).end()
            ch = s[pos:pos + 1]
        if ch == '}':
            # Trivial empty object
            if object_pairs_hook is not None:
                return object_pairs_hook(items), pos + 1
            empty = {}
            if object_hook is not None:
                empty = object_hook(empty)
            return empty, pos + 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, pos))
    pos += 1
    while True:
        key, pos = scanstring(s, pos, encoding, strict)

        # Fast paths for the common separators ":" and ": " avoid the
        # overhead of a regex call.
        if s[pos:pos + 1] != ':':
            pos = _w(s, pos).end()
            if s[pos:pos + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, pos))
        pos += 1

        # Skip whitespace after the colon; running off the end of the
        # input is left for scan_once to report.
        try:
            if s[pos] in _ws:
                pos += 1
                if s[pos] in _ws:
                    pos = _w(s, pos + 1).end()
        except IndexError:
            pass

        try:
            value, pos = scan_once(s, pos)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, pos))
        add_item((key, value))

        # Read the character after the value, skipping whitespace.
        try:
            ch = s[pos]
            if ch in _ws:
                pos = _w(s, pos + 1).end()
                ch = s[pos]
        except IndexError:
            ch = ''
        pos += 1

        if ch == '}':
            break
        if ch != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, pos - 1))

        # After a comma, the next non-whitespace character must open a key.
        try:
            ch = s[pos]
            if ch in _ws:
                pos += 1
                ch = s[pos]
                if ch in _ws:
                    pos = _w(s, pos + 1).end()
                    ch = s[pos]
        except IndexError:
            ch = ''

        pos += 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, pos - 1))
    if object_pairs_hook is not None:
        return object_pairs_hook(items), pos
    members = dict(items)
    if object_hook is not None:
        members = object_hook(members)
    return members, pos
235
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array and return ``(values, index)``.

    ``s_and_end`` is a ``(s, end)`` pair where ``end`` is the index just
    past the opening ``[``.  Each element is decoded with ``scan_once``.

    ``values`` is the list of decoded elements and ``index`` is the
    position just past the closing ``]``.

    Raises ValueError when an element cannot be decoded or when something
    other than ``,`` or ``]`` follows an element.
    """
    s, end = s_and_end
    values = []
    # Slicing yields '' at the end of the input instead of raising.
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # ``end`` has already been advanced past the offending
            # character, so step back by one to report where the delimiter
            # was expected (the same position JSONObject reports).
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace after the comma; running off the end of the
        # input is left for scan_once to report on the next iteration.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
271
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default, decoding performs these translations:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity``, and ``-Infinity`` are also understood and map to
    their corresponding ``float`` values, although they are outside the
    JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret any ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect when
        decoding ``unicode`` objects.

        Note that currently only encodings that are a superset of ASCII
        work; strings of other encodings should be passed in as ``unicode``.

        ``object_hook``, if specified, is called with the result of every
        JSON object decoded, and its return value is used in place of the
        given ``dict``.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified, is called with the result of
        every JSON object decoded as an ordered list of pairs.  Its return
        value is used instead of the ``dict``.  This can be used to
        implement custom decoders that rely on the order in which the key
        and value pairs are decoded (for example, collections.OrderedDict
        will remember the order of insertion).  If ``object_hook`` is also
        defined, ``object_pairs_hook`` takes priority.

        ``parse_float``, if specified, is called with the string of every
        JSON float to be decoded.  By default this is equivalent to
        float(num_str).  This can be used to use another datatype or parser
        for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if specified, is called with the string of every
        JSON int to be decoded.  By default this is equivalent to
        int(num_str).  This can be used to use another datatype or parser
        for JSON integers (e.g. float).

        ``parse_constant``, if specified, is called with one of the
        following strings: -Infinity, Infinity, NaN.  This can be used to
        raise an exception if invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), control characters
        are allowed inside strings.  Control characters in this context are
        those with character codes in the 0-31 range, including ``'\\t'``
        (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Plain configuration values.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Number and constant callbacks fall back to the built-in
        # behaviour when the caller supplies nothing.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        # Parsing routines stored on the instance.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Built last, after every attribute above has been assigned.
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Leading and trailing whitespace is skipped; a ValueError is raised
        if anything else follows the document.

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        if stop == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, stop, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document), starting at index ``idx``.

        Returns a 2-tuple of the Python representation and the index in
        ``s`` where the document ended.  This can be used to decode a JSON
        document from a string that may have extraneous data at the end.

        Raises ValueError if no document can be decoded.

        """
        try:
            value, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        return value, stop
384