1/*
2 * JSON lexer
3 *
4 * Copyright IBM, Corp. 2009
5 *
6 * Authors:
7 *  Anthony Liguori   <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10 * See the COPYING.LIB file in the top-level directory.
11 *
12 */
13
14#include "qapi/qmp/qstring.h"
15#include "qapi/qmp/qlist.h"
16#include "qapi/qmp/qdict.h"
17#include "qapi/qmp/qint.h"
18#include "qemu-common.h"
19#include "qapi/qmp/json-lexer.h"
20
/*
 * Upper bound, in bytes, on the text of a single token (64 MiB).  The
 * lexer compares the accumulated token length against this limit after
 * every character it is fed.
 */
#define MAX_TOKEN_SIZE (64ULL * 1024 * 1024)
22
/*
 * Token grammar accepted by the state table below:
 *
 * Double-quoted string:
 *   "([^\\"]|\\["'\\/bfnrt]|\\u[0-9a-fA-F]{4})*"
 * Single-quoted string:
 *   '([^\\']|\\["'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
 * Number (the fraction and the exponent are both optional):
 *   -?(0(\.[0-9]+([eE][-+]?[0-9]+)?)?|[1-9][0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?)
 * Operator:
 *   [{}\[\],:]
 * Keyword:
 *   [a-z]+
 *
 */
31
/*
 * States of the lexer's state machine.
 *
 * Each state is a row index into json_lexer[], and the cells of that
 * table hold either one of these states or a JSON_* token type, so the
 * values here must fit in uint8_t and stay distinct from the JSON_*
 * values.  IN_ERROR must be 0: table cells without an explicit
 * initializer are zero and therefore mean "error".
 */
enum json_lexer_state {
    IN_ERROR = 0,

    /* inside a double-quoted string */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* inside a single-quoted string */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* inside a number */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* inside a keyword */
    IN_KEYWORD,

    /* inside a '%' escape */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,

    /* between tokens */
    IN_WHITESPACE,
    IN_START,
};
64
/*
 * Row initializer fragment for json_lexer[]: every 7-bit input byte
 * (0 through 0x7F) maps to TOKEN.  Designators that follow it in the
 * same row override individual bytes.
 */
#define TERMINAL(token) [0 ... 0x7F] = (token)

/*
 * True when the row of state FROM maps input byte 0 to TO.
 *
 * A row built with TERMINAL() maps byte 0 to its terminal token, so for
 * such a row this reports that the move to TO was triggered by a
 * lookahead byte that does not belong to the token.  A row that does not
 * use TERMINAL() leaves byte 0 at zero, i.e. IN_ERROR, so for those rows
 * the result is true exactly when TO is IN_ERROR.
 */
#define TERMINAL_NEEDED_LOOKAHEAD(from, to) \
            (json_lexer[(from)][0] == (to))
72
73static const uint8_t json_lexer[][256] =  {
74    /* double quote string */
75    [IN_DQ_UCODE3] = {
76        ['0' ... '9'] = IN_DQ_STRING,
77        ['a' ... 'f'] = IN_DQ_STRING,
78        ['A' ... 'F'] = IN_DQ_STRING,
79    },
80    [IN_DQ_UCODE2] = {
81        ['0' ... '9'] = IN_DQ_UCODE3,
82        ['a' ... 'f'] = IN_DQ_UCODE3,
83        ['A' ... 'F'] = IN_DQ_UCODE3,
84    },
85    [IN_DQ_UCODE1] = {
86        ['0' ... '9'] = IN_DQ_UCODE2,
87        ['a' ... 'f'] = IN_DQ_UCODE2,
88        ['A' ... 'F'] = IN_DQ_UCODE2,
89    },
90    [IN_DQ_UCODE0] = {
91        ['0' ... '9'] = IN_DQ_UCODE1,
92        ['a' ... 'f'] = IN_DQ_UCODE1,
93        ['A' ... 'F'] = IN_DQ_UCODE1,
94    },
95    [IN_DQ_STRING_ESCAPE] = {
96        ['b'] = IN_DQ_STRING,
97        ['f'] =  IN_DQ_STRING,
98        ['n'] =  IN_DQ_STRING,
99        ['r'] =  IN_DQ_STRING,
100        ['t'] =  IN_DQ_STRING,
101        ['/'] = IN_DQ_STRING,
102        ['\\'] = IN_DQ_STRING,
103        ['\''] = IN_DQ_STRING,
104        ['\"'] = IN_DQ_STRING,
105        ['u'] = IN_DQ_UCODE0,
106    },
107    [IN_DQ_STRING] = {
108        [1 ... 0xBF] = IN_DQ_STRING,
109        [0xC2 ... 0xF4] = IN_DQ_STRING,
110        ['\\'] = IN_DQ_STRING_ESCAPE,
111        ['"'] = JSON_STRING,
112    },
113
114    /* single quote string */
115    [IN_SQ_UCODE3] = {
116        ['0' ... '9'] = IN_SQ_STRING,
117        ['a' ... 'f'] = IN_SQ_STRING,
118        ['A' ... 'F'] = IN_SQ_STRING,
119    },
120    [IN_SQ_UCODE2] = {
121        ['0' ... '9'] = IN_SQ_UCODE3,
122        ['a' ... 'f'] = IN_SQ_UCODE3,
123        ['A' ... 'F'] = IN_SQ_UCODE3,
124    },
125    [IN_SQ_UCODE1] = {
126        ['0' ... '9'] = IN_SQ_UCODE2,
127        ['a' ... 'f'] = IN_SQ_UCODE2,
128        ['A' ... 'F'] = IN_SQ_UCODE2,
129    },
130    [IN_SQ_UCODE0] = {
131        ['0' ... '9'] = IN_SQ_UCODE1,
132        ['a' ... 'f'] = IN_SQ_UCODE1,
133        ['A' ... 'F'] = IN_SQ_UCODE1,
134    },
135    [IN_SQ_STRING_ESCAPE] = {
136        ['b'] = IN_SQ_STRING,
137        ['f'] =  IN_SQ_STRING,
138        ['n'] =  IN_SQ_STRING,
139        ['r'] =  IN_SQ_STRING,
140        ['t'] =  IN_SQ_STRING,
141        ['/'] = IN_DQ_STRING,
142        ['\\'] = IN_DQ_STRING,
143        ['\''] = IN_SQ_STRING,
144        ['\"'] = IN_SQ_STRING,
145        ['u'] = IN_SQ_UCODE0,
146    },
147    [IN_SQ_STRING] = {
148        [1 ... 0xBF] = IN_SQ_STRING,
149        [0xC2 ... 0xF4] = IN_SQ_STRING,
150        ['\\'] = IN_SQ_STRING_ESCAPE,
151        ['\''] = JSON_STRING,
152    },
153
154    /* Zero */
155    [IN_ZERO] = {
156        TERMINAL(JSON_INTEGER),
157        ['0' ... '9'] = IN_ERROR,
158        ['.'] = IN_MANTISSA,
159    },
160
161    /* Float */
162    [IN_DIGITS] = {
163        TERMINAL(JSON_FLOAT),
164        ['0' ... '9'] = IN_DIGITS,
165    },
166
167    [IN_DIGIT] = {
168        ['0' ... '9'] = IN_DIGITS,
169    },
170
171    [IN_EXP_E] = {
172        ['-'] = IN_DIGIT,
173        ['+'] = IN_DIGIT,
174        ['0' ... '9'] = IN_DIGITS,
175    },
176
177    [IN_MANTISSA_DIGITS] = {
178        TERMINAL(JSON_FLOAT),
179        ['0' ... '9'] = IN_MANTISSA_DIGITS,
180        ['e'] = IN_EXP_E,
181        ['E'] = IN_EXP_E,
182    },
183
184    [IN_MANTISSA] = {
185        ['0' ... '9'] = IN_MANTISSA_DIGITS,
186    },
187
188    /* Number */
189    [IN_NONZERO_NUMBER] = {
190        TERMINAL(JSON_INTEGER),
191        ['0' ... '9'] = IN_NONZERO_NUMBER,
192        ['e'] = IN_EXP_E,
193        ['E'] = IN_EXP_E,
194        ['.'] = IN_MANTISSA,
195    },
196
197    [IN_NEG_NONZERO_NUMBER] = {
198        ['0'] = IN_ZERO,
199        ['1' ... '9'] = IN_NONZERO_NUMBER,
200    },
201
202    /* keywords */
203    [IN_KEYWORD] = {
204        TERMINAL(JSON_KEYWORD),
205        ['a' ... 'z'] = IN_KEYWORD,
206    },
207
208    /* whitespace */
209    [IN_WHITESPACE] = {
210        TERMINAL(JSON_SKIP),
211        [' '] = IN_WHITESPACE,
212        ['\t'] = IN_WHITESPACE,
213        ['\r'] = IN_WHITESPACE,
214        ['\n'] = IN_WHITESPACE,
215    },
216
217    /* escape */
218    [IN_ESCAPE_LL] = {
219        ['d'] = JSON_ESCAPE,
220    },
221
222    [IN_ESCAPE_L] = {
223        ['d'] = JSON_ESCAPE,
224        ['l'] = IN_ESCAPE_LL,
225    },
226
227    [IN_ESCAPE_I64] = {
228        ['d'] = JSON_ESCAPE,
229    },
230
231    [IN_ESCAPE_I6] = {
232        ['4'] = IN_ESCAPE_I64,
233    },
234
235    [IN_ESCAPE_I] = {
236        ['6'] = IN_ESCAPE_I6,
237    },
238
239    [IN_ESCAPE] = {
240        ['d'] = JSON_ESCAPE,
241        ['i'] = JSON_ESCAPE,
242        ['p'] = JSON_ESCAPE,
243        ['s'] = JSON_ESCAPE,
244        ['f'] = JSON_ESCAPE,
245        ['l'] = IN_ESCAPE_L,
246        ['I'] = IN_ESCAPE_I,
247    },
248
249    /* top level rule */
250    [IN_START] = {
251        ['"'] = IN_DQ_STRING,
252        ['\''] = IN_SQ_STRING,
253        ['0'] = IN_ZERO,
254        ['1' ... '9'] = IN_NONZERO_NUMBER,
255        ['-'] = IN_NEG_NONZERO_NUMBER,
256        ['{'] = JSON_OPERATOR,
257        ['}'] = JSON_OPERATOR,
258        ['['] = JSON_OPERATOR,
259        [']'] = JSON_OPERATOR,
260        [','] = JSON_OPERATOR,
261        [':'] = JSON_OPERATOR,
262        ['a' ... 'z'] = IN_KEYWORD,
263        ['%'] = IN_ESCAPE,
264        [' '] = IN_WHITESPACE,
265        ['\t'] = IN_WHITESPACE,
266        ['\r'] = IN_WHITESPACE,
267        ['\n'] = IN_WHITESPACE,
268    },
269};
270
271void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
272{
273    lexer->emit = func;
274    lexer->state = IN_START;
275    lexer->token = qstring_new();
276    lexer->x = lexer->y = 0;
277}
278
279static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
280{
281    int char_consumed, new_state;
282
283    lexer->x++;
284    if (ch == '\n') {
285        lexer->x = 0;
286        lexer->y++;
287    }
288
289    do {
290        new_state = json_lexer[lexer->state][(uint8_t)ch];
291        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
292        if (char_consumed) {
293            qstring_append_chr(lexer->token, ch);
294        }
295
296        switch (new_state) {
297        case JSON_OPERATOR:
298        case JSON_ESCAPE:
299        case JSON_INTEGER:
300        case JSON_FLOAT:
301        case JSON_KEYWORD:
302        case JSON_STRING:
303            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
304            /* fall through */
305        case JSON_SKIP:
306            QDECREF(lexer->token);
307            lexer->token = qstring_new();
308            new_state = IN_START;
309            break;
310        case IN_ERROR:
311            /* XXX: To avoid having previous bad input leaving the parser in an
312             * unresponsive state where we consume unpredictable amounts of
313             * subsequent "good" input, percolate this error state up to the
314             * tokenizer/parser by forcing a NULL object to be emitted, then
315             * reset state.
316             *
317             * Also note that this handling is required for reliable channel
318             * negotiation between QMP and the guest agent, since chr(0xFF)
319             * is placed at the beginning of certain events to ensure proper
320             * delivery when the channel is in an unknown state. chr(0xFF) is
321             * never a valid ASCII/UTF-8 sequence, so this should reliably
322             * induce an error/flush state.
323             */
324            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
325            QDECREF(lexer->token);
326            lexer->token = qstring_new();
327            new_state = IN_START;
328            lexer->state = new_state;
329            return 0;
330        default:
331            break;
332        }
333        lexer->state = new_state;
334    } while (!char_consumed && !flush);
335
336    /* Do not let a single token grow to an arbitrarily large size,
337     * this is a security consideration.
338     */
339    if (lexer->token->length > MAX_TOKEN_SIZE) {
340        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
341        QDECREF(lexer->token);
342        lexer->token = qstring_new();
343        lexer->state = IN_START;
344    }
345
346    return 0;
347}
348
349int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
350{
351    size_t i;
352
353    for (i = 0; i < size; i++) {
354        int err;
355
356        err = json_lexer_feed_char(lexer, buffer[i], false);
357        if (err < 0) {
358            return err;
359        }
360    }
361
362    return 0;
363}
364
365int json_lexer_flush(JSONLexer *lexer)
366{
367    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
368}
369
370void json_lexer_destroy(JSONLexer *lexer)
371{
372    QDECREF(lexer->token);
373}
374