1/*
2 * JSON lexer
3 *
4 * Copyright IBM, Corp. 2009
5 *
6 * Authors:
7 *  Anthony Liguori   <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10 * See the COPYING.LIB file in the top-level directory.
11 *
12 */
13
14#include "qstring.h"
15#include "qlist.h"
16#include "qdict.h"
17#include "qint.h"
18#include "qemu-common.h"
19#include "json-lexer.h"
20
21/*
22 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
23 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * -?(0(\.[0-9]+([eE][-+]?[0-9]+)?)?|[1-9][0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?)
25 * [{}\[\],:]
26 * [a-z]+
27 *
28 */
29
/*
 * Non-terminal states of the lexer state machine.  The enumerators rely on
 * implicit numbering, so their order fixes their values; each one is used
 * as a row index into the transition table.
 */
enum json_lexer_state {
    /* Must stay 0: table entries without an explicit initializer are 0. */
    IN_ERROR = 0,

    /* double-quoted string, including the four hex digits of \uXXXX */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* single-quoted string, same layout as the double-quoted one */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers: leading zero, exponent, fraction, integer part, sign */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* lower-case keywords */
    IN_KEYWORD,

    /* '%' escapes: %d %i %p %s %f %ld %lld %I64d */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,

    /* runs of blanks, tabs, CR and LF */
    IN_WHITESPACE,

    /* initial state, re-entered after every completed token */
    IN_START,
};
62
/* Row initializer: map every 7-bit input byte to TARGET.  Designators that
   follow it in the same row override individual bytes.  */
#define TERMINAL(target) [0 ... 0x7F] = (target)

/* Tell whether the transition from FROM_STATE to CANDIDATE was taken through
   a TERMINAL() default, i.e. the token ended on a lookahead byte that does
   not belong to it.  The test looks at the row's entry for byte 0, which is
   what TERMINAL() fills in for the rows of the table below that use it.  */
#define TERMINAL_NEEDED_LOOKAHEAD(from_state, candidate) \
            (json_lexer[(from_state)][0] == (candidate))
70
71static const uint8_t json_lexer[][256] =  {
72    /* double quote string */
73    [IN_DQ_UCODE3] = {
74        ['0' ... '9'] = IN_DQ_STRING,
75        ['a' ... 'f'] = IN_DQ_STRING,
76        ['A' ... 'F'] = IN_DQ_STRING,
77    },
78    [IN_DQ_UCODE2] = {
79        ['0' ... '9'] = IN_DQ_UCODE3,
80        ['a' ... 'f'] = IN_DQ_UCODE3,
81        ['A' ... 'F'] = IN_DQ_UCODE3,
82    },
83    [IN_DQ_UCODE1] = {
84        ['0' ... '9'] = IN_DQ_UCODE2,
85        ['a' ... 'f'] = IN_DQ_UCODE2,
86        ['A' ... 'F'] = IN_DQ_UCODE2,
87    },
88    [IN_DQ_UCODE0] = {
89        ['0' ... '9'] = IN_DQ_UCODE1,
90        ['a' ... 'f'] = IN_DQ_UCODE1,
91        ['A' ... 'F'] = IN_DQ_UCODE1,
92    },
93    [IN_DQ_STRING_ESCAPE] = {
94        ['b'] = IN_DQ_STRING,
95        ['f'] =  IN_DQ_STRING,
96        ['n'] =  IN_DQ_STRING,
97        ['r'] =  IN_DQ_STRING,
98        ['t'] =  IN_DQ_STRING,
99        ['/'] = IN_DQ_STRING,
100        ['\\'] = IN_DQ_STRING,
101        ['\''] = IN_DQ_STRING,
102        ['\"'] = IN_DQ_STRING,
103        ['u'] = IN_DQ_UCODE0,
104    },
105    [IN_DQ_STRING] = {
106        [1 ... 0xFF] = IN_DQ_STRING,
107        ['\\'] = IN_DQ_STRING_ESCAPE,
108        ['"'] = JSON_STRING,
109    },
110
111    /* single quote string */
112    [IN_SQ_UCODE3] = {
113        ['0' ... '9'] = IN_SQ_STRING,
114        ['a' ... 'f'] = IN_SQ_STRING,
115        ['A' ... 'F'] = IN_SQ_STRING,
116    },
117    [IN_SQ_UCODE2] = {
118        ['0' ... '9'] = IN_SQ_UCODE3,
119        ['a' ... 'f'] = IN_SQ_UCODE3,
120        ['A' ... 'F'] = IN_SQ_UCODE3,
121    },
122    [IN_SQ_UCODE1] = {
123        ['0' ... '9'] = IN_SQ_UCODE2,
124        ['a' ... 'f'] = IN_SQ_UCODE2,
125        ['A' ... 'F'] = IN_SQ_UCODE2,
126    },
127    [IN_SQ_UCODE0] = {
128        ['0' ... '9'] = IN_SQ_UCODE1,
129        ['a' ... 'f'] = IN_SQ_UCODE1,
130        ['A' ... 'F'] = IN_SQ_UCODE1,
131    },
132    [IN_SQ_STRING_ESCAPE] = {
133        ['b'] = IN_SQ_STRING,
134        ['f'] =  IN_SQ_STRING,
135        ['n'] =  IN_SQ_STRING,
136        ['r'] =  IN_SQ_STRING,
137        ['t'] =  IN_SQ_STRING,
138        ['/'] = IN_DQ_STRING,
139        ['\\'] = IN_DQ_STRING,
140        ['\''] = IN_SQ_STRING,
141        ['\"'] = IN_SQ_STRING,
142        ['u'] = IN_SQ_UCODE0,
143    },
144    [IN_SQ_STRING] = {
145        [1 ... 0xFF] = IN_SQ_STRING,
146        ['\\'] = IN_SQ_STRING_ESCAPE,
147        ['\''] = JSON_STRING,
148    },
149
150    /* Zero */
151    [IN_ZERO] = {
152        TERMINAL(JSON_INTEGER),
153        ['0' ... '9'] = IN_ERROR,
154        ['.'] = IN_MANTISSA,
155    },
156
157    /* Float */
158    [IN_DIGITS] = {
159        TERMINAL(JSON_FLOAT),
160        ['0' ... '9'] = IN_DIGITS,
161    },
162
163    [IN_DIGIT] = {
164        ['0' ... '9'] = IN_DIGITS,
165    },
166
167    [IN_EXP_E] = {
168        ['-'] = IN_DIGIT,
169        ['+'] = IN_DIGIT,
170        ['0' ... '9'] = IN_DIGITS,
171    },
172
173    [IN_MANTISSA_DIGITS] = {
174        TERMINAL(JSON_FLOAT),
175        ['0' ... '9'] = IN_MANTISSA_DIGITS,
176        ['e'] = IN_EXP_E,
177        ['E'] = IN_EXP_E,
178    },
179
180    [IN_MANTISSA] = {
181        ['0' ... '9'] = IN_MANTISSA_DIGITS,
182    },
183
184    /* Number */
185    [IN_NONZERO_NUMBER] = {
186        TERMINAL(JSON_INTEGER),
187        ['0' ... '9'] = IN_NONZERO_NUMBER,
188        ['e'] = IN_EXP_E,
189        ['E'] = IN_EXP_E,
190        ['.'] = IN_MANTISSA,
191    },
192
193    [IN_NEG_NONZERO_NUMBER] = {
194        ['0'] = IN_ZERO,
195        ['1' ... '9'] = IN_NONZERO_NUMBER,
196    },
197
198    /* keywords */
199    [IN_KEYWORD] = {
200        TERMINAL(JSON_KEYWORD),
201        ['a' ... 'z'] = IN_KEYWORD,
202    },
203
204    /* whitespace */
205    [IN_WHITESPACE] = {
206        TERMINAL(JSON_SKIP),
207        [' '] = IN_WHITESPACE,
208        ['\t'] = IN_WHITESPACE,
209        ['\r'] = IN_WHITESPACE,
210        ['\n'] = IN_WHITESPACE,
211    },
212
213    /* escape */
214    [IN_ESCAPE_LL] = {
215        ['d'] = JSON_ESCAPE,
216    },
217
218    [IN_ESCAPE_L] = {
219        ['d'] = JSON_ESCAPE,
220        ['l'] = IN_ESCAPE_LL,
221    },
222
223    [IN_ESCAPE_I64] = {
224        ['d'] = JSON_ESCAPE,
225    },
226
227    [IN_ESCAPE_I6] = {
228        ['4'] = IN_ESCAPE_I64,
229    },
230
231    [IN_ESCAPE_I] = {
232        ['6'] = IN_ESCAPE_I6,
233    },
234
235    [IN_ESCAPE] = {
236        ['d'] = JSON_ESCAPE,
237        ['i'] = JSON_ESCAPE,
238        ['p'] = JSON_ESCAPE,
239        ['s'] = JSON_ESCAPE,
240        ['f'] = JSON_ESCAPE,
241        ['l'] = IN_ESCAPE_L,
242        ['I'] = IN_ESCAPE_I,
243    },
244
245    /* top level rule */
246    [IN_START] = {
247        ['"'] = IN_DQ_STRING,
248        ['\''] = IN_SQ_STRING,
249        ['0'] = IN_ZERO,
250        ['1' ... '9'] = IN_NONZERO_NUMBER,
251        ['-'] = IN_NEG_NONZERO_NUMBER,
252        ['{'] = JSON_OPERATOR,
253        ['}'] = JSON_OPERATOR,
254        ['['] = JSON_OPERATOR,
255        [']'] = JSON_OPERATOR,
256        [','] = JSON_OPERATOR,
257        [':'] = JSON_OPERATOR,
258        ['a' ... 'z'] = IN_KEYWORD,
259        ['%'] = IN_ESCAPE,
260        [' '] = IN_WHITESPACE,
261        ['\t'] = IN_WHITESPACE,
262        ['\r'] = IN_WHITESPACE,
263        ['\n'] = IN_WHITESPACE,
264    },
265};
266
267void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
268{
269    lexer->emit = func;
270    lexer->state = IN_START;
271    lexer->token = qstring_new();
272    lexer->x = lexer->y = 0;
273}
274
275static int json_lexer_feed_char(JSONLexer *lexer, char ch)
276{
277    int char_consumed, new_state;
278
279    lexer->x++;
280    if (ch == '\n') {
281        lexer->x = 0;
282        lexer->y++;
283    }
284
285    do {
286        new_state = json_lexer[lexer->state][(uint8_t)ch];
287        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
288        if (char_consumed) {
289            qstring_append_chr(lexer->token, ch);
290        }
291
292        switch (new_state) {
293        case JSON_OPERATOR:
294        case JSON_ESCAPE:
295        case JSON_INTEGER:
296        case JSON_FLOAT:
297        case JSON_KEYWORD:
298        case JSON_STRING:
299            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
300        case JSON_SKIP:
301            QDECREF(lexer->token);
302            lexer->token = qstring_new();
303            new_state = IN_START;
304            break;
305        case IN_ERROR:
306            return -EINVAL;
307        default:
308            break;
309        }
310        lexer->state = new_state;
311    } while (!char_consumed);
312    return 0;
313}
314
315int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
316{
317    size_t i;
318
319    for (i = 0; i < size; i++) {
320        int err;
321
322        err = json_lexer_feed_char(lexer, buffer[i]);
323        if (err < 0) {
324            return err;
325        }
326    }
327
328    return 0;
329}
330
331int json_lexer_flush(JSONLexer *lexer)
332{
333    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0);
334}
335
336void json_lexer_destroy(JSONLexer *lexer)
337{
338    QDECREF(lexer->token);
339}
340