json-lexer.c revision 9251866320b5f8329a043bb56b3a794f78d12849
1/*
2 * JSON lexer
3 *
4 * Copyright IBM, Corp. 2009
5 *
6 * Authors:
7 *  Anthony Liguori   <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10 * See the COPYING.LIB file in the top-level directory.
11 *
12 */
13
#include <stddef.h>

#include "qstring.h"
#include "qlist.h"
#include "qdict.h"
#include "qint.h"
#include "qemu-common.h"
#include "json-lexer.h"
20
21/*
22 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
23 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * -?(0(\.[0-9]+([eE][-+]?[0-9]+)?)?|[1-9][0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?)
25 * [{}\[\],:]
26 * [a-z]+
27 *
28 */
29
/*
 * States of the lexer's transition table.
 *
 * The enumerators are used as row indices into json_lexer[], so their order
 * (and therefore their numeric values) must not change.  ERROR is
 * deliberately zero: any table entry that is not explicitly initialised
 * therefore means "no valid transition".
 */
enum json_lexer_state {
    ERROR = 0,

    /* closing quote of either string flavour has been consumed */
    IN_DONE_STRING,

    /* double-quoted string: \uXXXX digits, escape introducer, body */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* single-quoted string: \uXXXX digits, escape introducer, body */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* lower-case keywords */
    IN_KEYWORD,

    /* '%' escapes (%d, %ld, %lld, %I64d, ...) */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    IN_ESCAPE_DONE,

    /* whitespace run, single-character operator, initial state */
    IN_WHITESPACE,
    IN_OPERATOR_DONE,
    IN_START,
};
65
66#define TERMINAL(state) [0 ... 0x7F] = (state)
67
68static const uint8_t json_lexer[][256] =  {
69    [IN_DONE_STRING] = {
70        TERMINAL(JSON_STRING),
71    },
72
73    /* double quote string */
74    [IN_DQ_UCODE3] = {
75        ['0' ... '9'] = IN_DQ_STRING,
76        ['a' ... 'f'] = IN_DQ_STRING,
77        ['A' ... 'F'] = IN_DQ_STRING,
78    },
79    [IN_DQ_UCODE2] = {
80        ['0' ... '9'] = IN_DQ_UCODE3,
81        ['a' ... 'f'] = IN_DQ_UCODE3,
82        ['A' ... 'F'] = IN_DQ_UCODE3,
83    },
84    [IN_DQ_UCODE1] = {
85        ['0' ... '9'] = IN_DQ_UCODE2,
86        ['a' ... 'f'] = IN_DQ_UCODE2,
87        ['A' ... 'F'] = IN_DQ_UCODE2,
88    },
89    [IN_DQ_UCODE0] = {
90        ['0' ... '9'] = IN_DQ_UCODE1,
91        ['a' ... 'f'] = IN_DQ_UCODE1,
92        ['A' ... 'F'] = IN_DQ_UCODE1,
93    },
94    [IN_DQ_STRING_ESCAPE] = {
95        ['b'] = IN_DQ_STRING,
96        ['f'] =  IN_DQ_STRING,
97        ['n'] =  IN_DQ_STRING,
98        ['r'] =  IN_DQ_STRING,
99        ['t'] =  IN_DQ_STRING,
100        ['\''] = IN_DQ_STRING,
101        ['\"'] = IN_DQ_STRING,
102        ['u'] = IN_DQ_UCODE0,
103    },
104    [IN_DQ_STRING] = {
105        [1 ... 0xFF] = IN_DQ_STRING,
106        ['\\'] = IN_DQ_STRING_ESCAPE,
107        ['"'] = IN_DONE_STRING,
108    },
109
110    /* single quote string */
111    [IN_SQ_UCODE3] = {
112        ['0' ... '9'] = IN_SQ_STRING,
113        ['a' ... 'f'] = IN_SQ_STRING,
114        ['A' ... 'F'] = IN_SQ_STRING,
115    },
116    [IN_SQ_UCODE2] = {
117        ['0' ... '9'] = IN_SQ_UCODE3,
118        ['a' ... 'f'] = IN_SQ_UCODE3,
119        ['A' ... 'F'] = IN_SQ_UCODE3,
120    },
121    [IN_SQ_UCODE1] = {
122        ['0' ... '9'] = IN_SQ_UCODE2,
123        ['a' ... 'f'] = IN_SQ_UCODE2,
124        ['A' ... 'F'] = IN_SQ_UCODE2,
125    },
126    [IN_SQ_UCODE0] = {
127        ['0' ... '9'] = IN_SQ_UCODE1,
128        ['a' ... 'f'] = IN_SQ_UCODE1,
129        ['A' ... 'F'] = IN_SQ_UCODE1,
130    },
131    [IN_SQ_STRING_ESCAPE] = {
132        ['b'] = IN_SQ_STRING,
133        ['f'] =  IN_SQ_STRING,
134        ['n'] =  IN_SQ_STRING,
135        ['r'] =  IN_SQ_STRING,
136        ['t'] =  IN_SQ_STRING,
137        ['\''] = IN_SQ_STRING,
138        ['\"'] = IN_SQ_STRING,
139        ['u'] = IN_SQ_UCODE0,
140    },
141    [IN_SQ_STRING] = {
142        [1 ... 0xFF] = IN_SQ_STRING,
143        ['\\'] = IN_SQ_STRING_ESCAPE,
144        ['\''] = IN_DONE_STRING,
145    },
146
147    /* Zero */
148    [IN_ZERO] = {
149        TERMINAL(JSON_INTEGER),
150        ['0' ... '9'] = ERROR,
151        ['.'] = IN_MANTISSA,
152    },
153
154    /* Float */
155    [IN_DIGITS] = {
156        TERMINAL(JSON_FLOAT),
157        ['0' ... '9'] = IN_DIGITS,
158    },
159
160    [IN_DIGIT] = {
161        ['0' ... '9'] = IN_DIGITS,
162    },
163
164    [IN_EXP_E] = {
165        ['-'] = IN_DIGIT,
166        ['+'] = IN_DIGIT,
167        ['0' ... '9'] = IN_DIGITS,
168    },
169
170    [IN_MANTISSA_DIGITS] = {
171        TERMINAL(JSON_FLOAT),
172        ['0' ... '9'] = IN_MANTISSA_DIGITS,
173        ['e'] = IN_EXP_E,
174        ['E'] = IN_EXP_E,
175    },
176
177    [IN_MANTISSA] = {
178        ['0' ... '9'] = IN_MANTISSA_DIGITS,
179    },
180
181    /* Number */
182    [IN_NONZERO_NUMBER] = {
183        TERMINAL(JSON_INTEGER),
184        ['0' ... '9'] = IN_NONZERO_NUMBER,
185        ['e'] = IN_EXP_E,
186        ['E'] = IN_EXP_E,
187        ['.'] = IN_MANTISSA,
188    },
189
190    [IN_NEG_NONZERO_NUMBER] = {
191        ['0'] = IN_ZERO,
192        ['1' ... '9'] = IN_NONZERO_NUMBER,
193    },
194
195    /* keywords */
196    [IN_KEYWORD] = {
197        TERMINAL(JSON_KEYWORD),
198        ['a' ... 'z'] = IN_KEYWORD,
199    },
200
201    /* whitespace */
202    [IN_WHITESPACE] = {
203        TERMINAL(JSON_SKIP),
204        [' '] = IN_WHITESPACE,
205        ['\t'] = IN_WHITESPACE,
206        ['\r'] = IN_WHITESPACE,
207        ['\n'] = IN_WHITESPACE,
208    },
209
210    /* operator */
211    [IN_OPERATOR_DONE] = {
212        TERMINAL(JSON_OPERATOR),
213    },
214
215    /* escape */
216    [IN_ESCAPE_DONE] = {
217        TERMINAL(JSON_ESCAPE),
218    },
219
220    [IN_ESCAPE_LL] = {
221        ['d'] = IN_ESCAPE_DONE,
222    },
223
224    [IN_ESCAPE_L] = {
225        ['d'] = IN_ESCAPE_DONE,
226        ['l'] = IN_ESCAPE_LL,
227    },
228
229    [IN_ESCAPE_I64] = {
230        ['d'] = IN_ESCAPE_DONE,
231    },
232
233    [IN_ESCAPE_I6] = {
234        ['4'] = IN_ESCAPE_I64,
235    },
236
237    [IN_ESCAPE_I] = {
238        ['6'] = IN_ESCAPE_I6,
239    },
240
241    [IN_ESCAPE] = {
242        ['d'] = IN_ESCAPE_DONE,
243        ['i'] = IN_ESCAPE_DONE,
244        ['p'] = IN_ESCAPE_DONE,
245        ['s'] = IN_ESCAPE_DONE,
246        ['f'] = IN_ESCAPE_DONE,
247        ['l'] = IN_ESCAPE_L,
248        ['I'] = IN_ESCAPE_I,
249    },
250
251    /* top level rule */
252    [IN_START] = {
253        ['"'] = IN_DQ_STRING,
254        ['\''] = IN_SQ_STRING,
255        ['0'] = IN_ZERO,
256        ['1' ... '9'] = IN_NONZERO_NUMBER,
257        ['-'] = IN_NEG_NONZERO_NUMBER,
258        ['{'] = IN_OPERATOR_DONE,
259        ['}'] = IN_OPERATOR_DONE,
260        ['['] = IN_OPERATOR_DONE,
261        [']'] = IN_OPERATOR_DONE,
262        [','] = IN_OPERATOR_DONE,
263        [':'] = IN_OPERATOR_DONE,
264        ['a' ... 'z'] = IN_KEYWORD,
265        ['%'] = IN_ESCAPE,
266        [' '] = IN_WHITESPACE,
267        ['\t'] = IN_WHITESPACE,
268        ['\r'] = IN_WHITESPACE,
269        ['\n'] = IN_WHITESPACE,
270    },
271};
272
273void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
274{
275    lexer->emit = func;
276    lexer->state = IN_START;
277    lexer->token = qstring_new();
278}
279
280static int json_lexer_feed_char(JSONLexer *lexer, char ch)
281{
282    char buf[2];
283
284    lexer->x++;
285    if (ch == '\n') {
286        lexer->x = 0;
287        lexer->y++;
288    }
289
290    lexer->state = json_lexer[lexer->state][(uint8_t)ch];
291
292    switch (lexer->state) {
293    case JSON_OPERATOR:
294    case JSON_ESCAPE:
295    case JSON_INTEGER:
296    case JSON_FLOAT:
297    case JSON_KEYWORD:
298    case JSON_STRING:
299        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
300    case JSON_SKIP:
301        lexer->state = json_lexer[IN_START][(uint8_t)ch];
302        QDECREF(lexer->token);
303        lexer->token = qstring_new();
304        break;
305    case ERROR:
306        return -EINVAL;
307    default:
308        break;
309    }
310
311    buf[0] = ch;
312    buf[1] = 0;
313
314    qstring_append(lexer->token, buf);
315
316    return 0;
317}
318
319int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
320{
321    size_t i;
322
323    for (i = 0; i < size; i++) {
324        int err;
325
326        err = json_lexer_feed_char(lexer, buffer[i]);
327        if (err < 0) {
328            return err;
329        }
330    }
331
332    return 0;
333}
334
335int json_lexer_flush(JSONLexer *lexer)
336{
337    return json_lexer_feed_char(lexer, 0);
338}
339
340void json_lexer_destroy(JSONLexer *lexer)
341{
342    QDECREF(lexer->token);
343}
344