json-lexer.c revision 9251866320b5f8329a043bb56b3a794f78d12849
1/* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14#include "qstring.h" 15#include "qlist.h" 16#include "qdict.h" 17#include "qint.h" 18#include "qemu-common.h" 19#include "json-lexer.h" 20 21/* 22 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\" 23 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*' 24 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+)) 25 * [{}\[\],:] 26 * [a-z]+ 27 * 28 */ 29 30enum json_lexer_state { 31 ERROR = 0, 32 IN_DONE_STRING, 33 IN_DQ_UCODE3, 34 IN_DQ_UCODE2, 35 IN_DQ_UCODE1, 36 IN_DQ_UCODE0, 37 IN_DQ_STRING_ESCAPE, 38 IN_DQ_STRING, 39 IN_SQ_UCODE3, 40 IN_SQ_UCODE2, 41 IN_SQ_UCODE1, 42 IN_SQ_UCODE0, 43 IN_SQ_STRING_ESCAPE, 44 IN_SQ_STRING, 45 IN_ZERO, 46 IN_DIGITS, 47 IN_DIGIT, 48 IN_EXP_E, 49 IN_MANTISSA, 50 IN_MANTISSA_DIGITS, 51 IN_NONZERO_NUMBER, 52 IN_NEG_NONZERO_NUMBER, 53 IN_KEYWORD, 54 IN_ESCAPE, 55 IN_ESCAPE_L, 56 IN_ESCAPE_LL, 57 IN_ESCAPE_I, 58 IN_ESCAPE_I6, 59 IN_ESCAPE_I64, 60 IN_ESCAPE_DONE, 61 IN_WHITESPACE, 62 IN_OPERATOR_DONE, 63 IN_START, 64}; 65 66#define TERMINAL(state) [0 ... 0x7F] = (state) 67 68static const uint8_t json_lexer[][256] = { 69 [IN_DONE_STRING] = { 70 TERMINAL(JSON_STRING), 71 }, 72 73 /* double quote string */ 74 [IN_DQ_UCODE3] = { 75 ['0' ... '9'] = IN_DQ_STRING, 76 ['a' ... 'f'] = IN_DQ_STRING, 77 ['A' ... 'F'] = IN_DQ_STRING, 78 }, 79 [IN_DQ_UCODE2] = { 80 ['0' ... '9'] = IN_DQ_UCODE3, 81 ['a' ... 'f'] = IN_DQ_UCODE3, 82 ['A' ... 'F'] = IN_DQ_UCODE3, 83 }, 84 [IN_DQ_UCODE1] = { 85 ['0' ... '9'] = IN_DQ_UCODE2, 86 ['a' ... 'f'] = IN_DQ_UCODE2, 87 ['A' ... 'F'] = IN_DQ_UCODE2, 88 }, 89 [IN_DQ_UCODE0] = { 90 ['0' ... '9'] = IN_DQ_UCODE1, 91 ['a' ... 'f'] = IN_DQ_UCODE1, 92 ['A' ... 'F'] = IN_DQ_UCODE1, 93 }, 94 [IN_DQ_STRING_ESCAPE] = { 95 ['b'] = IN_DQ_STRING, 96 ['f'] = IN_DQ_STRING, 97 ['n'] = IN_DQ_STRING, 98 ['r'] = IN_DQ_STRING, 99 ['t'] = IN_DQ_STRING, 100 ['\''] = IN_DQ_STRING, 101 ['\"'] = IN_DQ_STRING, 102 ['u'] = IN_DQ_UCODE0, 103 }, 104 [IN_DQ_STRING] = { 105 [1 ... 0xFF] = IN_DQ_STRING, 106 ['\\'] = IN_DQ_STRING_ESCAPE, 107 ['"'] = IN_DONE_STRING, 108 }, 109 110 /* single quote string */ 111 [IN_SQ_UCODE3] = { 112 ['0' ... '9'] = IN_SQ_STRING, 113 ['a' ... 'f'] = IN_SQ_STRING, 114 ['A' ... 'F'] = IN_SQ_STRING, 115 }, 116 [IN_SQ_UCODE2] = { 117 ['0' ... '9'] = IN_SQ_UCODE3, 118 ['a' ... 'f'] = IN_SQ_UCODE3, 119 ['A' ... 'F'] = IN_SQ_UCODE3, 120 }, 121 [IN_SQ_UCODE1] = { 122 ['0' ... '9'] = IN_SQ_UCODE2, 123 ['a' ... 'f'] = IN_SQ_UCODE2, 124 ['A' ... 'F'] = IN_SQ_UCODE2, 125 }, 126 [IN_SQ_UCODE0] = { 127 ['0' ... '9'] = IN_SQ_UCODE1, 128 ['a' ... 'f'] = IN_SQ_UCODE1, 129 ['A' ... 'F'] = IN_SQ_UCODE1, 130 }, 131 [IN_SQ_STRING_ESCAPE] = { 132 ['b'] = IN_SQ_STRING, 133 ['f'] = IN_SQ_STRING, 134 ['n'] = IN_SQ_STRING, 135 ['r'] = IN_SQ_STRING, 136 ['t'] = IN_SQ_STRING, 137 ['\''] = IN_SQ_STRING, 138 ['\"'] = IN_SQ_STRING, 139 ['u'] = IN_SQ_UCODE0, 140 }, 141 [IN_SQ_STRING] = { 142 [1 ... 0xFF] = IN_SQ_STRING, 143 ['\\'] = IN_SQ_STRING_ESCAPE, 144 ['\''] = IN_DONE_STRING, 145 }, 146 147 /* Zero */ 148 [IN_ZERO] = { 149 TERMINAL(JSON_INTEGER), 150 ['0' ... '9'] = ERROR, 151 ['.'] = IN_MANTISSA, 152 }, 153 154 /* Float */ 155 [IN_DIGITS] = { 156 TERMINAL(JSON_FLOAT), 157 ['0' ... '9'] = IN_DIGITS, 158 }, 159 160 [IN_DIGIT] = { 161 ['0' ... '9'] = IN_DIGITS, 162 }, 163 164 [IN_EXP_E] = { 165 ['-'] = IN_DIGIT, 166 ['+'] = IN_DIGIT, 167 ['0' ... '9'] = IN_DIGITS, 168 }, 169 170 [IN_MANTISSA_DIGITS] = { 171 TERMINAL(JSON_FLOAT), 172 ['0' ... '9'] = IN_MANTISSA_DIGITS, 173 ['e'] = IN_EXP_E, 174 ['E'] = IN_EXP_E, 175 }, 176 177 [IN_MANTISSA] = { 178 ['0' ... '9'] = IN_MANTISSA_DIGITS, 179 }, 180 181 /* Number */ 182 [IN_NONZERO_NUMBER] = { 183 TERMINAL(JSON_INTEGER), 184 ['0' ... '9'] = IN_NONZERO_NUMBER, 185 ['e'] = IN_EXP_E, 186 ['E'] = IN_EXP_E, 187 ['.'] = IN_MANTISSA, 188 }, 189 190 [IN_NEG_NONZERO_NUMBER] = { 191 ['0'] = IN_ZERO, 192 ['1' ... '9'] = IN_NONZERO_NUMBER, 193 }, 194 195 /* keywords */ 196 [IN_KEYWORD] = { 197 TERMINAL(JSON_KEYWORD), 198 ['a' ... 'z'] = IN_KEYWORD, 199 }, 200 201 /* whitespace */ 202 [IN_WHITESPACE] = { 203 TERMINAL(JSON_SKIP), 204 [' '] = IN_WHITESPACE, 205 ['\t'] = IN_WHITESPACE, 206 ['\r'] = IN_WHITESPACE, 207 ['\n'] = IN_WHITESPACE, 208 }, 209 210 /* operator */ 211 [IN_OPERATOR_DONE] = { 212 TERMINAL(JSON_OPERATOR), 213 }, 214 215 /* escape */ 216 [IN_ESCAPE_DONE] = { 217 TERMINAL(JSON_ESCAPE), 218 }, 219 220 [IN_ESCAPE_LL] = { 221 ['d'] = IN_ESCAPE_DONE, 222 }, 223 224 [IN_ESCAPE_L] = { 225 ['d'] = IN_ESCAPE_DONE, 226 ['l'] = IN_ESCAPE_LL, 227 }, 228 229 [IN_ESCAPE_I64] = { 230 ['d'] = IN_ESCAPE_DONE, 231 }, 232 233 [IN_ESCAPE_I6] = { 234 ['4'] = IN_ESCAPE_I64, 235 }, 236 237 [IN_ESCAPE_I] = { 238 ['6'] = IN_ESCAPE_I6, 239 }, 240 241 [IN_ESCAPE] = { 242 ['d'] = IN_ESCAPE_DONE, 243 ['i'] = IN_ESCAPE_DONE, 244 ['p'] = IN_ESCAPE_DONE, 245 ['s'] = IN_ESCAPE_DONE, 246 ['f'] = IN_ESCAPE_DONE, 247 ['l'] = IN_ESCAPE_L, 248 ['I'] = IN_ESCAPE_I, 249 }, 250 251 /* top level rule */ 252 [IN_START] = { 253 ['"'] = IN_DQ_STRING, 254 ['\''] = IN_SQ_STRING, 255 ['0'] = IN_ZERO, 256 ['1' ... '9'] = IN_NONZERO_NUMBER, 257 ['-'] = IN_NEG_NONZERO_NUMBER, 258 ['{'] = IN_OPERATOR_DONE, 259 ['}'] = IN_OPERATOR_DONE, 260 ['['] = IN_OPERATOR_DONE, 261 [']'] = IN_OPERATOR_DONE, 262 [','] = IN_OPERATOR_DONE, 263 [':'] = IN_OPERATOR_DONE, 264 ['a' ... 'z'] = IN_KEYWORD, 265 ['%'] = IN_ESCAPE, 266 [' '] = IN_WHITESPACE, 267 ['\t'] = IN_WHITESPACE, 268 ['\r'] = IN_WHITESPACE, 269 ['\n'] = IN_WHITESPACE, 270 }, 271}; 272 273void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 274{ 275 lexer->emit = func; 276 lexer->state = IN_START; 277 lexer->token = qstring_new(); 278} 279 280static int json_lexer_feed_char(JSONLexer *lexer, char ch) 281{ 282 char buf[2]; 283 284 lexer->x++; 285 if (ch == '\n') { 286 lexer->x = 0; 287 lexer->y++; 288 } 289 290 lexer->state = json_lexer[lexer->state][(uint8_t)ch]; 291 292 switch (lexer->state) { 293 case JSON_OPERATOR: 294 case JSON_ESCAPE: 295 case JSON_INTEGER: 296 case JSON_FLOAT: 297 case JSON_KEYWORD: 298 case JSON_STRING: 299 lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 300 case JSON_SKIP: 301 lexer->state = json_lexer[IN_START][(uint8_t)ch]; 302 QDECREF(lexer->token); 303 lexer->token = qstring_new(); 304 break; 305 case ERROR: 306 return -EINVAL; 307 default: 308 break; 309 } 310 311 buf[0] = ch; 312 buf[1] = 0; 313 314 qstring_append(lexer->token, buf); 315 316 return 0; 317} 318 319int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 320{ 321 size_t i; 322 323 for (i = 0; i < size; i++) { 324 int err; 325 326 err = json_lexer_feed_char(lexer, buffer[i]); 327 if (err < 0) { 328 return err; 329 } 330 } 331 332 return 0; 333} 334 335int json_lexer_flush(JSONLexer *lexer) 336{ 337 return json_lexer_feed_char(lexer, 0); 338} 339 340void json_lexer_destroy(JSONLexer *lexer) 341{ 342 QDECREF(lexer->token); 343} 344