tokenizer.c revision fbab905ae1fe6320268301193d953d25d2acb5c1
1/*********************************************************** 2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The 3Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI not be used in advertising or publicity pertaining to 13distribution of the software without specific, written prior permission. 14 15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO 16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE 18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 22 23******************************************************************/ 24 25/* Tokenizer implementation */ 26 27/* XXX This is rather old, should be restructured perhaps */ 28/* XXX Need a better interface to report errors than writing to stderr */ 29/* XXX Should use editor resource to fetch true tab size on Macintosh */ 30 31#include "pgenheaders.h" 32 33#include <ctype.h> 34#include "string.h" 35 36#include "fgetsintr.h" 37#include "tokenizer.h" 38#include "errcode.h" 39 40#ifdef macintosh 41#define TABSIZE 4 42#endif 43 44#ifndef TABSIZE 45#define TABSIZE 8 46#endif 47 48/* Forward */ 49static struct tok_state *tok_new PROTO((void)); 50static int tok_nextc PROTO((struct tok_state *tok)); 51static void tok_backup PROTO((struct tok_state *tok, int c)); 52 53/* Token names */ 54 55char *tok_name[] = { 56 "ENDMARKER", 57 "NAME", 58 "NUMBER", 59 "STRING", 60 "NEWLINE", 61 "INDENT", 62 "DEDENT", 63 "LPAR", 64 "RPAR", 65 "LSQB", 66 "RSQB", 67 "COLON", 68 "COMMA", 69 "SEMI", 70 "PLUS", 71 "MINUS", 72 "STAR", 73 "SLASH", 74 "VBAR", 75 "AMPER", 76 "LESS", 77 "GREATER", 78 "EQUAL", 79 "DOT", 80 "PERCENT", 81 "BACKQUOTE", 82 "LBRACE", 83 "RBRACE", 84 "EQEQUAL", 85 "NOTEQUAL", 86 "LESSEQUAL", 87 "GREATEREQUAL", 88 /* This table must match the #defines in token.h! */ 89 "OP", 90 "<ERRORTOKEN>", 91 "<N_TOKENS>" 92}; 93 94 95/* Create and initialize a new tok_state structure */ 96 97static struct tok_state * 98tok_new() 99{ 100 struct tok_state *tok = NEW(struct tok_state, 1); 101 if (tok == NULL) 102 return NULL; 103 tok->buf = tok->cur = tok->end = tok->inp = NULL; 104 tok->done = E_OK; 105 tok->fp = NULL; 106 tok->tabsize = TABSIZE; 107 tok->indent = 0; 108 tok->indstack[0] = 0; 109 tok->atbol = 1; 110 tok->pendin = 0; 111 tok->prompt = tok->nextprompt = NULL; 112 tok->lineno = 0; 113 return tok; 114} 115 116 117/* Set up tokenizer for string */ 118 119struct tok_state * 120tok_setups(str) 121 char *str; 122{ 123 struct tok_state *tok = tok_new(); 124 if (tok == NULL) 125 return NULL; 126 tok->buf = tok->cur = str; 127 tok->end = tok->inp = strchr(str, '\0'); 128 return tok; 129} 130 131 132/* Set up tokenizer for file */ 133 134struct tok_state * 135tok_setupf(fp, ps1, ps2) 136 FILE *fp; 137 char *ps1, *ps2; 138{ 139 struct tok_state *tok = tok_new(); 140 if (tok == NULL) 141 return NULL; 142 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) { 143 DEL(tok); 144 return NULL; 145 } 146 tok->cur = tok->inp = tok->buf; 147 tok->end = tok->buf + BUFSIZ; 148 tok->fp = fp; 149 tok->prompt = ps1; 150 tok->nextprompt = ps2; 151 return tok; 152} 153 154 155/* Free a tok_state structure */ 156 157void 158tok_free(tok) 159 struct tok_state *tok; 160{ 161 /* XXX really need a separate flag to say 'my buffer' */ 162 if (tok->fp != NULL && tok->buf != NULL) 163 DEL(tok->buf); 164 DEL(tok); 165} 166 167 168/* Get next char, updating state; error code goes into tok->done */ 169 170static int 171tok_nextc(tok) 172 register struct tok_state *tok; 173{ 174 if (tok->done != E_OK) 175 return EOF; 176 177 for (;;) { 178 if (tok->cur < tok->inp) 179 return *tok->cur++; 180 if (tok->fp == NULL) { 181 tok->done = E_EOF; 182 return EOF; 183 } 184 if (tok->inp > tok->buf && tok->inp[-1] == '\n') 185 tok->inp = tok->buf; 186 if (tok->inp == tok->end) { 187 int n = tok->end - tok->buf; 188 char *new = tok->buf; 189 RESIZE(new, char, n+n); 190 if (new == NULL) { 191 fprintf(stderr, "tokenizer out of mem\n"); 192 tok->done = E_NOMEM; 193 return EOF; 194 } 195 tok->buf = new; 196 tok->inp = tok->buf + n; 197 tok->end = tok->inp + n; 198 } 199#ifdef USE_READLINE 200 if (tok->prompt != NULL) { 201 extern char *readline PROTO((char *prompt)); 202 static int been_here; 203 if (!been_here) { 204 /* Force rebind of TAB to insert-tab */ 205 extern int rl_insert(); 206 rl_bind_key('\t', rl_insert); 207 been_here++; 208 } 209 if (tok->buf != NULL) 210 free(tok->buf); 211 tok->buf = readline(tok->prompt); 212 (void) intrcheck(); /* Clear pending interrupt */ 213 if (tok->nextprompt != NULL) 214 tok->prompt = tok->nextprompt; 215 /* XXX different semantics w/o readline()! */ 216 if (tok->buf == NULL) { 217 tok->done = E_EOF; 218 } 219 else { 220 unsigned int n = strlen(tok->buf); 221 if (n > 0) 222 add_history(tok->buf); 223 /* Append the '\n' that readline() 224 doesn't give us, for the tokenizer... */ 225 tok->buf = realloc(tok->buf, n+2); 226 if (tok->buf == NULL) 227 tok->done = E_NOMEM; 228 else { 229 tok->end = tok->buf + n; 230 *tok->end++ = '\n'; 231 *tok->end = '\0'; 232 tok->inp = tok->end; 233 tok->cur = tok->buf; 234 } 235 } 236 } 237 else 238#endif 239 { 240 tok->cur = tok->inp; 241 if (tok->prompt != NULL && tok->inp == tok->buf) { 242 fprintf(stderr, "%s", tok->prompt); 243 tok->prompt = tok->nextprompt; 244 } 245 tok->done = fgets_intr(tok->inp, 246 (int)(tok->end - tok->inp), tok->fp); 247 } 248 if (tok->done != E_OK) { 249 if (tok->prompt != NULL) 250 fprintf(stderr, "\n"); 251 return EOF; 252 } 253 tok->inp = strchr(tok->inp, '\0'); 254 } 255} 256 257 258/* Back-up one character */ 259 260static void 261tok_backup(tok, c) 262 register struct tok_state *tok; 263 register int c; 264{ 265 if (c != EOF) { 266 if (--tok->cur < tok->buf) { 267 fprintf(stderr, "tok_backup: begin of buffer\n"); 268 abort(); 269 } 270 if (*tok->cur != c) 271 *tok->cur = c; 272 } 273} 274 275 276/* Return the token corresponding to a single character */ 277 278int 279tok_1char(c) 280 int c; 281{ 282 switch (c) { 283 case '(': return LPAR; 284 case ')': return RPAR; 285 case '[': return LSQB; 286 case ']': return RSQB; 287 case ':': return COLON; 288 case ',': return COMMA; 289 case ';': return SEMI; 290 case '+': return PLUS; 291 case '-': return MINUS; 292 case '*': return STAR; 293 case '/': return SLASH; 294 case '|': return VBAR; 295 case '&': return AMPER; 296 case '<': return LESS; 297 case '>': return GREATER; 298 case '=': return EQUAL; 299 case '.': return DOT; 300 case '%': return PERCENT; 301 case '`': return BACKQUOTE; 302 case '{': return LBRACE; 303 case '}': return RBRACE; 304 default: return OP; 305 } 306} 307 308 309int 310tok_2char(c1, c2) 311 int c1, c2; 312{ 313 switch (c1) { 314 case '=': 315 switch (c2) { 316 case '=': return EQEQUAL; 317 } 318 break; 319 case '!': 320 switch (c2) { 321 case '=': return NOTEQUAL; 322 } 323 break; 324 case '<': 325 switch (c2) { 326 case '>': return NOTEQUAL; 327 case '=': return LESSEQUAL; 328 } 329 break; 330 case '>': 331 switch (c2) { 332 case '=': return GREATEREQUAL; 333 } 334 break; 335 } 336 return OP; 337} 338 339 340/* Get next token, after space stripping etc. */ 341 342int 343tok_get(tok, p_start, p_end) 344 register struct tok_state *tok; /* In/out: tokenizer state */ 345 char **p_start, **p_end; /* Out: point to start/end of token */ 346{ 347 register int c; 348 int blankline; 349 350 nextline: 351 blankline = 0; 352 353 /* Get indentation level */ 354 if (tok->atbol) { 355 register int col = 0; 356 tok->atbol = 0; 357 tok->lineno++; 358 for (;;) { 359 c = tok_nextc(tok); 360 if (c == ' ') 361 col++; 362 else if (c == '\t') 363 col = (col/tok->tabsize + 1) * tok->tabsize; 364 else 365 break; 366 } 367 tok_backup(tok, c); 368 if (c == '#' || c == '\n') { 369 /* Lines with only whitespace and/or comments 370 shouldn't affect the indentation and are 371 not passed to the parser as NEWLINE tokens, 372 except *totally* empty lines in interactive 373 mode, which signal the end of a command group. */ 374 if (col == 0 && c == '\n' && tok->prompt != NULL) 375 blankline = 0; /* Let it through */ 376 else 377 blankline = 1; /* Ignore completely */ 378 /* We can't jump back right here since we still 379 may need to skip to the end of a comment */ 380 } 381 if (!blankline) { 382 if (col == tok->indstack[tok->indent]) { 383 /* No change */ 384 } 385 else if (col > tok->indstack[tok->indent]) { 386 /* Indent -- always one */ 387 if (tok->indent+1 >= MAXINDENT) { 388 fprintf(stderr, "excessive indent\n"); 389 tok->done = E_TOKEN; 390 return ERRORTOKEN; 391 } 392 tok->pendin++; 393 tok->indstack[++tok->indent] = col; 394 } 395 else /* col < tok->indstack[tok->indent] */ { 396 /* Dedent -- any number, must be consistent */ 397 while (tok->indent > 0 && 398 col < tok->indstack[tok->indent]) { 399 tok->indent--; 400 tok->pendin--; 401 } 402 if (col != tok->indstack[tok->indent]) { 403 fprintf(stderr, "inconsistent dedent\n"); 404 tok->done = E_TOKEN; 405 return ERRORTOKEN; 406 } 407 } 408 } 409 } 410 411 *p_start = *p_end = tok->cur; 412 413 /* Return pending indents/dedents */ 414 if (tok->pendin != 0) { 415 if (tok->pendin < 0) { 416 tok->pendin++; 417 return DEDENT; 418 } 419 else { 420 tok->pendin--; 421 return INDENT; 422 } 423 } 424 425 again: 426 /* Skip spaces */ 427 do { 428 c = tok_nextc(tok); 429 } while (c == ' ' || c == '\t'); 430 431 /* Set start of current token */ 432 *p_start = tok->cur - 1; 433 434 /* Skip comment */ 435 if (c == '#') { 436 /* Hack to allow overriding the tabsize in the file. 437 This is also recognized by vi, when it occurs near the 438 beginning or end of the file. (Will vi never die...?) 439 For Python it must be at the beginning of the file! */ 440 int x; 441 /* XXX The case to (unsigned char *) is needed by THINK C 3.0 */ 442 if (sscanf(/*(unsigned char *)*/tok->cur, 443 " vi:set tabsize=%d:", &x) == 1 && 444 x >= 1 && x <= 40) { 445 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */ 446 tok->tabsize = x; 447 } 448 do { 449 c = tok_nextc(tok); 450 } while (c != EOF && c != '\n'); 451 } 452 453 /* Check for EOF and errors now */ 454 if (c == EOF) 455 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 456 457 /* Identifier (most frequent token!) */ 458 if (isalpha(c) || c == '_') { 459 do { 460 c = tok_nextc(tok); 461 } while (isalnum(c) || c == '_'); 462 tok_backup(tok, c); 463 *p_end = tok->cur; 464 return NAME; 465 } 466 467 /* Newline */ 468 if (c == '\n') { 469 tok->atbol = 1; 470 if (blankline) 471 goto nextline; 472 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 473 return NEWLINE; 474 } 475 476 /* Number */ 477 if (isdigit(c)) { 478 if (c == '0') { 479 /* Hex or octal */ 480 c = tok_nextc(tok); 481 if (c == '.') 482 goto fraction; 483 if (c == 'x' || c == 'X') { 484 /* Hex */ 485 do { 486 c = tok_nextc(tok); 487 } while (isxdigit(c)); 488 } 489 else { 490 /* Octal; c is first char of it */ 491 /* There's no 'isoctdigit' macro, sigh */ 492 while ('0' <= c && c < '8') { 493 c = tok_nextc(tok); 494 } 495 } 496 if (c == 'l' || c == 'L') 497 c = tok_nextc(tok); 498 } 499 else { 500 /* Decimal */ 501 do { 502 c = tok_nextc(tok); 503 } while (isdigit(c)); 504 if (c == 'l' || c == 'L') 505 c = tok_nextc(tok); 506 else { 507 /* Accept floating point numbers. 508 XXX This accepts incomplete things like 509 XXX 12e or 1e+; worry run-time. 510 XXX Doesn't accept numbers 511 XXX starting with a dot */ 512 if (c == '.') { 513 fraction: 514 /* Fraction */ 515 do { 516 c = tok_nextc(tok); 517 } while (isdigit(c)); 518 } 519 if (c == 'e' || c == 'E') { 520 /* Exponent part */ 521 c = tok_nextc(tok); 522 if (c == '+' || c == '-') 523 c = tok_nextc(tok); 524 while (isdigit(c)) { 525 c = tok_nextc(tok); 526 } 527 } 528 } 529 } 530 tok_backup(tok, c); 531 *p_end = tok->cur; 532 return NUMBER; 533 } 534 535 /* String */ 536 if (c == '\'') { 537 for (;;) { 538 c = tok_nextc(tok); 539 if (c == '\n' || c == EOF) { 540 tok->done = E_TOKEN; 541 return ERRORTOKEN; 542 } 543 if (c == '\\') { 544 c = tok_nextc(tok); 545 *p_end = tok->cur; 546 if (c == '\n' || c == EOF) { 547 tok->done = E_TOKEN; 548 return ERRORTOKEN; 549 } 550 continue; 551 } 552 if (c == '\'') 553 break; 554 } 555 *p_end = tok->cur; 556 return STRING; 557 } 558 559 /* Line continuation */ 560 if (c == '\\') { 561 c = tok_nextc(tok); 562 if (c != '\n') { 563 tok->done = E_TOKEN; 564 return ERRORTOKEN; 565 } 566 tok->lineno++; 567 goto again; /* Read next line */ 568 } 569 570 /* Check for two-character token */ 571 { 572 int c2 = tok_nextc(tok); 573 int token = tok_2char(c, c2); 574 if (token != OP) { 575 *p_end = tok->cur; 576 return token; 577 } 578 tok_backup(tok, c2); 579 } 580 581 /* Punctuation character */ 582 *p_end = tok->cur; 583 return tok_1char(c); 584} 585 586 587#ifdef DEBUG 588 589void 590tok_dump(type, start, end) 591 int type; 592 char *start, *end; 593{ 594 printf("%s", tok_name[type]); 595 if (type == NAME || type == NUMBER || type == STRING || type == OP) 596 printf("(%.*s)", (int)(end - start), start); 597} 598 599#endif 600