/* tokenizer.c revision 78c0535a224697e1c7a1a4e68462d3d204e38942 */
1/*********************************************************** 2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, 3The Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI not be used in advertising or publicity pertaining to 13distribution of the software without specific, written prior permission. 14 15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO 16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE 18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
22 23******************************************************************/ 24 25/* Tokenizer implementation */ 26 27#include "pgenheaders.h" 28 29#include <ctype.h> 30 31#include "tokenizer.h" 32#include "errcode.h" 33 34extern char *my_readline PROTO((char *)); 35/* Return malloc'ed string including trailing \n; 36 empty malloc'ed string for EOF; 37 NULL if interrupted */ 38 39/* Don't ever change this -- it would break the portability of Python code */ 40#define TABSIZE 8 41 42/* Forward */ 43static struct tok_state *tok_new PROTO((void)); 44static int tok_nextc PROTO((struct tok_state *tok)); 45static void tok_backup PROTO((struct tok_state *tok, int c)); 46 47/* Token names */ 48 49char *tok_name[] = { 50 "ENDMARKER", 51 "NAME", 52 "NUMBER", 53 "STRING", 54 "NEWLINE", 55 "INDENT", 56 "DEDENT", 57 "LPAR", 58 "RPAR", 59 "LSQB", 60 "RSQB", 61 "COLON", 62 "COMMA", 63 "SEMI", 64 "PLUS", 65 "MINUS", 66 "STAR", 67 "SLASH", 68 "VBAR", 69 "AMPER", 70 "LESS", 71 "GREATER", 72 "EQUAL", 73 "DOT", 74 "PERCENT", 75 "BACKQUOTE", 76 "LBRACE", 77 "RBRACE", 78 "EQEQUAL", 79 "NOTEQUAL", 80 "LESSEQUAL", 81 "GREATEREQUAL", 82 "TILDE", 83 "CIRCUMFLEX", 84 "LEFTSHIFT", 85 "RIGHTSHIFT", 86 /* This table must match the #defines in token.h! 
*/ 87 "OP", 88 "<ERRORTOKEN>", 89 "<N_TOKENS>" 90}; 91 92 93/* Create and initialize a new tok_state structure */ 94 95static struct tok_state * 96tok_new() 97{ 98 struct tok_state *tok = NEW(struct tok_state, 1); 99 if (tok == NULL) 100 return NULL; 101 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 102 tok->done = E_OK; 103 tok->fp = NULL; 104 tok->tabsize = TABSIZE; 105 tok->indent = 0; 106 tok->indstack[0] = 0; 107 tok->atbol = 1; 108 tok->pendin = 0; 109 tok->prompt = tok->nextprompt = NULL; 110 tok->lineno = 0; 111 tok->level = 0; 112 return tok; 113} 114 115 116/* Set up tokenizer for string */ 117 118struct tok_state * 119tok_setups(str) 120 char *str; 121{ 122 struct tok_state *tok = tok_new(); 123 if (tok == NULL) 124 return NULL; 125 tok->buf = tok->cur = tok->end = tok->inp = str; 126 return tok; 127} 128 129 130/* Set up tokenizer for file */ 131 132struct tok_state * 133tok_setupf(fp, ps1, ps2) 134 FILE *fp; 135 char *ps1, *ps2; 136{ 137 struct tok_state *tok = tok_new(); 138 if (tok == NULL) 139 return NULL; 140 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) { 141 DEL(tok); 142 return NULL; 143 } 144 tok->cur = tok->inp = tok->buf; 145 tok->end = tok->buf + BUFSIZ; 146 tok->fp = fp; 147 tok->prompt = ps1; 148 tok->nextprompt = ps2; 149 return tok; 150} 151 152 153/* Free a tok_state structure */ 154 155void 156tok_free(tok) 157 struct tok_state *tok; 158{ 159 if (tok->fp != NULL && tok->buf != NULL) 160 DEL(tok->buf); 161 DEL(tok); 162} 163 164 165/* Get next char, updating state; error code goes into tok->done */ 166 167static int 168tok_nextc(tok) 169 register struct tok_state *tok; 170{ 171 for (;;) { 172 if (tok->cur != tok->inp) { 173 return *tok->cur++; /* Fast path */ 174 } 175 if (tok->done != E_OK) 176 return EOF; 177 if (tok->fp == NULL) { 178 char *end = strchr(tok->inp, '\n'); 179 if (end != NULL) 180 end++; 181 else { 182 end = strchr(tok->inp, '\0'); 183 if (end == tok->inp) { 184 tok->done = E_EOF; 185 return EOF; 186 } 
187 } 188 if (tok->start == NULL) 189 tok->buf = tok->cur; 190 tok->lineno++; 191 tok->inp = end; 192 return *tok->cur++; 193 } 194 if (tok->prompt != NULL) { 195 char *new = my_readline(tok->prompt); 196 if (tok->nextprompt != NULL) 197 tok->prompt = tok->nextprompt; 198 if (new == NULL) 199 tok->done = E_INTR; 200 else if (*new == '\0') { 201 free(new); 202 tok->done = E_EOF; 203 } 204 else if (tok->start != NULL) { 205 int start = tok->start - tok->buf; 206 int oldlen = tok->cur - tok->buf; 207 int newlen = oldlen + strlen(new); 208 char *buf = realloc(tok->buf, newlen+1); 209 tok->lineno++; 210 if (buf == NULL) { 211 free(tok->buf); 212 tok->buf = NULL; 213 free(new); 214 tok->done = E_NOMEM; 215 return EOF; 216 } 217 tok->buf = buf; 218 tok->cur = tok->buf + oldlen; 219 strcpy(tok->buf + oldlen, new); 220 free(new); 221 tok->inp = tok->buf + newlen; 222 tok->end = tok->inp + 1; 223 tok->start = tok->buf + start; 224 } 225 else { 226 tok->lineno++; 227 if (tok->buf != NULL) 228 free(tok->buf); 229 tok->buf = new; 230 tok->cur = tok->buf; 231 tok->inp = strchr(tok->buf, '\0'); 232 tok->end = tok->inp + 1; 233 } 234 } 235 else { 236 int done = 0; 237 int cur = 0; 238 if (tok->start == NULL) { 239 if (tok->buf == NULL) { 240 tok->buf = NEW(char, BUFSIZ); 241 if (tok->buf == NULL) { 242 tok->done = E_NOMEM; 243 return EOF; 244 } 245 tok->end = tok->buf + BUFSIZ; 246 } 247 if (fgets(tok->buf, (int)(tok->end - tok->buf), 248 tok->fp) == NULL) { 249 tok->done = E_EOF; 250 done = 1; 251 } 252 else { 253 tok->done = E_OK; 254 tok->inp = strchr(tok->buf, '\0'); 255 done = tok->inp[-1] == '\n'; 256 } 257 } 258 else { 259 cur = tok->cur - tok->buf; 260 if (feof(tok->fp)) { 261 tok->done = E_EOF; 262 done = 1; 263 } 264 else 265 tok->done = E_OK; 266 } 267 tok->lineno++; 268 /* Read until '\n' or EOF */ 269 while (!done) { 270 int curstart = tok->start == NULL ? 
-1 : 271 tok->start - tok->buf; 272 int curvalid = tok->inp - tok->buf; 273 int cursize = tok->end - tok->buf; 274 int newsize = cursize + BUFSIZ; 275 char *newbuf = tok->buf; 276 RESIZE(newbuf, char, newsize); 277 if (newbuf == NULL) { 278 tok->done = E_NOMEM; 279 tok->cur = tok->inp; 280 return EOF; 281 } 282 tok->buf = newbuf; 283 tok->inp = tok->buf + curvalid; 284 tok->end = tok->buf + newsize; 285 tok->start = curstart < 0 ? NULL : 286 tok->buf + curstart; 287 if (fgets(tok->inp, 288 (int)(tok->end - tok->inp), 289 tok->fp) == NULL) { 290 /* Last line does not end in \n, 291 fake one */ 292 strcpy(tok->inp, "\n"); 293 } 294 tok->inp = strchr(tok->inp, '\0'); 295 done = tok->inp[-1] == '\n'; 296 } 297 tok->cur = tok->buf + cur; 298 } 299 if (tok->done != E_OK) { 300 if (tok->prompt != NULL) 301 fprintf(stderr, "\n"); 302 tok->cur = tok->inp; 303 return EOF; 304 } 305 } 306 /*NOTREACHED*/ 307} 308 309 310/* Back-up one character */ 311 312static void 313tok_backup(tok, c) 314 register struct tok_state *tok; 315 register int c; 316{ 317 if (c != EOF) { 318 if (--tok->cur < tok->buf) 319 fatal("tok_backup: begin of buffer"); 320 if (*tok->cur != c) 321 *tok->cur = c; 322 } 323} 324 325 326/* Return the token corresponding to a single character */ 327 328int 329tok_1char(c) 330 int c; 331{ 332 switch (c) { 333 case '(': return LPAR; 334 case ')': return RPAR; 335 case '[': return LSQB; 336 case ']': return RSQB; 337 case ':': return COLON; 338 case ',': return COMMA; 339 case ';': return SEMI; 340 case '+': return PLUS; 341 case '-': return MINUS; 342 case '*': return STAR; 343 case '/': return SLASH; 344 case '|': return VBAR; 345 case '&': return AMPER; 346 case '<': return LESS; 347 case '>': return GREATER; 348 case '=': return EQUAL; 349 case '.': return DOT; 350 case '%': return PERCENT; 351 case '`': return BACKQUOTE; 352 case '{': return LBRACE; 353 case '}': return RBRACE; 354 case '^': return CIRCUMFLEX; 355 case '~': return TILDE; 356 default: return 
OP; 357 } 358} 359 360 361int 362tok_2char(c1, c2) 363 int c1, c2; 364{ 365 switch (c1) { 366 case '=': 367 switch (c2) { 368 case '=': return EQEQUAL; 369 } 370 break; 371 case '!': 372 switch (c2) { 373 case '=': return NOTEQUAL; 374 } 375 break; 376 case '<': 377 switch (c2) { 378 case '>': return NOTEQUAL; 379 case '=': return LESSEQUAL; 380 case '<': return LEFTSHIFT; 381 } 382 break; 383 case '>': 384 switch (c2) { 385 case '=': return GREATEREQUAL; 386 case '>': return RIGHTSHIFT; 387 } 388 break; 389 } 390 return OP; 391} 392 393 394/* Get next token, after space stripping etc. */ 395 396int 397tok_get(tok, p_start, p_end) 398 register struct tok_state *tok; /* In/out: tokenizer state */ 399 char **p_start, **p_end; /* Out: point to start/end of token */ 400{ 401 register int c; 402 int blankline; 403 404 *p_start = *p_end = NULL; 405 nextline: 406 tok->start = NULL; 407 blankline = 0; 408 409 /* Get indentation level */ 410 if (tok->atbol) { 411 register int col = 0; 412 tok->atbol = 0; 413 for (;;) { 414 c = tok_nextc(tok); 415 if (c == ' ') 416 col++; 417 else if (c == '\t') 418 col = (col/tok->tabsize + 1) * tok->tabsize; 419 else 420 break; 421 } 422 tok_backup(tok, c); 423 if (c == '#' || c == '\n') { 424 /* Lines with only whitespace and/or comments 425 shouldn't affect the indentation and are 426 not passed to the parser as NEWLINE tokens, 427 except *totally* empty lines in interactive 428 mode, which signal the end of a command group. 
*/ 429 if (col == 0 && c == '\n' && tok->prompt != NULL) 430 blankline = 0; /* Let it through */ 431 else 432 blankline = 1; /* Ignore completely */ 433 /* We can't jump back right here since we still 434 may need to skip to the end of a comment */ 435 } 436 if (!blankline && tok->level == 0) { 437 if (col == tok->indstack[tok->indent]) { 438 /* No change */ 439 } 440 else if (col > tok->indstack[tok->indent]) { 441 /* Indent -- always one */ 442 if (tok->indent+1 >= MAXINDENT) { 443 fprintf(stderr, "excessive indent\n"); 444 tok->done = E_TOKEN; 445 tok->cur = tok->inp; 446 return ERRORTOKEN; 447 } 448 tok->pendin++; 449 tok->indstack[++tok->indent] = col; 450 } 451 else /* col < tok->indstack[tok->indent] */ { 452 /* Dedent -- any number, must be consistent */ 453 while (tok->indent > 0 && 454 col < tok->indstack[tok->indent]) { 455 tok->indent--; 456 tok->pendin--; 457 } 458 if (col != tok->indstack[tok->indent]) { 459 fprintf(stderr, "inconsistent dedent\n"); 460 tok->done = E_TOKEN; 461 tok->cur = tok->inp; 462 return ERRORTOKEN; 463 } 464 } 465 } 466 } 467 468 tok->start = tok->cur; 469 470 /* Return pending indents/dedents */ 471 if (tok->pendin != 0) { 472 if (tok->pendin < 0) { 473 tok->pendin++; 474 return DEDENT; 475 } 476 else { 477 tok->pendin--; 478 return INDENT; 479 } 480 } 481 482 again: 483 tok->start = NULL; 484 /* Skip spaces */ 485 do { 486 c = tok_nextc(tok); 487 } while (c == ' ' || c == '\t'); 488 489 /* Set start of current token */ 490 tok->start = tok->cur - 1; 491 492 /* Skip comment */ 493 if (c == '#') { 494 /* Hack to allow overriding the tabsize in the file. 495 This is also recognized by vi, when it occurs near the 496 beginning or end of the file. (Will vi never die...?) 497 For Python it must be at the beginning of the file! 
*/ 498 /* XXX The real vi syntax is actually different :-( */ 499 /* XXX Should recognize Emacs syntax, too */ 500 int x; 501 if (sscanf(tok->cur, 502 " vi:set tabsize=%d:", &x) == 1 && 503 x >= 1 && x <= 40) { 504 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */ 505 tok->tabsize = x; 506 } 507 do { 508 c = tok_nextc(tok); 509 } while (c != EOF && c != '\n'); 510 } 511 512 /* Check for EOF and errors now */ 513 if (c == EOF) { 514 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 515 } 516 517 /* Identifier (most frequent token!) */ 518 if (isalpha(c) || c == '_') { 519 do { 520 c = tok_nextc(tok); 521 } while (isalnum(c) || c == '_'); 522 tok_backup(tok, c); 523 *p_start = tok->start; 524 *p_end = tok->cur; 525 return NAME; 526 } 527 528 /* Newline */ 529 if (c == '\n') { 530 tok->atbol = 1; 531 if (blankline || tok->level > 0) 532 goto nextline; 533 *p_start = tok->start; 534 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 535 return NEWLINE; 536 } 537 538 /* Period or number starting with period? */ 539 if (c == '.') { 540 c = tok_nextc(tok); 541 if (isdigit(c)) { 542 goto fraction; 543 } 544 else { 545 tok_backup(tok, c); 546 *p_start = tok->start; 547 *p_end = tok->cur; 548 return DOT; 549 } 550 } 551 552 /* Number */ 553 if (isdigit(c)) { 554 if (c == '0') { 555 /* Hex or octal */ 556 c = tok_nextc(tok); 557 if (c == '.') 558 goto fraction; 559 if (c == 'x' || c == 'X') { 560 /* Hex */ 561 do { 562 c = tok_nextc(tok); 563 } while (isxdigit(c)); 564 } 565 else { 566 /* XXX This is broken! E.g., 567 09.9 should be accepted as float! */ 568 /* Octal; c is first char of it */ 569 /* There's no 'isoctdigit' macro, sigh */ 570 while ('0' <= c && c < '8') { 571 c = tok_nextc(tok); 572 } 573 } 574 if (c == 'l' || c == 'L') 575 c = tok_nextc(tok); 576 } 577 else { 578 /* Decimal */ 579 do { 580 c = tok_nextc(tok); 581 } while (isdigit(c)); 582 if (c == 'l' || c == 'L') 583 c = tok_nextc(tok); 584 else { 585 /* Accept floating point numbers. 
586 XXX This accepts incomplete things like 587 XXX 12e or 1e+; worry run-time */ 588 if (c == '.') { 589 fraction: 590 /* Fraction */ 591 do { 592 c = tok_nextc(tok); 593 } while (isdigit(c)); 594 } 595 if (c == 'e' || c == 'E') { 596 /* Exponent part */ 597 c = tok_nextc(tok); 598 if (c == '+' || c == '-') 599 c = tok_nextc(tok); 600 while (isdigit(c)) { 601 c = tok_nextc(tok); 602 } 603 } 604 } 605 } 606 tok_backup(tok, c); 607 *p_start = tok->start; 608 *p_end = tok->cur; 609 return NUMBER; 610 } 611 612 /* String */ 613 if (c == '\'' || c == '"') { 614 int quote = c; 615 int triple = 0; 616 int tripcount = 0; 617 for (;;) { 618 c = tok_nextc(tok); 619 if (c == '\n') { 620 if (!triple) { 621 tok->done = E_TOKEN; 622 tok_backup(tok, c); 623 return ERRORTOKEN; 624 } 625 tripcount = 0; 626 } 627 else if (c == EOF) { 628 tok->done = E_TOKEN; 629 tok->cur = tok->inp; 630 return ERRORTOKEN; 631 } 632 else if (c == quote) { 633 tripcount++; 634 if (tok->cur == tok->start+2) { 635 c = tok_nextc(tok); 636 if (c == quote) { 637 triple = 1; 638 tripcount = 0; 639 continue; 640 } 641 tok_backup(tok, c); 642 } 643 if (!triple || tripcount == 3) 644 break; 645 } 646 else if (c == '\\') { 647 tripcount = 0; 648 c = tok_nextc(tok); 649 if (c == EOF) { 650 tok->done = E_TOKEN; 651 tok->cur = tok->inp; 652 return ERRORTOKEN; 653 } 654 } 655 else 656 tripcount = 0; 657 } 658 *p_start = tok->start; 659 *p_end = tok->cur; 660 return STRING; 661 } 662 663 /* Line continuation */ 664 if (c == '\\') { 665 c = tok_nextc(tok); 666 if (c != '\n') { 667 tok->done = E_TOKEN; 668 tok->cur = tok->inp; 669 return ERRORTOKEN; 670 } 671 goto again; /* Read next line */ 672 } 673 674 /* Check for two-character token */ 675 { 676 int c2 = tok_nextc(tok); 677 int token = tok_2char(c, c2); 678 if (token != OP) { 679 *p_start = tok->start; 680 *p_end = tok->cur; 681 return token; 682 } 683 tok_backup(tok, c2); 684 } 685 686 /* Keep track of parentheses nesting level */ 687 switch (c) { 688 case '(': 
689 case '[': 690 case '{': 691 tok->level++; 692 break; 693 case ')': 694 case ']': 695 case '}': 696 tok->level--; 697 break; 698 } 699 700 /* Punctuation character */ 701 *p_start = tok->start; 702 *p_end = tok->cur; 703 return tok_1char(c); 704} 705 706 707#ifdef DEBUG 708 709void 710tok_dump(type, start, end) 711 int type; 712 char *start, *end; 713{ 714 printf("%s", tok_name[type]); 715 if (type == NAME || type == NUMBER || type == STRING || type == OP) 716 printf("(%.*s)", (int)(end - start), start); 717} 718 719#endif 720