/* tokenizer.c revision fd8a393086fbf43597965d5e55bec158a094a466 */
1/*********************************************************** 2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, 3The Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI or Corporation for National Research Initiatives or 13CNRI not be used in advertising or publicity pertaining to 14distribution of the software without specific, written prior 15permission. 16 17While CWI is the initial source for this software, a modified version 18is made available by the Corporation for National Research Initiatives 19(CNRI) at the Internet address ftp://ftp.python.org. 20 21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH 22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH 24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL 25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR 26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 28PERFORMANCE OF THIS SOFTWARE. 
29 30******************************************************************/ 31 32/* Tokenizer implementation */ 33 34#include "pgenheaders.h" 35 36#include <ctype.h> 37 38#include "tokenizer.h" 39#include "errcode.h" 40 41extern char *my_readline PROTO((char *)); 42/* Return malloc'ed string including trailing \n; 43 empty malloc'ed string for EOF; 44 NULL if interrupted */ 45 46/* Don't ever change this -- it would break the portability of Python code */ 47#define TABSIZE 8 48 49/* Forward */ 50static struct tok_state *tok_new PROTO((void)); 51static int tok_nextc PROTO((struct tok_state *tok)); 52static void tok_backup PROTO((struct tok_state *tok, int c)); 53 54/* Token names */ 55 56char *tok_name[] = { 57 "ENDMARKER", 58 "NAME", 59 "NUMBER", 60 "STRING", 61 "NEWLINE", 62 "INDENT", 63 "DEDENT", 64 "LPAR", 65 "RPAR", 66 "LSQB", 67 "RSQB", 68 "COLON", 69 "COMMA", 70 "SEMI", 71 "PLUS", 72 "MINUS", 73 "STAR", 74 "SLASH", 75 "VBAR", 76 "AMPER", 77 "LESS", 78 "GREATER", 79 "EQUAL", 80 "DOT", 81 "PERCENT", 82 "BACKQUOTE", 83 "LBRACE", 84 "RBRACE", 85 "EQEQUAL", 86 "NOTEQUAL", 87 "LESSEQUAL", 88 "GREATEREQUAL", 89 "TILDE", 90 "CIRCUMFLEX", 91 "LEFTSHIFT", 92 "RIGHTSHIFT", 93 "DOUBLESTAR", 94 /* This table must match the #defines in token.h! 
*/ 95 "OP", 96 "<ERRORTOKEN>", 97 "<N_TOKENS>" 98}; 99 100 101/* Create and initialize a new tok_state structure */ 102 103static struct tok_state * 104tok_new() 105{ 106 struct tok_state *tok = NEW(struct tok_state, 1); 107 if (tok == NULL) 108 return NULL; 109 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 110 tok->done = E_OK; 111 tok->fp = NULL; 112 tok->tabsize = TABSIZE; 113 tok->indent = 0; 114 tok->indstack[0] = 0; 115 tok->atbol = 1; 116 tok->pendin = 0; 117 tok->prompt = tok->nextprompt = NULL; 118 tok->lineno = 0; 119 tok->level = 0; 120 return tok; 121} 122 123 124/* Set up tokenizer for string */ 125 126struct tok_state * 127tok_setups(str) 128 char *str; 129{ 130 struct tok_state *tok = tok_new(); 131 if (tok == NULL) 132 return NULL; 133 tok->buf = tok->cur = tok->end = tok->inp = str; 134 return tok; 135} 136 137 138/* Set up tokenizer for file */ 139 140struct tok_state * 141tok_setupf(fp, ps1, ps2) 142 FILE *fp; 143 char *ps1, *ps2; 144{ 145 struct tok_state *tok = tok_new(); 146 if (tok == NULL) 147 return NULL; 148 if ((tok->buf = NEW(char, BUFSIZ)) == NULL) { 149 DEL(tok); 150 return NULL; 151 } 152 tok->cur = tok->inp = tok->buf; 153 tok->end = tok->buf + BUFSIZ; 154 tok->fp = fp; 155 tok->prompt = ps1; 156 tok->nextprompt = ps2; 157 return tok; 158} 159 160 161/* Free a tok_state structure */ 162 163void 164tok_free(tok) 165 struct tok_state *tok; 166{ 167 if (tok->fp != NULL && tok->buf != NULL) 168 DEL(tok->buf); 169 DEL(tok); 170} 171 172 173/* Get next char, updating state; error code goes into tok->done */ 174 175static int 176tok_nextc(tok) 177 register struct tok_state *tok; 178{ 179 for (;;) { 180 if (tok->cur != tok->inp) { 181 return *tok->cur++; /* Fast path */ 182 } 183 if (tok->done != E_OK) 184 return EOF; 185 if (tok->fp == NULL) { 186 char *end = strchr(tok->inp, '\n'); 187 if (end != NULL) 188 end++; 189 else { 190 end = strchr(tok->inp, '\0'); 191 if (end == tok->inp) { 192 tok->done = E_EOF; 193 return EOF; 
194 } 195 } 196 if (tok->start == NULL) 197 tok->buf = tok->cur; 198 tok->lineno++; 199 tok->inp = end; 200 return *tok->cur++; 201 } 202 if (tok->prompt != NULL) { 203 char *new = my_readline(tok->prompt); 204 if (tok->nextprompt != NULL) 205 tok->prompt = tok->nextprompt; 206 if (new == NULL) 207 tok->done = E_INTR; 208 else if (*new == '\0') { 209 free(new); 210 tok->done = E_EOF; 211 } 212 else if (tok->start != NULL) { 213 int start = tok->start - tok->buf; 214 int oldlen = tok->cur - tok->buf; 215 int newlen = oldlen + strlen(new); 216 char *buf = realloc(tok->buf, newlen+1); 217 tok->lineno++; 218 if (buf == NULL) { 219 free(tok->buf); 220 tok->buf = NULL; 221 free(new); 222 tok->done = E_NOMEM; 223 return EOF; 224 } 225 tok->buf = buf; 226 tok->cur = tok->buf + oldlen; 227 strcpy(tok->buf + oldlen, new); 228 free(new); 229 tok->inp = tok->buf + newlen; 230 tok->end = tok->inp + 1; 231 tok->start = tok->buf + start; 232 } 233 else { 234 tok->lineno++; 235 if (tok->buf != NULL) 236 free(tok->buf); 237 tok->buf = new; 238 tok->cur = tok->buf; 239 tok->inp = strchr(tok->buf, '\0'); 240 tok->end = tok->inp + 1; 241 } 242 } 243 else { 244 int done = 0; 245 int cur = 0; 246 char *pt; 247 if (tok->start == NULL) { 248 if (tok->buf == NULL) { 249 tok->buf = NEW(char, BUFSIZ); 250 if (tok->buf == NULL) { 251 tok->done = E_NOMEM; 252 return EOF; 253 } 254 tok->end = tok->buf + BUFSIZ; 255 } 256 if (fgets(tok->buf, (int)(tok->end - tok->buf), 257 tok->fp) == NULL) { 258 tok->done = E_EOF; 259 done = 1; 260 } 261 else { 262 tok->done = E_OK; 263 tok->inp = strchr(tok->buf, '\0'); 264 done = tok->inp[-1] == '\n'; 265 } 266 } 267 else { 268 cur = tok->cur - tok->buf; 269 if (feof(tok->fp)) { 270 tok->done = E_EOF; 271 done = 1; 272 } 273 else 274 tok->done = E_OK; 275 } 276 tok->lineno++; 277 /* Read until '\n' or EOF */ 278 while (!done) { 279 int curstart = tok->start == NULL ? 
-1 : 280 tok->start - tok->buf; 281 int curvalid = tok->inp - tok->buf; 282 int newsize = curvalid + BUFSIZ; 283 char *newbuf = tok->buf; 284 RESIZE(newbuf, char, newsize); 285 if (newbuf == NULL) { 286 tok->done = E_NOMEM; 287 tok->cur = tok->inp; 288 return EOF; 289 } 290 tok->buf = newbuf; 291 tok->inp = tok->buf + curvalid; 292 tok->end = tok->buf + newsize; 293 tok->start = curstart < 0 ? NULL : 294 tok->buf + curstart; 295 if (fgets(tok->inp, 296 (int)(tok->end - tok->inp), 297 tok->fp) == NULL) { 298 /* Last line does not end in \n, 299 fake one */ 300 strcpy(tok->inp, "\n"); 301 } 302 tok->inp = strchr(tok->inp, '\0'); 303 done = tok->inp[-1] == '\n'; 304 } 305 tok->cur = tok->buf + cur; 306 /* replace "\r\n" with "\n" */ 307 pt = tok->inp - 2; 308 if (pt >= tok->buf && *pt == '\r') { 309 *pt++ = '\n'; 310 *pt = '\0'; 311 tok->inp = pt; 312 } 313 } 314 if (tok->done != E_OK) { 315 if (tok->prompt != NULL) 316 fprintf(stderr, "\n"); 317 tok->cur = tok->inp; 318 return EOF; 319 } 320 } 321 /*NOTREACHED*/ 322} 323 324 325/* Back-up one character */ 326 327static void 328tok_backup(tok, c) 329 register struct tok_state *tok; 330 register int c; 331{ 332 if (c != EOF) { 333 if (--tok->cur < tok->buf) 334 fatal("tok_backup: begin of buffer"); 335 if (*tok->cur != c) 336 *tok->cur = c; 337 } 338} 339 340 341/* Return the token corresponding to a single character */ 342 343int 344tok_1char(c) 345 int c; 346{ 347 switch (c) { 348 case '(': return LPAR; 349 case ')': return RPAR; 350 case '[': return LSQB; 351 case ']': return RSQB; 352 case ':': return COLON; 353 case ',': return COMMA; 354 case ';': return SEMI; 355 case '+': return PLUS; 356 case '-': return MINUS; 357 case '*': return STAR; 358 case '/': return SLASH; 359 case '|': return VBAR; 360 case '&': return AMPER; 361 case '<': return LESS; 362 case '>': return GREATER; 363 case '=': return EQUAL; 364 case '.': return DOT; 365 case '%': return PERCENT; 366 case '`': return BACKQUOTE; 367 case '{': return 
LBRACE; 368 case '}': return RBRACE; 369 case '^': return CIRCUMFLEX; 370 case '~': return TILDE; 371 default: return OP; 372 } 373} 374 375 376int 377tok_2char(c1, c2) 378 int c1, c2; 379{ 380 switch (c1) { 381 case '=': 382 switch (c2) { 383 case '=': return EQEQUAL; 384 } 385 break; 386 case '!': 387 switch (c2) { 388 case '=': return NOTEQUAL; 389 } 390 break; 391 case '<': 392 switch (c2) { 393 case '>': return NOTEQUAL; 394 case '=': return LESSEQUAL; 395 case '<': return LEFTSHIFT; 396 } 397 break; 398 case '>': 399 switch (c2) { 400 case '=': return GREATEREQUAL; 401 case '>': return RIGHTSHIFT; 402 } 403 break; 404 case '*': 405 switch (c2) { 406 case '*': return DOUBLESTAR; 407 } 408 break; 409 } 410 return OP; 411} 412 413 414/* Get next token, after space stripping etc. */ 415 416int 417tok_get(tok, p_start, p_end) 418 register struct tok_state *tok; /* In/out: tokenizer state */ 419 char **p_start, **p_end; /* Out: point to start/end of token */ 420{ 421 register int c; 422 int blankline; 423 424 *p_start = *p_end = NULL; 425 nextline: 426 tok->start = NULL; 427 blankline = 0; 428 429 /* Get indentation level */ 430 if (tok->atbol) { 431 register int col = 0; 432 tok->atbol = 0; 433 for (;;) { 434 c = tok_nextc(tok); 435 if (c == ' ') 436 col++; 437 else if (c == '\t') 438 col = (col/tok->tabsize + 1) * tok->tabsize; 439 else if (c == '\014') /* Control-L (formfeed) */ 440 col = 0; /* For Emacs users */ 441 else 442 break; 443 } 444 tok_backup(tok, c); 445 if (c == '#' || c == '\n') { 446 /* Lines with only whitespace and/or comments 447 shouldn't affect the indentation and are 448 not passed to the parser as NEWLINE tokens, 449 except *totally* empty lines in interactive 450 mode, which signal the end of a command group. 
*/ 451 if (col == 0 && c == '\n' && tok->prompt != NULL) 452 blankline = 0; /* Let it through */ 453 else 454 blankline = 1; /* Ignore completely */ 455 /* We can't jump back right here since we still 456 may need to skip to the end of a comment */ 457 } 458 if (!blankline && tok->level == 0) { 459 if (col == tok->indstack[tok->indent]) { 460 /* No change */ 461 } 462 else if (col > tok->indstack[tok->indent]) { 463 /* Indent -- always one */ 464 if (tok->indent+1 >= MAXINDENT) { 465 fprintf(stderr, "excessive indent\n"); 466 tok->done = E_TOKEN; 467 tok->cur = tok->inp; 468 return ERRORTOKEN; 469 } 470 tok->pendin++; 471 tok->indstack[++tok->indent] = col; 472 } 473 else /* col < tok->indstack[tok->indent] */ { 474 /* Dedent -- any number, must be consistent */ 475 while (tok->indent > 0 && 476 col < tok->indstack[tok->indent]) { 477 tok->indent--; 478 tok->pendin--; 479 } 480 if (col != tok->indstack[tok->indent]) { 481 fprintf(stderr, "inconsistent dedent\n"); 482 tok->done = E_TOKEN; 483 tok->cur = tok->inp; 484 return ERRORTOKEN; 485 } 486 } 487 } 488 } 489 490 tok->start = tok->cur; 491 492 /* Return pending indents/dedents */ 493 if (tok->pendin != 0) { 494 if (tok->pendin < 0) { 495 tok->pendin++; 496 return DEDENT; 497 } 498 else { 499 tok->pendin--; 500 return INDENT; 501 } 502 } 503 504 again: 505 tok->start = NULL; 506 /* Skip spaces */ 507 do { 508 c = tok_nextc(tok); 509 } while (c == ' ' || c == '\t' || c == '\014'); 510 511 /* Set start of current token */ 512 tok->start = tok->cur - 1; 513 514 /* Skip comment */ 515 if (c == '#') { 516 /* Hack to allow overriding the tabsize in the file. 517 This is also recognized by vi, when it occurs near the 518 beginning or end of the file. (Will vi never die...?) 519 For Python it must be at the beginning of the file! 
*/ 520 /* XXX The real vi syntax is actually different :-( */ 521 /* XXX Should recognize Emacs syntax, too */ 522 int x; 523 if (sscanf(tok->cur, 524 " vi:set tabsize=%d:", &x) == 1 && 525 x >= 1 && x <= 40) { 526 /* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */ 527 tok->tabsize = x; 528 } 529 do { 530 c = tok_nextc(tok); 531 } while (c != EOF && c != '\n'); 532 } 533 534 /* Check for EOF and errors now */ 535 if (c == EOF) { 536 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 537 } 538 539 /* Identifier (most frequent token!) */ 540 if (isalpha(c) || c == '_') { 541 do { 542 c = tok_nextc(tok); 543 } while (isalnum(c) || c == '_'); 544 tok_backup(tok, c); 545 *p_start = tok->start; 546 *p_end = tok->cur; 547 return NAME; 548 } 549 550 /* Newline */ 551 if (c == '\n') { 552 tok->atbol = 1; 553 if (blankline || tok->level > 0) 554 goto nextline; 555 *p_start = tok->start; 556 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 557 return NEWLINE; 558 } 559 560 /* Period or number starting with period? */ 561 if (c == '.') { 562 c = tok_nextc(tok); 563 if (isdigit(c)) { 564 goto fraction; 565 } 566 else { 567 tok_backup(tok, c); 568 *p_start = tok->start; 569 *p_end = tok->cur; 570 return DOT; 571 } 572 } 573 574 /* Number */ 575 if (isdigit(c)) { 576 if (c == '0') { 577 /* Hex or octal */ 578 c = tok_nextc(tok); 579 if (c == '.') 580 goto fraction; 581#ifndef WITHOUT_COMPLEX 582 if (c == 'j' || c == 'J') 583 goto imaginary; 584#endif 585 if (c == 'x' || c == 'X') { 586 /* Hex */ 587 do { 588 c = tok_nextc(tok); 589 } while (isxdigit(c)); 590 } 591 else { 592 /* XXX This is broken! E.g., 593 09.9 should be accepted as float! 
*/ 594 /* Octal; c is first char of it */ 595 /* There's no 'isoctdigit' macro, sigh */ 596 while ('0' <= c && c < '8') { 597 c = tok_nextc(tok); 598 } 599 } 600 if (c == 'l' || c == 'L') 601 c = tok_nextc(tok); 602 } 603 else { 604 /* Decimal */ 605 do { 606 c = tok_nextc(tok); 607 } while (isdigit(c)); 608 if (c == 'l' || c == 'L') 609 c = tok_nextc(tok); 610 else { 611 /* Accept floating point numbers. 612 XXX This accepts incomplete things like 613 XXX 12e or 1e+; worry run-time */ 614 if (c == '.') { 615 fraction: 616 /* Fraction */ 617 do { 618 c = tok_nextc(tok); 619 } while (isdigit(c)); 620 } 621 if (c == 'e' || c == 'E') { 622 /* Exponent part */ 623 c = tok_nextc(tok); 624 if (c == '+' || c == '-') 625 c = tok_nextc(tok); 626 while (isdigit(c)) { 627 c = tok_nextc(tok); 628 } 629 } 630#ifndef WITHOUT_COMPLEX 631 if (c == 'j' || c == 'J') 632 /* Imaginary part */ 633 imaginary: 634 c = tok_nextc(tok); 635#endif 636 } 637 } 638 tok_backup(tok, c); 639 *p_start = tok->start; 640 *p_end = tok->cur; 641 return NUMBER; 642 } 643 644 /* String */ 645 if (c == '\'' || c == '"') { 646 int quote = c; 647 int triple = 0; 648 int tripcount = 0; 649 for (;;) { 650 c = tok_nextc(tok); 651 if (c == '\n') { 652 if (!triple) { 653 tok->done = E_TOKEN; 654 tok_backup(tok, c); 655 return ERRORTOKEN; 656 } 657 tripcount = 0; 658 } 659 else if (c == EOF) { 660 tok->done = E_TOKEN; 661 tok->cur = tok->inp; 662 return ERRORTOKEN; 663 } 664 else if (c == quote) { 665 tripcount++; 666 if (tok->cur == tok->start+2) { 667 c = tok_nextc(tok); 668 if (c == quote) { 669 triple = 1; 670 tripcount = 0; 671 continue; 672 } 673 tok_backup(tok, c); 674 } 675 if (!triple || tripcount == 3) 676 break; 677 } 678 else if (c == '\\') { 679 tripcount = 0; 680 c = tok_nextc(tok); 681 if (c == EOF) { 682 tok->done = E_TOKEN; 683 tok->cur = tok->inp; 684 return ERRORTOKEN; 685 } 686 } 687 else 688 tripcount = 0; 689 } 690 *p_start = tok->start; 691 *p_end = tok->cur; 692 return STRING; 693 } 694 
695 /* Line continuation */ 696 if (c == '\\') { 697 c = tok_nextc(tok); 698 if (c != '\n') { 699 tok->done = E_TOKEN; 700 tok->cur = tok->inp; 701 return ERRORTOKEN; 702 } 703 goto again; /* Read next line */ 704 } 705 706 /* Check for two-character token */ 707 { 708 int c2 = tok_nextc(tok); 709 int token = tok_2char(c, c2); 710 if (token != OP) { 711 *p_start = tok->start; 712 *p_end = tok->cur; 713 return token; 714 } 715 tok_backup(tok, c2); 716 } 717 718 /* Keep track of parentheses nesting level */ 719 switch (c) { 720 case '(': 721 case '[': 722 case '{': 723 tok->level++; 724 break; 725 case ')': 726 case ']': 727 case '}': 728 tok->level--; 729 break; 730 } 731 732 /* Punctuation character */ 733 *p_start = tok->start; 734 *p_end = tok->cur; 735 return tok_1char(c); 736} 737 738 739#ifdef DEBUG 740 741void 742tok_dump(type, start, end) 743 int type; 744 char *start, *end; 745{ 746 printf("%s", tok_name[type]); 747 if (type == NAME || type == NUMBER || type == STRING || type == OP) 748 printf("(%.*s)", (int)(end - start), start); 749} 750 751#endif 752