/* tokenizer.c revision 118ec70ea27000db428ba3e3a757f4b423670db6 */
1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "stringobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#endif /* PGEN */ 20 21extern char *PyOS_Readline(char *); 22/* Return malloc'ed string including trailing \n; 23 empty malloc'ed string for EOF; 24 NULL if interrupted */ 25 26/* Don't ever change this -- it would break the portability of Python code */ 27#define TABSIZE 8 28 29/* Convert a possibly signed character to a nonnegative int */ 30/* XXX This assumes characters are 8 bits wide */ 31#ifdef __CHAR_UNSIGNED__ 32#define Py_CHARMASK(c) (c) 33#else 34#define Py_CHARMASK(c) ((c) & 0xff) 35#endif 36 37/* Forward */ 38static struct tok_state *tok_new(void); 39static int tok_nextc(struct tok_state *tok); 40static void tok_backup(struct tok_state *tok, int c); 41 42/* Token names */ 43 44char *_PyParser_TokenNames[] = { 45 "ENDMARKER", 46 "NAME", 47 "NUMBER", 48 "STRING", 49 "NEWLINE", 50 "INDENT", 51 "DEDENT", 52 "LPAR", 53 "RPAR", 54 "LSQB", 55 "RSQB", 56 "COLON", 57 "COMMA", 58 "SEMI", 59 "PLUS", 60 "MINUS", 61 "STAR", 62 "SLASH", 63 "VBAR", 64 "AMPER", 65 "LESS", 66 "GREATER", 67 "EQUAL", 68 "DOT", 69 "PERCENT", 70 "BACKQUOTE", 71 "LBRACE", 72 "RBRACE", 73 "EQEQUAL", 74 "NOTEQUAL", 75 "LESSEQUAL", 76 "GREATEREQUAL", 77 "TILDE", 78 "CIRCUMFLEX", 79 "LEFTSHIFT", 80 "RIGHTSHIFT", 81 "DOUBLESTAR", 82 "PLUSEQUAL", 83 "MINEQUAL", 84 "STAREQUAL", 85 "SLASHEQUAL", 86 "PERCENTEQUAL", 87 "AMPEREQUAL", 88 "VBAREQUAL", 89 "CIRCUMFLEXEQUAL", 90 "LEFTSHIFTEQUAL", 91 "RIGHTSHIFTEQUAL", 92 "DOUBLESTAREQUAL", 93 "DOUBLESLASH", 94 "DOUBLESLASHEQUAL", 95 /* This table must match the #defines in token.h! 
*/ 96 "OP", 97 "<ERRORTOKEN>", 98 "<N_TOKENS>" 99}; 100 101 102/* Create and initialize a new tok_state structure */ 103 104static struct tok_state * 105tok_new(void) 106{ 107 struct tok_state *tok = PyMem_NEW(struct tok_state, 1); 108 if (tok == NULL) 109 return NULL; 110 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 111 tok->done = E_OK; 112 tok->fp = NULL; 113 tok->tabsize = TABSIZE; 114 tok->indent = 0; 115 tok->indstack[0] = 0; 116 tok->atbol = 1; 117 tok->pendin = 0; 118 tok->prompt = tok->nextprompt = NULL; 119 tok->lineno = 0; 120 tok->level = 0; 121 tok->filename = NULL; 122 tok->altwarning = 0; 123 tok->alterror = 0; 124 tok->alttabsize = 1; 125 tok->altindstack[0] = 0; 126 tok->decoding_state = 0; 127 tok->decoding_erred = 0; 128 tok->read_coding_spec = 0; 129 tok->issued_encoding_warning = 0; 130 tok->encoding = NULL; 131#ifndef PGEN 132 tok->decoding_readline = NULL; 133 tok->decoding_buffer = NULL; 134#endif 135 return tok; 136} 137 138#ifdef PGEN 139 140static char * 141decoding_fgets(char *s, int size, struct tok_state *tok) 142{ 143 return fgets(s, size, tok->fp); 144} 145 146static int 147decoding_feof(struct tok_state *tok) 148{ 149 return feof(tok->fp); 150} 151 152static const char * 153decode_str(const char *str, struct tok_state *tok) 154{ 155 return str; 156} 157 158#else /* PGEN */ 159 160static char * 161error_ret(struct tok_state *tok) /* XXX */ 162{ 163 tok->decoding_erred = 1; 164 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 165 PyMem_DEL(tok->buf); 166 tok->buf = NULL; 167 return NULL; /* as if it were EOF */ 168} 169 170static char * 171new_string(const char *s, int len) 172{ 173 char* result = PyMem_NEW(char, len + 1); 174 if (result != NULL) { 175 memcpy(result, s, len); 176 result[len] = '\0'; 177 } 178 return result; 179} 180 181static char * 182get_normal_name(char *s) /* for utf-8 and latin-1 */ 183{ 184 char buf[13]; 185 int i; 186 for (i = 0; i < 12; i++) { 187 int c = s[i]; 188 if (c == 
'\0') break; 189 else if (c == '_') buf[i] = '-'; 190 else buf[i] = tolower(c); 191 } 192 buf[i] = '\0'; 193 if (strcmp(buf, "utf-8") == 0 || 194 strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; 195 else if (strcmp(buf, "latin-1") == 0 || 196 strcmp(buf, "iso-8859-1") == 0 || 197 strcmp(buf, "iso-latin-1") == 0 || 198 strncmp(buf, "latin-1-", 8) == 0 || 199 strncmp(buf, "iso-8859-1-", 11) == 0 || 200 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; 201 else return s; 202} 203 204/* Return the coding spec in S, or NULL if none is found. */ 205 206static char * 207get_coding_spec(const char *s, int size) 208{ 209 int i; 210 for (i = 0; i < size - 6; i++) { /* XXX inefficient search */ 211 const char* t = s + i; 212 if (strncmp(t, "coding", 6) == 0) { 213 const char* begin = NULL; 214 t += 6; 215 if (t[0] != ':' && t[0] != '=') 216 continue; 217 do { 218 t++; 219 } while (t[0] == '\x20' || t[0] == '\t'); 220 221 begin = t; 222 while (isalnum(t[0]) || t[0] == '-' || t[0] == '_' || 223 t[0] == '.') 224 t++; 225 226 if (begin < t) { 227 char* r = new_string(begin, t - begin); 228 char* q = get_normal_name(r); 229 if (r != q) { 230 PyMem_DEL(r); 231 r = new_string(q, strlen(q)); 232 } 233 return r; 234 } 235 } 236 } 237 return NULL; 238} 239 240/* Check whether the line contains a coding spec. If it does, 241 invoke the set_readline function for the new encoding. 242 This function receives the tok_state and the new encoding. 243 Return 1 on success, 0 on failure. 
*/ 244 245static int 246check_coding_spec(const char* line, int size, struct tok_state *tok, 247 int set_readline(struct tok_state *, const char *)) 248{ 249 int r = 1; 250 char* cs = get_coding_spec(line, size); 251 if (cs != NULL) { 252 tok->read_coding_spec = 1; 253 if (tok->encoding == NULL) { 254 assert(tok->decoding_state == 1); /* raw */ 255 if (strcmp(cs, "utf-8") == 0 || 256 strcmp(cs, "iso-8859-1") == 0) { 257 tok->encoding = cs; 258 } else { 259#ifdef Py_USING_UNICODE 260 r = set_readline(tok, cs); 261 if (r) { 262 tok->encoding = cs; 263 tok->decoding_state = -1; 264 } 265#else 266 /* Without Unicode support, we cannot 267 process the coding spec. Since there 268 won't be any Unicode literals, that 269 won't matter. */ 270#endif 271 } 272 } else { /* then, compare cs with BOM */ 273 r = (strcmp(tok->encoding, cs) == 0); 274 PyMem_DEL(cs); 275 } 276 } 277 return r; 278} 279 280/* See whether the file starts with a BOM. If it does, 281 invoke the set_readline function with the new encoding. 282 Return 1 on success, 0 on failure. */ 283 284static int 285check_bom(int get_char(struct tok_state *), 286 void unget_char(int, struct tok_state *), 287 int set_readline(struct tok_state *, const char *), 288 struct tok_state *tok) 289{ 290 int ch = get_char(tok); 291 tok->decoding_state = 1; 292 if (ch == EOF) { 293 return 1; 294 } else if (ch == 0xEF) { 295 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM; 296 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM; 297#if 0 298 /* Disable support for UTF-16 BOMs until a decision 299 is made whether this needs to be supported. 
*/ 300 } else if (ch == 0xFE) { 301 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; 302 if (!set_readline(tok, "utf-16-be")) return 0; 303 tok->decoding_state = -1; 304 } else if (ch == 0xFF) { 305 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; 306 if (!set_readline(tok, "utf-16-le")) return 0; 307 tok->decoding_state = -1; 308#endif 309 } else { 310 unget_char(ch, tok); 311 return 1; 312 } 313 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 314 return 1; 315 NON_BOM: 316 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ 317 unget_char(0xFF, tok); /* XXX this will cause a syntax error */ 318 return 1; 319} 320 321/* Read a line of text from TOK into S, using the stream in TOK. 322 Return NULL on failure, else S. */ 323 324static char * 325fp_readl(char *s, int size, struct tok_state *tok) 326{ 327#ifndef Py_USING_UNICODE 328 /* In a non-Unicode built, this should never be called. */ 329 Py_FatalError("fp_readl should not be called in this build."); 330 return NULL; 331#else 332 PyObject* utf8; 333 PyObject* buf = tok->decoding_buffer; 334 if (buf == NULL) { 335 buf = PyObject_CallObject(tok->decoding_readline, NULL); 336 if (buf == NULL) 337 return error_ret(tok); 338 } else { 339 tok->decoding_buffer = NULL; 340 } 341 utf8 = PyUnicode_AsUTF8String(buf); 342 Py_DECREF(buf); 343 if (utf8 == NULL) 344 return error_ret(tok); 345 else { 346 const char* str = PyString_AsString(utf8); 347 assert(strlen(str) < (size_t)size); /* XXX */ 348 strcpy(s, str); 349 Py_DECREF(utf8); 350 if (s[0] == '\0') return NULL; /* EOF */ 351 return s; 352 } 353#endif 354} 355 356/* Set the readline function for TOK to a StreamReader's 357 readline function. The StreamReader is named ENC. 358 359 This function is called from check_bom and check_coding_spec. 360 361 ENC is usually identical to the future value of tok->encoding, 362 except for the (currently unsupported) case of UTF-16. 363 364 Return 1 on success, 0 on failure. 
*/ 365 366static int 367fp_setreadl(struct tok_state *tok, const char* enc) 368{ 369 PyObject *reader, *stream, *readline; 370 371 stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL); 372 if (stream == NULL) 373 return 0; 374 375 reader = PyCodec_StreamReader(enc, stream, NULL); 376 Py_DECREF(stream); 377 if (reader == NULL) 378 return 0; 379 380 readline = PyObject_GetAttrString(reader, "readline"); 381 Py_DECREF(reader); 382 if (readline == NULL) 383 return 0; 384 385 tok->decoding_readline = readline; 386 return 1; 387} 388 389/* Fetch the next byte from TOK. */ 390 391static int fp_getc(struct tok_state *tok) { 392 return getc(tok->fp); 393} 394 395/* Unfetch the last byte back into TOK. */ 396 397static void fp_ungetc(int c, struct tok_state *tok) { 398 ungetc(c, tok->fp); 399} 400 401/* Read a line of input from TOK. Determine encoding 402 if necessary. */ 403 404static char * 405decoding_fgets(char *s, int size, struct tok_state *tok) 406{ 407 char *line = NULL; 408 int warn = 0, badchar = 0; 409 for (;;) { 410 if (tok->decoding_state < 0) { 411 /* We already have a codec associated with 412 this input. */ 413 line = fp_readl(s, size, tok); 414 break; 415 } else if (tok->decoding_state > 0) { 416 /* We want a 'raw' read. */ 417 line = Py_UniversalNewlineFgets(s, size, 418 tok->fp, NULL); 419 warn = 1; 420 break; 421 } else { 422 /* We have not yet determined the encoding. 423 If an encoding is found, use the file-pointer 424 reader functions from now on. 
*/ 425 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 426 return error_ret(tok); 427 assert(tok->decoding_state != 0); 428 } 429 } 430 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 431 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 432 return error_ret(tok); 433 } 434 } 435#ifndef PGEN 436 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) { 437 unsigned char *c; 438 for (c = (unsigned char *)line; *c; c++) 439 if (*c > 127) { 440 badchar = *c; 441 break; 442 } 443 } 444 if (badchar) { 445 char buf[200]; 446 sprintf(buf, "Non-ASCII character '\\x%.2x', " 447 "but no declared encoding", badchar); 448 /* Need to add 1 to the line number, since this line 449 has not been counted, yet. */ 450 PyErr_WarnExplicit(PyExc_DeprecationWarning, 451 buf, tok->filename, tok->lineno + 1, 452 NULL, NULL); 453 tok->issued_encoding_warning = 1; 454 } 455#endif 456 return line; 457} 458 459static int 460decoding_feof(struct tok_state *tok) 461{ 462 if (tok->decoding_state >= 0) { 463 return feof(tok->fp); 464 } else { 465 PyObject* buf = tok->decoding_buffer; 466 if (buf == NULL) { 467 buf = PyObject_CallObject(tok->decoding_readline, NULL); 468 if (buf == NULL) { 469 error_ret(tok); 470 return 1; 471 } else { 472 tok->decoding_buffer = buf; 473 } 474 } 475 return PyObject_Length(buf) == 0; 476 } 477} 478 479/* Fetch a byte from TOK, using the string buffer. */ 480 481static int buf_getc(struct tok_state *tok) { 482 return *tok->str++; 483} 484 485/* Unfetch a byte from TOK, using the string buffer. */ 486 487static void buf_ungetc(int c, struct tok_state *tok) { 488 tok->str--; 489 assert(*tok->str == c); /* tok->cur may point to read-only segment */ 490} 491 492/* Set the readline function for TOK to ENC. For the string-based 493 tokenizer, this means to just record the encoding. 
*/ 494 495static int buf_setreadl(struct tok_state *tok, const char* enc) { 496 tok->enc = enc; 497 return 1; 498} 499 500/* Return a UTF-8 encoding Python string object from the 501 C byte string STR, which is encoded with ENC. */ 502 503#ifdef Py_USING_UNICODE 504static PyObject * 505translate_into_utf8(const char* str, const char* enc) { 506 PyObject *utf8; 507 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 508 if (buf == NULL) 509 return NULL; 510 utf8 = PyUnicode_AsUTF8String(buf); 511 Py_DECREF(buf); 512 return utf8; 513} 514#endif 515 516/* Decode a byte string STR for use as the buffer of TOK. 517 Look for encoding declarations inside STR, and record them 518 inside TOK. */ 519 520static const char * 521decode_str(const char *str, struct tok_state *tok) 522{ 523 PyObject* utf8 = NULL; 524 const char *s; 525 int lineno = 0; 526 tok->enc = NULL; 527 tok->str = str; 528 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 529 return NULL; 530 str = tok->str; /* string after BOM if any */ 531 assert(str); 532#ifdef Py_USING_UNICODE 533 if (tok->enc != NULL) { 534 utf8 = translate_into_utf8(str, tok->enc); 535 if (utf8 == NULL) 536 return NULL; 537 str = PyString_AsString(utf8); 538 } 539#endif 540 for (s = str;; s++) { 541 if (*s == '\0') break; 542 else if (*s == '\n') { 543 lineno++; 544 if (lineno == 2) break; 545 } 546 } 547 tok->enc = NULL; 548 if (!check_coding_spec(str, s - str, tok, buf_setreadl)) 549 return NULL; 550#ifdef Py_USING_UNICODE 551 if (tok->enc != NULL) { 552 assert(utf8 == NULL); 553 utf8 = translate_into_utf8(str, tok->enc); 554 if (utf8 == NULL) 555 return NULL; 556 str = PyString_AsString(utf8); 557 } 558#endif 559 assert(tok->decoding_buffer == NULL); 560 tok->decoding_buffer = utf8; /* CAUTION */ 561 return str; 562} 563 564#endif /* PGEN */ 565 566/* Set up tokenizer for string */ 567 568struct tok_state * 569PyTokenizer_FromString(char *str) 570{ 571 struct tok_state *tok = tok_new(); 572 if (tok == NULL) 573 
return NULL; 574 str = (char *)decode_str(str, tok); 575 if (str == NULL) 576 return NULL; 577 tok->buf = tok->cur = tok->end = tok->inp = str; 578 return tok; 579} 580 581 582/* Set up tokenizer for file */ 583 584struct tok_state * 585PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) 586{ 587 struct tok_state *tok = tok_new(); 588 if (tok == NULL) 589 return NULL; 590 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) { 591 PyMem_DEL(tok); 592 return NULL; 593 } 594 tok->cur = tok->inp = tok->buf; 595 tok->end = tok->buf + BUFSIZ; 596 tok->fp = fp; 597 tok->prompt = ps1; 598 tok->nextprompt = ps2; 599 return tok; 600} 601 602 603/* Free a tok_state structure */ 604 605void 606PyTokenizer_Free(struct tok_state *tok) 607{ 608 if (tok->encoding != NULL) 609 PyMem_DEL(tok->encoding); 610#ifndef PGEN 611 Py_XDECREF(tok->decoding_readline); 612 Py_XDECREF(tok->decoding_buffer); 613#endif 614 if (tok->fp != NULL && tok->buf != NULL) 615 PyMem_DEL(tok->buf); 616 PyMem_DEL(tok); 617} 618 619 620/* Get next char, updating state; error code goes into tok->done */ 621 622static int 623tok_nextc(register struct tok_state *tok) 624{ 625 for (;;) { 626 if (tok->cur != tok->inp) { 627 return Py_CHARMASK(*tok->cur++); /* Fast path */ 628 } 629 if (tok->done != E_OK) 630 return EOF; 631 if (tok->fp == NULL) { 632 char *end = strchr(tok->inp, '\n'); 633 if (end != NULL) 634 end++; 635 else { 636 end = strchr(tok->inp, '\0'); 637 if (end == tok->inp) { 638 tok->done = E_EOF; 639 return EOF; 640 } 641 } 642 if (tok->start == NULL) 643 tok->buf = tok->cur; 644 tok->lineno++; 645 tok->inp = end; 646 return Py_CHARMASK(*tok->cur++); 647 } 648 if (tok->prompt != NULL) { 649 char *new = PyOS_Readline(tok->prompt); 650 if (tok->nextprompt != NULL) 651 tok->prompt = tok->nextprompt; 652 if (new == NULL) 653 tok->done = E_INTR; 654 else if (*new == '\0') { 655 PyMem_FREE(new); 656 tok->done = E_EOF; 657 } 658 else if (tok->start != NULL) { 659 size_t start = tok->start - tok->buf; 660 
size_t oldlen = tok->cur - tok->buf; 661 size_t newlen = oldlen + strlen(new); 662 char *buf = tok->buf; 663 PyMem_RESIZE(buf, char, newlen+1); 664 tok->lineno++; 665 if (buf == NULL) { 666 PyMem_DEL(tok->buf); 667 tok->buf = NULL; 668 PyMem_FREE(new); 669 tok->done = E_NOMEM; 670 return EOF; 671 } 672 tok->buf = buf; 673 tok->cur = tok->buf + oldlen; 674 strcpy(tok->buf + oldlen, new); 675 PyMem_FREE(new); 676 tok->inp = tok->buf + newlen; 677 tok->end = tok->inp + 1; 678 tok->start = tok->buf + start; 679 } 680 else { 681 tok->lineno++; 682 if (tok->buf != NULL) 683 PyMem_DEL(tok->buf); 684 tok->buf = new; 685 tok->cur = tok->buf; 686 tok->inp = strchr(tok->buf, '\0'); 687 tok->end = tok->inp + 1; 688 } 689 } 690 else { 691 int done = 0; 692 int cur = 0; 693 char *pt; 694 if (tok->start == NULL) { 695 if (tok->buf == NULL) { 696 tok->buf = PyMem_NEW(char, BUFSIZ); 697 if (tok->buf == NULL) { 698 tok->done = E_NOMEM; 699 return EOF; 700 } 701 tok->end = tok->buf + BUFSIZ; 702 } 703 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 704 tok) == NULL) { 705 tok->done = E_EOF; 706 done = 1; 707 } 708 else { 709 tok->done = E_OK; 710 tok->inp = strchr(tok->buf, '\0'); 711 done = tok->inp[-1] == '\n'; 712 } 713 } 714 else { 715 cur = tok->cur - tok->buf; 716 if (decoding_feof(tok)) { 717 tok->done = E_EOF; 718 done = 1; 719 } 720 else 721 tok->done = E_OK; 722 } 723 tok->lineno++; 724 /* Read until '\n' or EOF */ 725 while (!done) { 726 int curstart = tok->start == NULL ? -1 : 727 tok->start - tok->buf; 728 int curvalid = tok->inp - tok->buf; 729 int newsize = curvalid + BUFSIZ; 730 char *newbuf = tok->buf; 731 PyMem_RESIZE(newbuf, char, newsize); 732 if (newbuf == NULL) { 733 tok->done = E_NOMEM; 734 tok->cur = tok->inp; 735 return EOF; 736 } 737 tok->buf = newbuf; 738 tok->inp = tok->buf + curvalid; 739 tok->end = tok->buf + newsize; 740 tok->start = curstart < 0 ? 
NULL : 741 tok->buf + curstart; 742 if (decoding_fgets(tok->inp, 743 (int)(tok->end - tok->inp), 744 tok) == NULL) { 745 /* Last line does not end in \n, 746 fake one */ 747 strcpy(tok->inp, "\n"); 748 } 749 tok->inp = strchr(tok->inp, '\0'); 750 done = tok->inp[-1] == '\n'; 751 } 752 tok->cur = tok->buf + cur; 753#ifndef macintosh 754 /* replace "\r\n" with "\n" */ 755 /* For Mac we leave the \r, giving a syntax error */ 756 pt = tok->inp - 2; 757 if (pt >= tok->buf && *pt == '\r') { 758 *pt++ = '\n'; 759 *pt = '\0'; 760 tok->inp = pt; 761 } 762#endif 763 } 764 if (tok->done != E_OK) { 765 if (tok->prompt != NULL) 766 PySys_WriteStderr("\n"); 767 tok->cur = tok->inp; 768 return EOF; 769 } 770 } 771 /*NOTREACHED*/ 772} 773 774 775/* Back-up one character */ 776 777static void 778tok_backup(register struct tok_state *tok, register int c) 779{ 780 if (c != EOF) { 781 if (--tok->cur < tok->buf) 782 Py_FatalError("tok_backup: begin of buffer"); 783 if (*tok->cur != c) 784 *tok->cur = c; 785 } 786} 787 788 789/* Return the token corresponding to a single character */ 790 791int 792PyToken_OneChar(int c) 793{ 794 switch (c) { 795 case '(': return LPAR; 796 case ')': return RPAR; 797 case '[': return LSQB; 798 case ']': return RSQB; 799 case ':': return COLON; 800 case ',': return COMMA; 801 case ';': return SEMI; 802 case '+': return PLUS; 803 case '-': return MINUS; 804 case '*': return STAR; 805 case '/': return SLASH; 806 case '|': return VBAR; 807 case '&': return AMPER; 808 case '<': return LESS; 809 case '>': return GREATER; 810 case '=': return EQUAL; 811 case '.': return DOT; 812 case '%': return PERCENT; 813 case '`': return BACKQUOTE; 814 case '{': return LBRACE; 815 case '}': return RBRACE; 816 case '^': return CIRCUMFLEX; 817 case '~': return TILDE; 818 default: return OP; 819 } 820} 821 822 823int 824PyToken_TwoChars(int c1, int c2) 825{ 826 switch (c1) { 827 case '=': 828 switch (c2) { 829 case '=': return EQEQUAL; 830 } 831 break; 832 case '!': 833 switch 
(c2) { 834 case '=': return NOTEQUAL; 835 } 836 break; 837 case '<': 838 switch (c2) { 839 case '>': return NOTEQUAL; 840 case '=': return LESSEQUAL; 841 case '<': return LEFTSHIFT; 842 } 843 break; 844 case '>': 845 switch (c2) { 846 case '=': return GREATEREQUAL; 847 case '>': return RIGHTSHIFT; 848 } 849 break; 850 case '+': 851 switch (c2) { 852 case '=': return PLUSEQUAL; 853 } 854 break; 855 case '-': 856 switch (c2) { 857 case '=': return MINEQUAL; 858 } 859 break; 860 case '*': 861 switch (c2) { 862 case '*': return DOUBLESTAR; 863 case '=': return STAREQUAL; 864 } 865 break; 866 case '/': 867 switch (c2) { 868 case '/': return DOUBLESLASH; 869 case '=': return SLASHEQUAL; 870 } 871 break; 872 case '|': 873 switch (c2) { 874 case '=': return VBAREQUAL; 875 } 876 break; 877 case '%': 878 switch (c2) { 879 case '=': return PERCENTEQUAL; 880 } 881 break; 882 case '&': 883 switch (c2) { 884 case '=': return AMPEREQUAL; 885 } 886 break; 887 case '^': 888 switch (c2) { 889 case '=': return CIRCUMFLEXEQUAL; 890 } 891 break; 892 } 893 return OP; 894} 895 896int 897PyToken_ThreeChars(int c1, int c2, int c3) 898{ 899 switch (c1) { 900 case '<': 901 switch (c2) { 902 case '<': 903 switch (c3) { 904 case '=': 905 return LEFTSHIFTEQUAL; 906 } 907 break; 908 } 909 break; 910 case '>': 911 switch (c2) { 912 case '>': 913 switch (c3) { 914 case '=': 915 return RIGHTSHIFTEQUAL; 916 } 917 break; 918 } 919 break; 920 case '*': 921 switch (c2) { 922 case '*': 923 switch (c3) { 924 case '=': 925 return DOUBLESTAREQUAL; 926 } 927 break; 928 } 929 break; 930 case '/': 931 switch (c2) { 932 case '/': 933 switch (c3) { 934 case '=': 935 return DOUBLESLASHEQUAL; 936 } 937 break; 938 } 939 break; 940 } 941 return OP; 942} 943 944static int 945indenterror(struct tok_state *tok) 946{ 947 if (tok->alterror) { 948 tok->done = E_TABSPACE; 949 tok->cur = tok->inp; 950 return 1; 951 } 952 if (tok->altwarning) { 953 PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 954 "in 
indentation\n", tok->filename); 955 tok->altwarning = 0; 956 } 957 return 0; 958} 959 960 961/* Get next token, after space stripping etc. */ 962 963static int 964tok_get(register struct tok_state *tok, char **p_start, char **p_end) 965{ 966 register int c; 967 int blankline; 968 969 *p_start = *p_end = NULL; 970 nextline: 971 tok->start = NULL; 972 blankline = 0; 973 974 /* Get indentation level */ 975 if (tok->atbol) { 976 register int col = 0; 977 register int altcol = 0; 978 tok->atbol = 0; 979 for (;;) { 980 c = tok_nextc(tok); 981 if (c == ' ') 982 col++, altcol++; 983 else if (c == '\t') { 984 col = (col/tok->tabsize + 1) * tok->tabsize; 985 altcol = (altcol/tok->alttabsize + 1) 986 * tok->alttabsize; 987 } 988 else if (c == '\014') /* Control-L (formfeed) */ 989 col = altcol = 0; /* For Emacs users */ 990 else 991 break; 992 } 993 tok_backup(tok, c); 994 if (c == '#' || c == '\n') { 995 /* Lines with only whitespace and/or comments 996 shouldn't affect the indentation and are 997 not passed to the parser as NEWLINE tokens, 998 except *totally* empty lines in interactive 999 mode, which signal the end of a command group. 
*/ 1000 if (col == 0 && c == '\n' && tok->prompt != NULL) 1001 blankline = 0; /* Let it through */ 1002 else 1003 blankline = 1; /* Ignore completely */ 1004 /* We can't jump back right here since we still 1005 may need to skip to the end of a comment */ 1006 } 1007 if (!blankline && tok->level == 0) { 1008 if (col == tok->indstack[tok->indent]) { 1009 /* No change */ 1010 if (altcol != tok->altindstack[tok->indent]) { 1011 if (indenterror(tok)) 1012 return ERRORTOKEN; 1013 } 1014 } 1015 else if (col > tok->indstack[tok->indent]) { 1016 /* Indent -- always one */ 1017 if (tok->indent+1 >= MAXINDENT) { 1018 tok->done = E_TOODEEP; 1019 tok->cur = tok->inp; 1020 return ERRORTOKEN; 1021 } 1022 if (altcol <= tok->altindstack[tok->indent]) { 1023 if (indenterror(tok)) 1024 return ERRORTOKEN; 1025 } 1026 tok->pendin++; 1027 tok->indstack[++tok->indent] = col; 1028 tok->altindstack[tok->indent] = altcol; 1029 } 1030 else /* col < tok->indstack[tok->indent] */ { 1031 /* Dedent -- any number, must be consistent */ 1032 while (tok->indent > 0 && 1033 col < tok->indstack[tok->indent]) { 1034 tok->pendin--; 1035 tok->indent--; 1036 } 1037 if (col != tok->indstack[tok->indent]) { 1038 tok->done = E_DEDENT; 1039 tok->cur = tok->inp; 1040 return ERRORTOKEN; 1041 } 1042 if (altcol != tok->altindstack[tok->indent]) { 1043 if (indenterror(tok)) 1044 return ERRORTOKEN; 1045 } 1046 } 1047 } 1048 } 1049 1050 tok->start = tok->cur; 1051 1052 /* Return pending indents/dedents */ 1053 if (tok->pendin != 0) { 1054 if (tok->pendin < 0) { 1055 tok->pendin++; 1056 return DEDENT; 1057 } 1058 else { 1059 tok->pendin--; 1060 return INDENT; 1061 } 1062 } 1063 1064 again: 1065 tok->start = NULL; 1066 /* Skip spaces */ 1067 do { 1068 c = tok_nextc(tok); 1069 } while (c == ' ' || c == '\t' || c == '\014'); 1070 1071 /* Set start of current token */ 1072 tok->start = tok->cur - 1; 1073 1074 /* Skip comment, while looking for tab-setting magic */ 1075 if (c == '#') { 1076 static char *tabforms[] = { 
1077 "tab-width:", /* Emacs */ 1078 ":tabstop=", /* vim, full form */ 1079 ":ts=", /* vim, abbreviated form */ 1080 "set tabsize=", /* will vi never die? */ 1081 /* more templates can be added here to support other editors */ 1082 }; 1083 char cbuf[80]; 1084 char *tp, **cp; 1085 tp = cbuf; 1086 do { 1087 *tp++ = c = tok_nextc(tok); 1088 } while (c != EOF && c != '\n' && 1089 tp - cbuf + 1 < sizeof(cbuf)); 1090 *tp = '\0'; 1091 for (cp = tabforms; 1092 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); 1093 cp++) { 1094 if ((tp = strstr(cbuf, *cp))) { 1095 int newsize = atoi(tp + strlen(*cp)); 1096 1097 if (newsize >= 1 && newsize <= 40) { 1098 tok->tabsize = newsize; 1099 if (Py_VerboseFlag) 1100 PySys_WriteStderr( 1101 "Tab size set to %d\n", 1102 newsize); 1103 } 1104 } 1105 } 1106 while (c != EOF && c != '\n') 1107 c = tok_nextc(tok); 1108 } 1109 1110 /* Check for EOF and errors now */ 1111 if (c == EOF) { 1112 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 1113 } 1114 1115 /* Identifier (most frequent token!) 
*/ 1116 if (isalpha(c) || c == '_') { 1117 /* Process r"", u"" and ur"" */ 1118 switch (c) { 1119 case 'r': 1120 case 'R': 1121 c = tok_nextc(tok); 1122 if (c == '"' || c == '\'') 1123 goto letter_quote; 1124 break; 1125 case 'u': 1126 case 'U': 1127 c = tok_nextc(tok); 1128 if (c == 'r' || c == 'R') 1129 c = tok_nextc(tok); 1130 if (c == '"' || c == '\'') 1131 goto letter_quote; 1132 break; 1133 } 1134 while (isalnum(c) || c == '_') { 1135 c = tok_nextc(tok); 1136 } 1137 tok_backup(tok, c); 1138 *p_start = tok->start; 1139 *p_end = tok->cur; 1140 return NAME; 1141 } 1142 1143 /* Newline */ 1144 if (c == '\n') { 1145 tok->atbol = 1; 1146 if (blankline || tok->level > 0) 1147 goto nextline; 1148 *p_start = tok->start; 1149 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 1150 return NEWLINE; 1151 } 1152 1153#ifdef macintosh 1154 if (c == '\r') { 1155 PySys_WriteStderr( 1156 "File contains \\r characters (incorrect line endings?)\n"); 1157 tok->done = E_TOKEN; 1158 tok->cur = tok->inp; 1159 return ERRORTOKEN; 1160 } 1161#endif 1162 /* Period or number starting with period? */ 1163 if (c == '.') { 1164 c = tok_nextc(tok); 1165 if (isdigit(c)) { 1166 goto fraction; 1167 } 1168 else { 1169 tok_backup(tok, c); 1170 *p_start = tok->start; 1171 *p_end = tok->cur; 1172 return DOT; 1173 } 1174 } 1175 1176 /* Number */ 1177 if (isdigit(c)) { 1178 if (c == '0') { 1179 /* Hex or octal -- maybe. 
*/ 1180 c = tok_nextc(tok); 1181 if (c == '.') 1182 goto fraction; 1183#ifndef WITHOUT_COMPLEX 1184 if (c == 'j' || c == 'J') 1185 goto imaginary; 1186#endif 1187 if (c == 'x' || c == 'X') { 1188 /* Hex */ 1189 do { 1190 c = tok_nextc(tok); 1191 } while (isxdigit(c)); 1192 } 1193 else { 1194 int found_decimal = 0; 1195 /* Octal; c is first char of it */ 1196 /* There's no 'isoctdigit' macro, sigh */ 1197 while ('0' <= c && c < '8') { 1198 c = tok_nextc(tok); 1199 } 1200 if (isdigit(c)) { 1201 found_decimal = 1; 1202 do { 1203 c = tok_nextc(tok); 1204 } while (isdigit(c)); 1205 } 1206 if (c == '.') 1207 goto fraction; 1208 else if (c == 'e' || c == 'E') 1209 goto exponent; 1210#ifndef WITHOUT_COMPLEX 1211 else if (c == 'j' || c == 'J') 1212 goto imaginary; 1213#endif 1214 else if (found_decimal) { 1215 tok->done = E_TOKEN; 1216 tok_backup(tok, c); 1217 return ERRORTOKEN; 1218 } 1219 } 1220 if (c == 'l' || c == 'L') 1221 c = tok_nextc(tok); 1222 } 1223 else { 1224 /* Decimal */ 1225 do { 1226 c = tok_nextc(tok); 1227 } while (isdigit(c)); 1228 if (c == 'l' || c == 'L') 1229 c = tok_nextc(tok); 1230 else { 1231 /* Accept floating point numbers. 
*/ 1232 if (c == '.') { 1233 fraction: 1234 /* Fraction */ 1235 do { 1236 c = tok_nextc(tok); 1237 } while (isdigit(c)); 1238 } 1239 if (c == 'e' || c == 'E') { 1240 exponent: 1241 /* Exponent part */ 1242 c = tok_nextc(tok); 1243 if (c == '+' || c == '-') 1244 c = tok_nextc(tok); 1245 if (!isdigit(c)) { 1246 tok->done = E_TOKEN; 1247 tok_backup(tok, c); 1248 return ERRORTOKEN; 1249 } 1250 do { 1251 c = tok_nextc(tok); 1252 } while (isdigit(c)); 1253 } 1254#ifndef WITHOUT_COMPLEX 1255 if (c == 'j' || c == 'J') 1256 /* Imaginary part */ 1257 imaginary: 1258 c = tok_nextc(tok); 1259#endif 1260 } 1261 } 1262 tok_backup(tok, c); 1263 *p_start = tok->start; 1264 *p_end = tok->cur; 1265 return NUMBER; 1266 } 1267 1268 letter_quote: 1269 /* String */ 1270 if (c == '\'' || c == '"') { 1271 int quote2 = tok->cur - tok->start + 1; 1272 int quote = c; 1273 int triple = 0; 1274 int tripcount = 0; 1275 for (;;) { 1276 c = tok_nextc(tok); 1277 if (c == '\n') { 1278 if (!triple) { 1279 tok->done = E_EOLS; 1280 tok_backup(tok, c); 1281 return ERRORTOKEN; 1282 } 1283 tripcount = 0; 1284 } 1285 else if (c == EOF) { 1286 if (triple) 1287 tok->done = E_EOFS; 1288 else 1289 tok->done = E_EOLS; 1290 tok->cur = tok->inp; 1291 return ERRORTOKEN; 1292 } 1293 else if (c == quote) { 1294 tripcount++; 1295 if (tok->cur - tok->start == quote2) { 1296 c = tok_nextc(tok); 1297 if (c == quote) { 1298 triple = 1; 1299 tripcount = 0; 1300 continue; 1301 } 1302 tok_backup(tok, c); 1303 } 1304 if (!triple || tripcount == 3) 1305 break; 1306 } 1307 else if (c == '\\') { 1308 tripcount = 0; 1309 c = tok_nextc(tok); 1310 if (c == EOF) { 1311 tok->done = E_EOLS; 1312 tok->cur = tok->inp; 1313 return ERRORTOKEN; 1314 } 1315 } 1316 else 1317 tripcount = 0; 1318 } 1319 *p_start = tok->start; 1320 *p_end = tok->cur; 1321 return STRING; 1322 } 1323 1324 /* Line continuation */ 1325 if (c == '\\') { 1326 c = tok_nextc(tok); 1327 if (c != '\n') { 1328 tok->done = E_TOKEN; 1329 tok->cur = tok->inp; 1330 return 
ERRORTOKEN; 1331 } 1332 goto again; /* Read next line */ 1333 } 1334 1335 /* Check for two-character token */ 1336 { 1337 int c2 = tok_nextc(tok); 1338 int token = PyToken_TwoChars(c, c2); 1339 if (token != OP) { 1340 int c3 = tok_nextc(tok); 1341 int token3 = PyToken_ThreeChars(c, c2, c3); 1342 if (token3 != OP) { 1343 token = token3; 1344 } else { 1345 tok_backup(tok, c3); 1346 } 1347 *p_start = tok->start; 1348 *p_end = tok->cur; 1349 return token; 1350 } 1351 tok_backup(tok, c2); 1352 } 1353 1354 /* Keep track of parentheses nesting level */ 1355 switch (c) { 1356 case '(': 1357 case '[': 1358 case '{': 1359 tok->level++; 1360 break; 1361 case ')': 1362 case ']': 1363 case '}': 1364 tok->level--; 1365 break; 1366 } 1367 1368 /* Punctuation character */ 1369 *p_start = tok->start; 1370 *p_end = tok->cur; 1371 return PyToken_OneChar(c); 1372} 1373 1374int 1375PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1376{ 1377 int result = tok_get(tok, p_start, p_end); 1378 if (tok->decoding_erred) { 1379 result = ERRORTOKEN; 1380 tok->done = E_DECODE; 1381 } 1382 return result; 1383} 1384 1385#ifdef Py_DEBUG 1386 1387void 1388tok_dump(int type, char *start, char *end) 1389{ 1390 printf("%s", _PyParser_TokenNames[type]); 1391 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1392 printf("(%.*s)", (int)(end - start), start); 1393} 1394 1395#endif 1396