/* tokenizer.c -- revision 49c5da1d88f605248167f4d95b1dfe08c1f703c7 */
1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "stringobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#endif /* PGEN */ 20 21extern char *PyOS_Readline(FILE *, FILE *, char *); 22/* Return malloc'ed string including trailing \n; 23 empty malloc'ed string for EOF; 24 NULL if interrupted */ 25 26/* Don't ever change this -- it would break the portability of Python code */ 27#define TABSIZE 8 28 29/* Convert a possibly signed character to a nonnegative int */ 30/* XXX This assumes characters are 8 bits wide */ 31#ifdef __CHAR_UNSIGNED__ 32#define Py_CHARMASK(c) (c) 33#else 34#define Py_CHARMASK(c) ((c) & 0xff) 35#endif 36 37/* Forward */ 38static struct tok_state *tok_new(void); 39static int tok_nextc(struct tok_state *tok); 40static void tok_backup(struct tok_state *tok, int c); 41 42/* Token names */ 43 44char *_PyParser_TokenNames[] = { 45 "ENDMARKER", 46 "NAME", 47 "NUMBER", 48 "STRING", 49 "NEWLINE", 50 "INDENT", 51 "DEDENT", 52 "LPAR", 53 "RPAR", 54 "LSQB", 55 "RSQB", 56 "COLON", 57 "COMMA", 58 "SEMI", 59 "PLUS", 60 "MINUS", 61 "STAR", 62 "SLASH", 63 "VBAR", 64 "AMPER", 65 "LESS", 66 "GREATER", 67 "EQUAL", 68 "DOT", 69 "PERCENT", 70 "BACKQUOTE", 71 "LBRACE", 72 "RBRACE", 73 "EQEQUAL", 74 "NOTEQUAL", 75 "LESSEQUAL", 76 "GREATEREQUAL", 77 "TILDE", 78 "CIRCUMFLEX", 79 "LEFTSHIFT", 80 "RIGHTSHIFT", 81 "DOUBLESTAR", 82 "PLUSEQUAL", 83 "MINEQUAL", 84 "STAREQUAL", 85 "SLASHEQUAL", 86 "PERCENTEQUAL", 87 "AMPEREQUAL", 88 "VBAREQUAL", 89 "CIRCUMFLEXEQUAL", 90 "LEFTSHIFTEQUAL", 91 "RIGHTSHIFTEQUAL", 92 "DOUBLESTAREQUAL", 93 "DOUBLESLASH", 94 "DOUBLESLASHEQUAL", 95 "AT", 96 /* This table must match the #defines in token.h! 
*/ 97 "OP", 98 "<ERRORTOKEN>", 99 "<N_TOKENS>" 100}; 101 102 103/* Create and initialize a new tok_state structure */ 104 105static struct tok_state * 106tok_new(void) 107{ 108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1); 109 if (tok == NULL) 110 return NULL; 111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 112 tok->done = E_OK; 113 tok->fp = NULL; 114 tok->tabsize = TABSIZE; 115 tok->indent = 0; 116 tok->indstack[0] = 0; 117 tok->atbol = 1; 118 tok->pendin = 0; 119 tok->prompt = tok->nextprompt = NULL; 120 tok->lineno = 0; 121 tok->level = 0; 122 tok->filename = NULL; 123 tok->altwarning = 0; 124 tok->alterror = 0; 125 tok->alttabsize = 1; 126 tok->altindstack[0] = 0; 127 tok->decoding_state = 0; 128 tok->decoding_erred = 0; 129 tok->read_coding_spec = 0; 130 tok->encoding = NULL; 131 tok->cont_line = 0; 132#ifndef PGEN 133 tok->decoding_readline = NULL; 134 tok->decoding_buffer = NULL; 135#endif 136 return tok; 137} 138 139#ifdef PGEN 140 141static char * 142decoding_fgets(char *s, int size, struct tok_state *tok) 143{ 144 return fgets(s, size, tok->fp); 145} 146 147static int 148decoding_feof(struct tok_state *tok) 149{ 150 return feof(tok->fp); 151} 152 153static const char * 154decode_str(const char *str, struct tok_state *tok) 155{ 156 return str; 157} 158 159#else /* PGEN */ 160 161static char * 162error_ret(struct tok_state *tok) /* XXX */ 163{ 164 tok->decoding_erred = 1; 165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 166 PyMem_DEL(tok->buf); 167 tok->buf = NULL; 168 return NULL; /* as if it were EOF */ 169} 170 171static char * 172new_string(const char *s, Py_ssize_t len) 173{ 174 char* result = PyMem_NEW(char, len + 1); 175 if (result != NULL) { 176 memcpy(result, s, len); 177 result[len] = '\0'; 178 } 179 return result; 180} 181 182static char * 183get_normal_name(char *s) /* for utf-8 and latin-1 */ 184{ 185 char buf[13]; 186 int i; 187 for (i = 0; i < 12; i++) { 188 int c = s[i]; 189 if (c == '\0') 
break; 190 else if (c == '_') buf[i] = '-'; 191 else buf[i] = tolower(c); 192 } 193 buf[i] = '\0'; 194 if (strcmp(buf, "utf-8") == 0 || 195 strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; 196 else if (strcmp(buf, "latin-1") == 0 || 197 strcmp(buf, "iso-8859-1") == 0 || 198 strcmp(buf, "iso-latin-1") == 0 || 199 strncmp(buf, "latin-1-", 8) == 0 || 200 strncmp(buf, "iso-8859-1-", 11) == 0 || 201 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; 202 else return s; 203} 204 205/* Return the coding spec in S, or NULL if none is found. */ 206 207static char * 208get_coding_spec(const char *s, Py_ssize_t size) 209{ 210 Py_ssize_t i; 211 /* Coding spec must be in a comment, and that comment must be 212 * the only statement on the source code line. */ 213 for (i = 0; i < size - 6; i++) { 214 if (s[i] == '#') 215 break; 216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 217 return NULL; 218 } 219 for (; i < size - 6; i++) { /* XXX inefficient search */ 220 const char* t = s + i; 221 if (strncmp(t, "coding", 6) == 0) { 222 const char* begin = NULL; 223 t += 6; 224 if (t[0] != ':' && t[0] != '=') 225 continue; 226 do { 227 t++; 228 } while (t[0] == '\x20' || t[0] == '\t'); 229 230 begin = t; 231 while (isalnum(Py_CHARMASK(t[0])) || 232 t[0] == '-' || t[0] == '_' || t[0] == '.') 233 t++; 234 235 if (begin < t) { 236 char* r = new_string(begin, t - begin); 237 char* q = get_normal_name(r); 238 if (r != q) { 239 PyMem_DEL(r); 240 r = new_string(q, strlen(q)); 241 } 242 return r; 243 } 244 } 245 } 246 return NULL; 247} 248 249/* Check whether the line contains a coding spec. If it does, 250 invoke the set_readline function for the new encoding. 251 This function receives the tok_state and the new encoding. 252 Return 1 on success, 0 on failure. 
*/ 253 254static int 255check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, 256 int set_readline(struct tok_state *, const char *)) 257{ 258 char * cs; 259 int r = 1; 260 261 if (tok->cont_line) 262 /* It's a continuation line, so it can't be a coding spec. */ 263 return 1; 264 cs = get_coding_spec(line, size); 265 if (cs != NULL) { 266 tok->read_coding_spec = 1; 267 if (tok->encoding == NULL) { 268 assert(tok->decoding_state == 1); /* raw */ 269 if (strcmp(cs, "utf-8") == 0 || 270 strcmp(cs, "iso-8859-1") == 0) { 271 tok->encoding = cs; 272 } else { 273#ifdef Py_USING_UNICODE 274 r = set_readline(tok, cs); 275 if (r) { 276 tok->encoding = cs; 277 tok->decoding_state = -1; 278 } 279 else 280 PyMem_DEL(cs); 281#else 282 /* Without Unicode support, we cannot 283 process the coding spec. Since there 284 won't be any Unicode literals, that 285 won't matter. */ 286 PyMem_DEL(cs); 287#endif 288 } 289 } else { /* then, compare cs with BOM */ 290 r = (strcmp(tok->encoding, cs) == 0); 291 PyMem_DEL(cs); 292 } 293 } 294 if (!r) { 295 cs = tok->encoding; 296 if (!cs) 297 cs = "with BOM"; 298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); 299 } 300 return r; 301} 302 303/* See whether the file starts with a BOM. If it does, 304 invoke the set_readline function with the new encoding. 305 Return 1 on success, 0 on failure. */ 306 307static int 308check_bom(int get_char(struct tok_state *), 309 void unget_char(int, struct tok_state *), 310 int set_readline(struct tok_state *, const char *), 311 struct tok_state *tok) 312{ 313 int ch = get_char(tok); 314 tok->decoding_state = 1; 315 if (ch == EOF) { 316 return 1; 317 } else if (ch == 0xEF) { 318 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM; 319 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM; 320#if 0 321 /* Disable support for UTF-16 BOMs until a decision 322 is made whether this needs to be supported. 
*/ 323 } else if (ch == 0xFE) { 324 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; 325 if (!set_readline(tok, "utf-16-be")) return 0; 326 tok->decoding_state = -1; 327 } else if (ch == 0xFF) { 328 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; 329 if (!set_readline(tok, "utf-16-le")) return 0; 330 tok->decoding_state = -1; 331#endif 332 } else { 333 unget_char(ch, tok); 334 return 1; 335 } 336 if (tok->encoding != NULL) 337 PyMem_DEL(tok->encoding); 338 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 339 return 1; 340 NON_BOM: 341 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ 342 unget_char(0xFF, tok); /* XXX this will cause a syntax error */ 343 return 1; 344} 345 346/* Read a line of text from TOK into S, using the stream in TOK. 347 Return NULL on failure, else S. 348 349 On entry, tok->decoding_buffer will be one of: 350 1) NULL: need to call tok->decoding_readline to get a new line 351 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 352 stored the result in tok->decoding_buffer 353 3) PyStringObject *: previous call to fp_readl did not have enough room 354 (in the s buffer) to copy entire contents of the line read 355 by tok->decoding_readline. tok->decoding_buffer has the overflow. 356 In this case, fp_readl is called in a loop (with an expanded buffer) 357 until the buffer ends with a '\n' (or until the end of the file is 358 reached): see tok_nextc and its calls to decoding_fgets. 359*/ 360 361static char * 362fp_readl(char *s, int size, struct tok_state *tok) 363{ 364#ifndef Py_USING_UNICODE 365 /* In a non-Unicode built, this should never be called. 
*/ 366 Py_FatalError("fp_readl should not be called in this build."); 367 return NULL; /* Keep compiler happy (not reachable) */ 368#else 369 PyObject* utf8 = NULL; 370 PyObject* buf = tok->decoding_buffer; 371 char *str; 372 Py_ssize_t utf8len; 373 374 /* Ask for one less byte so we can terminate it */ 375 assert(size > 0); 376 size--; 377 378 if (buf == NULL) { 379 buf = PyObject_CallObject(tok->decoding_readline, NULL); 380 if (buf == NULL) 381 return error_ret(tok); 382 } else { 383 tok->decoding_buffer = NULL; 384 if (PyString_CheckExact(buf)) 385 utf8 = buf; 386 } 387 if (utf8 == NULL) { 388 utf8 = PyUnicode_AsUTF8String(buf); 389 Py_DECREF(buf); 390 if (utf8 == NULL) 391 return error_ret(tok); 392 } 393 str = PyString_AsString(utf8); 394 utf8len = PyString_GET_SIZE(utf8); 395 if (utf8len > size) { 396 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); 397 if (tok->decoding_buffer == NULL) { 398 Py_DECREF(utf8); 399 return error_ret(tok); 400 } 401 utf8len = size; 402 } 403 memcpy(s, str, utf8len); 404 s[utf8len] = '\0'; 405 Py_DECREF(utf8); 406 if (utf8len == 0) return NULL; /* EOF */ 407 return s; 408#endif 409} 410 411/* Set the readline function for TOK to a StreamReader's 412 readline function. The StreamReader is named ENC. 413 414 This function is called from check_bom and check_coding_spec. 415 416 ENC is usually identical to the future value of tok->encoding, 417 except for the (currently unsupported) case of UTF-16. 418 419 Return 1 on success, 0 on failure. */ 420 421static int 422fp_setreadl(struct tok_state *tok, const char* enc) 423{ 424 PyObject *reader, *stream, *readline; 425 426 /* XXX: constify filename argument. 
*/ 427 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); 428 if (stream == NULL) 429 return 0; 430 431 reader = PyCodec_StreamReader(enc, stream, NULL); 432 Py_DECREF(stream); 433 if (reader == NULL) 434 return 0; 435 436 readline = PyObject_GetAttrString(reader, "readline"); 437 Py_DECREF(reader); 438 if (readline == NULL) 439 return 0; 440 441 tok->decoding_readline = readline; 442 return 1; 443} 444 445/* Fetch the next byte from TOK. */ 446 447static int fp_getc(struct tok_state *tok) { 448 return getc(tok->fp); 449} 450 451/* Unfetch the last byte back into TOK. */ 452 453static void fp_ungetc(int c, struct tok_state *tok) { 454 ungetc(c, tok->fp); 455} 456 457/* Read a line of input from TOK. Determine encoding 458 if necessary. */ 459 460static char * 461decoding_fgets(char *s, int size, struct tok_state *tok) 462{ 463 char *line = NULL; 464 int badchar = 0; 465 for (;;) { 466 if (tok->decoding_state < 0) { 467 /* We already have a codec associated with 468 this input. */ 469 line = fp_readl(s, size, tok); 470 break; 471 } else if (tok->decoding_state > 0) { 472 /* We want a 'raw' read. */ 473 line = Py_UniversalNewlineFgets(s, size, 474 tok->fp, NULL); 475 break; 476 } else { 477 /* We have not yet determined the encoding. 478 If an encoding is found, use the file-pointer 479 reader functions from now on. */ 480 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 481 return error_ret(tok); 482 assert(tok->decoding_state != 0); 483 } 484 } 485 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 486 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 487 return error_ret(tok); 488 } 489 } 490#ifndef PGEN 491 /* The default encoding is ASCII, so make sure we don't have any 492 non-ASCII bytes in it. 
*/ 493 if (line && !tok->encoding) { 494 unsigned char *c; 495 for (c = (unsigned char *)line; *c; c++) 496 if (*c > 127) { 497 badchar = *c; 498 break; 499 } 500 } 501 if (badchar) { 502 char buf[500]; 503 /* Need to add 1 to the line number, since this line 504 has not been counted, yet. */ 505 sprintf(buf, 506 "Non-ASCII character '\\x%.2x' " 507 "in file %.200s on line %i, " 508 "but no encoding declared; " 509 "see http://www.python.org/peps/pep-0263.html for details", 510 badchar, tok->filename, tok->lineno + 1); 511 PyErr_SetString(PyExc_SyntaxError, buf); 512 return error_ret(tok); 513 } 514#endif 515 return line; 516} 517 518static int 519decoding_feof(struct tok_state *tok) 520{ 521 if (tok->decoding_state >= 0) { 522 return feof(tok->fp); 523 } else { 524 PyObject* buf = tok->decoding_buffer; 525 if (buf == NULL) { 526 buf = PyObject_CallObject(tok->decoding_readline, NULL); 527 if (buf == NULL) { 528 error_ret(tok); 529 return 1; 530 } else { 531 tok->decoding_buffer = buf; 532 } 533 } 534 return PyObject_Length(buf) == 0; 535 } 536} 537 538/* Fetch a byte from TOK, using the string buffer. */ 539 540static int buf_getc(struct tok_state *tok) { 541 return Py_CHARMASK(*tok->str++); 542} 543 544/* Unfetch a byte from TOK, using the string buffer. */ 545 546static void buf_ungetc(int c, struct tok_state *tok) { 547 tok->str--; 548 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 549} 550 551/* Set the readline function for TOK to ENC. For the string-based 552 tokenizer, this means to just record the encoding. */ 553 554static int buf_setreadl(struct tok_state *tok, const char* enc) { 555 tok->enc = enc; 556 return 1; 557} 558 559/* Return a UTF-8 encoding Python string object from the 560 C byte string STR, which is encoded with ENC. 
*/ 561 562#ifdef Py_USING_UNICODE 563static PyObject * 564translate_into_utf8(const char* str, const char* enc) { 565 PyObject *utf8; 566 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 567 if (buf == NULL) 568 return NULL; 569 utf8 = PyUnicode_AsUTF8String(buf); 570 Py_DECREF(buf); 571 return utf8; 572} 573#endif 574 575/* Decode a byte string STR for use as the buffer of TOK. 576 Look for encoding declarations inside STR, and record them 577 inside TOK. */ 578 579static const char * 580decode_str(const char *str, struct tok_state *tok) 581{ 582 PyObject* utf8 = NULL; 583 const char *s; 584 int lineno = 0; 585 tok->enc = NULL; 586 tok->str = str; 587 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 588 return error_ret(tok); 589 str = tok->str; /* string after BOM if any */ 590 assert(str); 591#ifdef Py_USING_UNICODE 592 if (tok->enc != NULL) { 593 utf8 = translate_into_utf8(str, tok->enc); 594 if (utf8 == NULL) 595 return error_ret(tok); 596 str = PyString_AsString(utf8); 597 } 598#endif 599 for (s = str;; s++) { 600 if (*s == '\0') break; 601 else if (*s == '\n') { 602 lineno++; 603 if (lineno == 2) break; 604 } 605 } 606 tok->enc = NULL; 607 if (!check_coding_spec(str, s - str, tok, buf_setreadl)) 608 return error_ret(tok); 609#ifdef Py_USING_UNICODE 610 if (tok->enc != NULL) { 611 assert(utf8 == NULL); 612 utf8 = translate_into_utf8(str, tok->enc); 613 if (utf8 == NULL) { 614 PyErr_Format(PyExc_SyntaxError, 615 "unknown encoding: %s", tok->enc); 616 return error_ret(tok); 617 } 618 str = PyString_AsString(utf8); 619 } 620#endif 621 assert(tok->decoding_buffer == NULL); 622 tok->decoding_buffer = utf8; /* CAUTION */ 623 return str; 624} 625 626#endif /* PGEN */ 627 628/* Set up tokenizer for string */ 629 630struct tok_state * 631PyTokenizer_FromString(const char *str) 632{ 633 struct tok_state *tok = tok_new(); 634 if (tok == NULL) 635 return NULL; 636 str = (char *)decode_str(str, tok); 637 if (str == NULL) { 638 PyTokenizer_Free(tok); 
639 return NULL; 640 } 641 642 /* XXX: constify members. */ 643 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 644 return tok; 645} 646 647 648/* Set up tokenizer for file */ 649 650struct tok_state * 651PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) 652{ 653 struct tok_state *tok = tok_new(); 654 if (tok == NULL) 655 return NULL; 656 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) { 657 PyTokenizer_Free(tok); 658 return NULL; 659 } 660 tok->cur = tok->inp = tok->buf; 661 tok->end = tok->buf + BUFSIZ; 662 tok->fp = fp; 663 tok->prompt = ps1; 664 tok->nextprompt = ps2; 665 return tok; 666} 667 668 669/* Free a tok_state structure */ 670 671void 672PyTokenizer_Free(struct tok_state *tok) 673{ 674 if (tok->encoding != NULL) 675 PyMem_DEL(tok->encoding); 676#ifndef PGEN 677 Py_XDECREF(tok->decoding_readline); 678 Py_XDECREF(tok->decoding_buffer); 679#endif 680 if (tok->fp != NULL && tok->buf != NULL) 681 PyMem_DEL(tok->buf); 682 PyMem_DEL(tok); 683} 684 685#if !defined(PGEN) && defined(Py_USING_UNICODE) 686static int 687tok_stdin_decode(struct tok_state *tok, char **inp) 688{ 689 PyObject *enc, *sysstdin, *decoded, *utf8; 690 const char *encoding; 691 char *converted; 692 693 if (PySys_GetFile((char *)"stdin", NULL) != stdin) 694 return 0; 695 sysstdin = PySys_GetObject("stdin"); 696 if (sysstdin == NULL || !PyFile_Check(sysstdin)) 697 return 0; 698 699 enc = ((PyFileObject *)sysstdin)->f_encoding; 700 if (enc == NULL || !PyString_Check(enc)) 701 return 0; 702 Py_INCREF(enc); 703 704 encoding = PyString_AsString(enc); 705 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); 706 if (decoded == NULL) 707 goto error_clear; 708 709 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); 710 Py_DECREF(decoded); 711 if (utf8 == NULL) 712 goto error_clear; 713 714 converted = new_string(PyString_AsString(utf8), PyString_Size(utf8)); 715 Py_DECREF(utf8); 716 if (converted == NULL) 717 goto error_nomem; 718 719 PyMem_FREE(*inp); 720 *inp = 
converted; 721 if (tok->encoding != NULL) 722 PyMem_DEL(tok->encoding); 723 tok->encoding = new_string(encoding, strlen(encoding)); 724 if (tok->encoding == NULL) 725 goto error_nomem; 726 727 Py_DECREF(enc); 728 return 0; 729 730error_nomem: 731 Py_DECREF(enc); 732 tok->done = E_NOMEM; 733 return -1; 734 735error_clear: 736 /* Fallback to iso-8859-1: for backward compatibility */ 737 Py_DECREF(enc); 738 PyErr_Clear(); 739 return 0; 740} 741#endif 742 743/* Get next char, updating state; error code goes into tok->done */ 744 745static int 746tok_nextc(register struct tok_state *tok) 747{ 748 for (;;) { 749 if (tok->cur != tok->inp) { 750 return Py_CHARMASK(*tok->cur++); /* Fast path */ 751 } 752 if (tok->done != E_OK) 753 return EOF; 754 if (tok->fp == NULL) { 755 char *end = strchr(tok->inp, '\n'); 756 if (end != NULL) 757 end++; 758 else { 759 end = strchr(tok->inp, '\0'); 760 if (end == tok->inp) { 761 tok->done = E_EOF; 762 return EOF; 763 } 764 } 765 if (tok->start == NULL) 766 tok->buf = tok->cur; 767 tok->line_start = tok->cur; 768 tok->lineno++; 769 tok->inp = end; 770 return Py_CHARMASK(*tok->cur++); 771 } 772 if (tok->prompt != NULL) { 773 char *new = PyOS_Readline(stdin, stdout, tok->prompt); 774 if (tok->nextprompt != NULL) 775 tok->prompt = tok->nextprompt; 776 if (new == NULL) 777 tok->done = E_INTR; 778 else if (*new == '\0') { 779 PyMem_FREE(new); 780 tok->done = E_EOF; 781 } 782#if !defined(PGEN) && defined(Py_USING_UNICODE) 783 else if (tok_stdin_decode(tok, &new) != 0) 784 PyMem_FREE(new); 785#endif 786 else if (tok->start != NULL) { 787 size_t start = tok->start - tok->buf; 788 size_t oldlen = tok->cur - tok->buf; 789 size_t newlen = oldlen + strlen(new); 790 char *buf = tok->buf; 791 PyMem_RESIZE(buf, char, newlen+1); 792 tok->lineno++; 793 if (buf == NULL) { 794 PyMem_DEL(tok->buf); 795 tok->buf = NULL; 796 PyMem_FREE(new); 797 tok->done = E_NOMEM; 798 return EOF; 799 } 800 tok->buf = buf; 801 tok->cur = tok->buf + oldlen; 802 tok->line_start 
= tok->cur; 803 strcpy(tok->buf + oldlen, new); 804 PyMem_FREE(new); 805 tok->inp = tok->buf + newlen; 806 tok->end = tok->inp + 1; 807 tok->start = tok->buf + start; 808 } 809 else { 810 tok->lineno++; 811 if (tok->buf != NULL) 812 PyMem_DEL(tok->buf); 813 tok->buf = new; 814 tok->line_start = tok->buf; 815 tok->cur = tok->buf; 816 tok->line_start = tok->buf; 817 tok->inp = strchr(tok->buf, '\0'); 818 tok->end = tok->inp + 1; 819 } 820 } 821 else { 822 int done = 0; 823 Py_ssize_t cur = 0; 824 char *pt; 825 if (tok->start == NULL) { 826 if (tok->buf == NULL) { 827 tok->buf = PyMem_NEW(char, BUFSIZ); 828 if (tok->buf == NULL) { 829 tok->done = E_NOMEM; 830 return EOF; 831 } 832 tok->end = tok->buf + BUFSIZ; 833 } 834 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 835 tok) == NULL) { 836 tok->done = E_EOF; 837 done = 1; 838 } 839 else { 840 tok->done = E_OK; 841 tok->inp = strchr(tok->buf, '\0'); 842 done = tok->inp[-1] == '\n'; 843 } 844 } 845 else { 846 cur = tok->cur - tok->buf; 847 if (decoding_feof(tok)) { 848 tok->done = E_EOF; 849 done = 1; 850 } 851 else 852 tok->done = E_OK; 853 } 854 tok->lineno++; 855 /* Read until '\n' or EOF */ 856 while (!done) { 857 Py_ssize_t curstart = tok->start == NULL ? -1 : 858 tok->start - tok->buf; 859 Py_ssize_t curvalid = tok->inp - tok->buf; 860 Py_ssize_t newsize = curvalid + BUFSIZ; 861 char *newbuf = tok->buf; 862 PyMem_RESIZE(newbuf, char, newsize); 863 if (newbuf == NULL) { 864 tok->done = E_NOMEM; 865 tok->cur = tok->inp; 866 return EOF; 867 } 868 tok->buf = newbuf; 869 tok->inp = tok->buf + curvalid; 870 tok->end = tok->buf + newsize; 871 tok->start = curstart < 0 ? 
NULL : 872 tok->buf + curstart; 873 if (decoding_fgets(tok->inp, 874 (int)(tok->end - tok->inp), 875 tok) == NULL) { 876 /* Last line does not end in \n, 877 fake one */ 878 strcpy(tok->inp, "\n"); 879 } 880 tok->inp = strchr(tok->inp, '\0'); 881 done = tok->inp[-1] == '\n'; 882 } 883 tok->cur = tok->buf + cur; 884 tok->line_start = tok->cur; 885 /* replace "\r\n" with "\n" */ 886 /* For Mac we leave the \r, giving a syntax error */ 887 pt = tok->inp - 2; 888 if (pt >= tok->buf && *pt == '\r') { 889 *pt++ = '\n'; 890 *pt = '\0'; 891 tok->inp = pt; 892 } 893 } 894 if (tok->done != E_OK) { 895 if (tok->prompt != NULL) 896 PySys_WriteStderr("\n"); 897 tok->cur = tok->inp; 898 return EOF; 899 } 900 } 901 /*NOTREACHED*/ 902} 903 904 905/* Back-up one character */ 906 907static void 908tok_backup(register struct tok_state *tok, register int c) 909{ 910 if (c != EOF) { 911 if (--tok->cur < tok->buf) 912 Py_FatalError("tok_backup: begin of buffer"); 913 if (*tok->cur != c) 914 *tok->cur = c; 915 } 916} 917 918 919/* Return the token corresponding to a single character */ 920 921int 922PyToken_OneChar(int c) 923{ 924 switch (c) { 925 case '(': return LPAR; 926 case ')': return RPAR; 927 case '[': return LSQB; 928 case ']': return RSQB; 929 case ':': return COLON; 930 case ',': return COMMA; 931 case ';': return SEMI; 932 case '+': return PLUS; 933 case '-': return MINUS; 934 case '*': return STAR; 935 case '/': return SLASH; 936 case '|': return VBAR; 937 case '&': return AMPER; 938 case '<': return LESS; 939 case '>': return GREATER; 940 case '=': return EQUAL; 941 case '.': return DOT; 942 case '%': return PERCENT; 943 case '`': return BACKQUOTE; 944 case '{': return LBRACE; 945 case '}': return RBRACE; 946 case '^': return CIRCUMFLEX; 947 case '~': return TILDE; 948 case '@': return AT; 949 default: return OP; 950 } 951} 952 953 954int 955PyToken_TwoChars(int c1, int c2) 956{ 957 switch (c1) { 958 case '=': 959 switch (c2) { 960 case '=': return EQEQUAL; 961 } 962 break; 
963 case '!': 964 switch (c2) { 965 case '=': return NOTEQUAL; 966 } 967 break; 968 case '<': 969 switch (c2) { 970 case '>': return NOTEQUAL; 971 case '=': return LESSEQUAL; 972 case '<': return LEFTSHIFT; 973 } 974 break; 975 case '>': 976 switch (c2) { 977 case '=': return GREATEREQUAL; 978 case '>': return RIGHTSHIFT; 979 } 980 break; 981 case '+': 982 switch (c2) { 983 case '=': return PLUSEQUAL; 984 } 985 break; 986 case '-': 987 switch (c2) { 988 case '=': return MINEQUAL; 989 } 990 break; 991 case '*': 992 switch (c2) { 993 case '*': return DOUBLESTAR; 994 case '=': return STAREQUAL; 995 } 996 break; 997 case '/': 998 switch (c2) { 999 case '/': return DOUBLESLASH; 1000 case '=': return SLASHEQUAL; 1001 } 1002 break; 1003 case '|': 1004 switch (c2) { 1005 case '=': return VBAREQUAL; 1006 } 1007 break; 1008 case '%': 1009 switch (c2) { 1010 case '=': return PERCENTEQUAL; 1011 } 1012 break; 1013 case '&': 1014 switch (c2) { 1015 case '=': return AMPEREQUAL; 1016 } 1017 break; 1018 case '^': 1019 switch (c2) { 1020 case '=': return CIRCUMFLEXEQUAL; 1021 } 1022 break; 1023 } 1024 return OP; 1025} 1026 1027int 1028PyToken_ThreeChars(int c1, int c2, int c3) 1029{ 1030 switch (c1) { 1031 case '<': 1032 switch (c2) { 1033 case '<': 1034 switch (c3) { 1035 case '=': 1036 return LEFTSHIFTEQUAL; 1037 } 1038 break; 1039 } 1040 break; 1041 case '>': 1042 switch (c2) { 1043 case '>': 1044 switch (c3) { 1045 case '=': 1046 return RIGHTSHIFTEQUAL; 1047 } 1048 break; 1049 } 1050 break; 1051 case '*': 1052 switch (c2) { 1053 case '*': 1054 switch (c3) { 1055 case '=': 1056 return DOUBLESTAREQUAL; 1057 } 1058 break; 1059 } 1060 break; 1061 case '/': 1062 switch (c2) { 1063 case '/': 1064 switch (c3) { 1065 case '=': 1066 return DOUBLESLASHEQUAL; 1067 } 1068 break; 1069 } 1070 break; 1071 } 1072 return OP; 1073} 1074 1075static int 1076indenterror(struct tok_state *tok) 1077{ 1078 if (tok->alterror) { 1079 tok->done = E_TABSPACE; 1080 tok->cur = tok->inp; 1081 return 1; 1082 } 
1083 if (tok->altwarning) { 1084 PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 1085 "in indentation\n", tok->filename); 1086 tok->altwarning = 0; 1087 } 1088 return 0; 1089} 1090 1091 1092/* Get next token, after space stripping etc. */ 1093 1094static int 1095tok_get(register struct tok_state *tok, char **p_start, char **p_end) 1096{ 1097 register int c; 1098 int blankline; 1099 1100 *p_start = *p_end = NULL; 1101 nextline: 1102 tok->start = NULL; 1103 blankline = 0; 1104 1105 /* Get indentation level */ 1106 if (tok->atbol) { 1107 register int col = 0; 1108 register int altcol = 0; 1109 tok->atbol = 0; 1110 for (;;) { 1111 c = tok_nextc(tok); 1112 if (c == ' ') 1113 col++, altcol++; 1114 else if (c == '\t') { 1115 col = (col/tok->tabsize + 1) * tok->tabsize; 1116 altcol = (altcol/tok->alttabsize + 1) 1117 * tok->alttabsize; 1118 } 1119 else if (c == '\014') /* Control-L (formfeed) */ 1120 col = altcol = 0; /* For Emacs users */ 1121 else 1122 break; 1123 } 1124 tok_backup(tok, c); 1125 if (c == '#' || c == '\n') { 1126 /* Lines with only whitespace and/or comments 1127 shouldn't affect the indentation and are 1128 not passed to the parser as NEWLINE tokens, 1129 except *totally* empty lines in interactive 1130 mode, which signal the end of a command group. 
*/ 1131 if (col == 0 && c == '\n' && tok->prompt != NULL) 1132 blankline = 0; /* Let it through */ 1133 else 1134 blankline = 1; /* Ignore completely */ 1135 /* We can't jump back right here since we still 1136 may need to skip to the end of a comment */ 1137 } 1138 if (!blankline && tok->level == 0) { 1139 if (col == tok->indstack[tok->indent]) { 1140 /* No change */ 1141 if (altcol != tok->altindstack[tok->indent]) { 1142 if (indenterror(tok)) 1143 return ERRORTOKEN; 1144 } 1145 } 1146 else if (col > tok->indstack[tok->indent]) { 1147 /* Indent -- always one */ 1148 if (tok->indent+1 >= MAXINDENT) { 1149 tok->done = E_TOODEEP; 1150 tok->cur = tok->inp; 1151 return ERRORTOKEN; 1152 } 1153 if (altcol <= tok->altindstack[tok->indent]) { 1154 if (indenterror(tok)) 1155 return ERRORTOKEN; 1156 } 1157 tok->pendin++; 1158 tok->indstack[++tok->indent] = col; 1159 tok->altindstack[tok->indent] = altcol; 1160 } 1161 else /* col < tok->indstack[tok->indent] */ { 1162 /* Dedent -- any number, must be consistent */ 1163 while (tok->indent > 0 && 1164 col < tok->indstack[tok->indent]) { 1165 tok->pendin--; 1166 tok->indent--; 1167 } 1168 if (col != tok->indstack[tok->indent]) { 1169 tok->done = E_DEDENT; 1170 tok->cur = tok->inp; 1171 return ERRORTOKEN; 1172 } 1173 if (altcol != tok->altindstack[tok->indent]) { 1174 if (indenterror(tok)) 1175 return ERRORTOKEN; 1176 } 1177 } 1178 } 1179 } 1180 1181 tok->start = tok->cur; 1182 1183 /* Return pending indents/dedents */ 1184 if (tok->pendin != 0) { 1185 if (tok->pendin < 0) { 1186 tok->pendin++; 1187 return DEDENT; 1188 } 1189 else { 1190 tok->pendin--; 1191 return INDENT; 1192 } 1193 } 1194 1195 again: 1196 tok->start = NULL; 1197 /* Skip spaces */ 1198 do { 1199 c = tok_nextc(tok); 1200 } while (c == ' ' || c == '\t' || c == '\014'); 1201 1202 /* Set start of current token */ 1203 tok->start = tok->cur - 1; 1204 1205 /* Skip comment, while looking for tab-setting magic */ 1206 if (c == '#') { 1207 static char *tabforms[] = { 
        /* Continuation of tok_get(): we are inside the comment-scanning
           branch, looking for editor tab-size directives embedded in the
           comment text (the array opener is above this chunk). */
        "tab-width:",                   /* Emacs */
        ":tabstop=",                    /* vim, full form */
        ":ts=",                         /* vim, abbreviated form */
        "set tabsize=",                 /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        /* Copy the rest of the comment line into cbuf (truncated to
           79 chars + NUL) so we can search it for the templates above. */
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 tp - cbuf + 1 < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                /* Directive found: the number follows the template text. */
                int newsize = atoi(tp + strlen(*cp));

                /* Only accept sane tab sizes (1..40). */
                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        /* Skip the remainder of an over-long comment line, if any. */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        /* E_EOF means a clean end of input; anything else is an error
           recorded by tok_nextc (e.g. decoding or memory failure). */
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        switch (c) {
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        /* Consume the rest of the identifier. */
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);     /* first char past the identifier */
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Inside brackets (level > 0) or on a blank line, newlines
           are not significant: restart scanning on the next line. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;      /* e.g. ".5" is a NUMBER */
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex or octal -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                /* Digits 8/9 after a leading 0: remember them -- they
                   are only legal if this turns out to be a float or
                   imaginary literal (e.g. "09.5"), not a plain int. */
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    /* "09" with no fraction/exponent: invalid octal. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* Optional long suffix on hex/octal literals. */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    /* An exponent marker must be followed by digits. */
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);     /* push back the char after the number */
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the buffer offset where a second, adjacent quote
           char would sit -- used below to detect '' / "" prefixes of
           triple-quoted strings. */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;         /* inside a triple-quoted string? */
        int tripcount = 0;      /* consecutive closing-quote chars seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    /* Newline terminates a single-quoted string: error. */
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote immediately after the first: peek one
                       more to see whether this opens a triple quote. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c); /* empty string '' or "" */
                }
                if (!triple || tripcount == 3)
                    break;      /* string closed */
            }
            else if (c == '\\') {
                tripcount = 0;
                /* Skip the escaped character; EOF after a backslash is
                   an unterminated string. */
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            /* Backslash not at end of line: error. */
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* A two-char operator matched; greedily try for three. */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}

/* Public entry point: fetch the next token from TOK.
   On success returns the token type and sets *p_start / *p_end to the
   token's text bounds inside the tokenizer buffer.  If a decoding error
   was recorded at any point (tok->decoding_erred), the result is forced
   to ERRORTOKEN and tok->done is set to E_DECODE. */
int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

#ifdef Py_DEBUG

/* Debugging helper: print a token's name, and for NAME/NUMBER/STRING/OP
   tokens also the token text delimited by [start, end). */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif