/* tokenizer.c revision 9fc1b96a19ef821174f5ce37d007b68a55b9ba67 */
1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "stringobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#endif /* PGEN */ 20 21extern char *PyOS_Readline(FILE *, FILE *, char *); 22/* Return malloc'ed string including trailing \n; 23 empty malloc'ed string for EOF; 24 NULL if interrupted */ 25 26/* Don't ever change this -- it would break the portability of Python code */ 27#define TABSIZE 8 28 29/* Convert a possibly signed character to a nonnegative int */ 30/* XXX This assumes characters are 8 bits wide */ 31#ifdef __CHAR_UNSIGNED__ 32#define Py_CHARMASK(c) (c) 33#else 34#define Py_CHARMASK(c) ((c) & 0xff) 35#endif 36 37/* Forward */ 38static struct tok_state *tok_new(void); 39static int tok_nextc(struct tok_state *tok); 40static void tok_backup(struct tok_state *tok, int c); 41 42/* Token names */ 43 44char *_PyParser_TokenNames[] = { 45 "ENDMARKER", 46 "NAME", 47 "NUMBER", 48 "STRING", 49 "NEWLINE", 50 "INDENT", 51 "DEDENT", 52 "LPAR", 53 "RPAR", 54 "LSQB", 55 "RSQB", 56 "COLON", 57 "COMMA", 58 "SEMI", 59 "PLUS", 60 "MINUS", 61 "STAR", 62 "SLASH", 63 "VBAR", 64 "AMPER", 65 "LESS", 66 "GREATER", 67 "EQUAL", 68 "DOT", 69 "PERCENT", 70 "BACKQUOTE", 71 "LBRACE", 72 "RBRACE", 73 "EQEQUAL", 74 "NOTEQUAL", 75 "LESSEQUAL", 76 "GREATEREQUAL", 77 "TILDE", 78 "CIRCUMFLEX", 79 "LEFTSHIFT", 80 "RIGHTSHIFT", 81 "DOUBLESTAR", 82 "PLUSEQUAL", 83 "MINEQUAL", 84 "STAREQUAL", 85 "SLASHEQUAL", 86 "PERCENTEQUAL", 87 "AMPEREQUAL", 88 "VBAREQUAL", 89 "CIRCUMFLEXEQUAL", 90 "LEFTSHIFTEQUAL", 91 "RIGHTSHIFTEQUAL", 92 "DOUBLESTAREQUAL", 93 "DOUBLESLASH", 94 "DOUBLESLASHEQUAL", 95 "AT", 96 /* This table must match the #defines in token.h! 
*/ 97 "OP", 98 "<ERRORTOKEN>", 99 "<N_TOKENS>" 100}; 101 102 103/* Create and initialize a new tok_state structure */ 104 105static struct tok_state * 106tok_new(void) 107{ 108 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 109 sizeof(struct tok_state)); 110 if (tok == NULL) 111 return NULL; 112 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 113 tok->done = E_OK; 114 tok->fp = NULL; 115 tok->tabsize = TABSIZE; 116 tok->indent = 0; 117 tok->indstack[0] = 0; 118 tok->atbol = 1; 119 tok->pendin = 0; 120 tok->prompt = tok->nextprompt = NULL; 121 tok->lineno = 0; 122 tok->level = 0; 123 tok->filename = NULL; 124 tok->altwarning = 0; 125 tok->alterror = 0; 126 tok->alttabsize = 1; 127 tok->altindstack[0] = 0; 128 tok->decoding_state = 0; 129 tok->decoding_erred = 0; 130 tok->read_coding_spec = 0; 131 tok->encoding = NULL; 132 tok->cont_line = 0; 133#ifndef PGEN 134 tok->decoding_readline = NULL; 135 tok->decoding_buffer = NULL; 136#endif 137 return tok; 138} 139 140#ifdef PGEN 141 142static char * 143decoding_fgets(char *s, int size, struct tok_state *tok) 144{ 145 return fgets(s, size, tok->fp); 146} 147 148static int 149decoding_feof(struct tok_state *tok) 150{ 151 return feof(tok->fp); 152} 153 154static const char * 155decode_str(const char *str, struct tok_state *tok) 156{ 157 return str; 158} 159 160#else /* PGEN */ 161 162static char * 163error_ret(struct tok_state *tok) /* XXX */ 164{ 165 tok->decoding_erred = 1; 166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 167 PyMem_FREE(tok->buf); 168 tok->buf = NULL; 169 return NULL; /* as if it were EOF */ 170} 171 172static char * 173new_string(const char *s, Py_ssize_t len) 174{ 175 char* result = (char *)PyMem_MALLOC(len + 1); 176 if (result != NULL) { 177 memcpy(result, s, len); 178 result[len] = '\0'; 179 } 180 return result; 181} 182 183static char * 184get_normal_name(char *s) /* for utf-8 and latin-1 */ 185{ 186 char buf[13]; 187 int i; 188 for (i = 0; i < 12; i++) 
{ 189 int c = s[i]; 190 if (c == '\0') break; 191 else if (c == '_') buf[i] = '-'; 192 else buf[i] = tolower(c); 193 } 194 buf[i] = '\0'; 195 if (strcmp(buf, "utf-8") == 0 || 196 strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; 197 else if (strcmp(buf, "latin-1") == 0 || 198 strcmp(buf, "iso-8859-1") == 0 || 199 strcmp(buf, "iso-latin-1") == 0 || 200 strncmp(buf, "latin-1-", 8) == 0 || 201 strncmp(buf, "iso-8859-1-", 11) == 0 || 202 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; 203 else return s; 204} 205 206/* Return the coding spec in S, or NULL if none is found. */ 207 208static char * 209get_coding_spec(const char *s, Py_ssize_t size) 210{ 211 Py_ssize_t i; 212 /* Coding spec must be in a comment, and that comment must be 213 * the only statement on the source code line. */ 214 for (i = 0; i < size - 6; i++) { 215 if (s[i] == '#') 216 break; 217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 218 return NULL; 219 } 220 for (; i < size - 6; i++) { /* XXX inefficient search */ 221 const char* t = s + i; 222 if (strncmp(t, "coding", 6) == 0) { 223 const char* begin = NULL; 224 t += 6; 225 if (t[0] != ':' && t[0] != '=') 226 continue; 227 do { 228 t++; 229 } while (t[0] == '\x20' || t[0] == '\t'); 230 231 begin = t; 232 while (isalnum(Py_CHARMASK(t[0])) || 233 t[0] == '-' || t[0] == '_' || t[0] == '.') 234 t++; 235 236 if (begin < t) { 237 char* r = new_string(begin, t - begin); 238 char* q = get_normal_name(r); 239 if (r != q) { 240 PyMem_FREE(r); 241 r = new_string(q, strlen(q)); 242 } 243 return r; 244 } 245 } 246 } 247 return NULL; 248} 249 250/* Check whether the line contains a coding spec. If it does, 251 invoke the set_readline function for the new encoding. 252 This function receives the tok_state and the new encoding. 253 Return 1 on success, 0 on failure. 
*/

/* set_readline is a callback that installs a decoding readline function
   on TOK for the given encoding name; it returns nonzero on success.
   On success here, ownership of the malloc'ed coding-spec string may be
   transferred to tok->encoding; otherwise it is freed. */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				/* Source is usable as-is: no codec needed,
				   stay in raw reading mode. */
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				/* Any other encoding requires a codec;
				   switch to decoded reading on success. */
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				else
					PyMem_FREE(cs);
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
				PyMem_FREE(cs);
#endif
			}
		} else {	/* then, compare cs with BOM */
			/* An encoding was already set (from a BOM); the
			   declared spec must agree with it. */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_FREE(cs);
		}
	}
	if (!r) {
		cs = tok->encoding;
		if (!cs)
			cs = "with BOM";
		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
	}
	return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	/* Default: raw (undecoded) reading until proven otherwise. */
	tok->decoding_state = 1;
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		/* Possible UTF-8 BOM: EF BB BF. */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.
*/ 324 } else if (ch == 0xFE) { 325 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; 326 if (!set_readline(tok, "utf-16-be")) return 0; 327 tok->decoding_state = -1; 328 } else if (ch == 0xFF) { 329 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; 330 if (!set_readline(tok, "utf-16-le")) return 0; 331 tok->decoding_state = -1; 332#endif 333 } else { 334 unget_char(ch, tok); 335 return 1; 336 } 337 if (tok->encoding != NULL) 338 PyMem_FREE(tok->encoding); 339 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 340 return 1; 341 NON_BOM: 342 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ 343 unget_char(0xFF, tok); /* XXX this will cause a syntax error */ 344 return 1; 345} 346 347/* Read a line of text from TOK into S, using the stream in TOK. 348 Return NULL on failure, else S. 349 350 On entry, tok->decoding_buffer will be one of: 351 1) NULL: need to call tok->decoding_readline to get a new line 352 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 353 stored the result in tok->decoding_buffer 354 3) PyStringObject *: previous call to fp_readl did not have enough room 355 (in the s buffer) to copy entire contents of the line read 356 by tok->decoding_readline. tok->decoding_buffer has the overflow. 357 In this case, fp_readl is called in a loop (with an expanded buffer) 358 until the buffer ends with a '\n' (or until the end of the file is 359 reached): see tok_nextc and its calls to decoding_fgets. 360*/ 361 362static char * 363fp_readl(char *s, int size, struct tok_state *tok) 364{ 365#ifndef Py_USING_UNICODE 366 /* In a non-Unicode built, this should never be called. 
*/ 367 Py_FatalError("fp_readl should not be called in this build."); 368 return NULL; /* Keep compiler happy (not reachable) */ 369#else 370 PyObject* utf8 = NULL; 371 PyObject* buf = tok->decoding_buffer; 372 char *str; 373 Py_ssize_t utf8len; 374 375 /* Ask for one less byte so we can terminate it */ 376 assert(size > 0); 377 size--; 378 379 if (buf == NULL) { 380 buf = PyObject_CallObject(tok->decoding_readline, NULL); 381 if (buf == NULL) 382 return error_ret(tok); 383 } else { 384 tok->decoding_buffer = NULL; 385 if (PyString_CheckExact(buf)) 386 utf8 = buf; 387 } 388 if (utf8 == NULL) { 389 utf8 = PyUnicode_AsUTF8String(buf); 390 Py_DECREF(buf); 391 if (utf8 == NULL) 392 return error_ret(tok); 393 } 394 str = PyString_AsString(utf8); 395 utf8len = PyString_GET_SIZE(utf8); 396 if (utf8len > size) { 397 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); 398 if (tok->decoding_buffer == NULL) { 399 Py_DECREF(utf8); 400 return error_ret(tok); 401 } 402 utf8len = size; 403 } 404 memcpy(s, str, utf8len); 405 s[utf8len] = '\0'; 406 Py_DECREF(utf8); 407 if (utf8len == 0) return NULL; /* EOF */ 408 return s; 409#endif 410} 411 412/* Set the readline function for TOK to a StreamReader's 413 readline function. The StreamReader is named ENC. 414 415 This function is called from check_bom and check_coding_spec. 416 417 ENC is usually identical to the future value of tok->encoding, 418 except for the (currently unsupported) case of UTF-16. 419 420 Return 1 on success, 0 on failure. */ 421 422static int 423fp_setreadl(struct tok_state *tok, const char* enc) 424{ 425 PyObject *reader, *stream, *readline; 426 427 /* XXX: constify filename argument. 
*/ 428 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); 429 if (stream == NULL) 430 return 0; 431 432 reader = PyCodec_StreamReader(enc, stream, NULL); 433 Py_DECREF(stream); 434 if (reader == NULL) 435 return 0; 436 437 readline = PyObject_GetAttrString(reader, "readline"); 438 Py_DECREF(reader); 439 if (readline == NULL) 440 return 0; 441 442 tok->decoding_readline = readline; 443 return 1; 444} 445 446/* Fetch the next byte from TOK. */ 447 448static int fp_getc(struct tok_state *tok) { 449 return getc(tok->fp); 450} 451 452/* Unfetch the last byte back into TOK. */ 453 454static void fp_ungetc(int c, struct tok_state *tok) { 455 ungetc(c, tok->fp); 456} 457 458/* Read a line of input from TOK. Determine encoding 459 if necessary. */ 460 461static char * 462decoding_fgets(char *s, int size, struct tok_state *tok) 463{ 464 char *line = NULL; 465 int badchar = 0; 466 for (;;) { 467 if (tok->decoding_state < 0) { 468 /* We already have a codec associated with 469 this input. */ 470 line = fp_readl(s, size, tok); 471 break; 472 } else if (tok->decoding_state > 0) { 473 /* We want a 'raw' read. */ 474 line = Py_UniversalNewlineFgets(s, size, 475 tok->fp, NULL); 476 break; 477 } else { 478 /* We have not yet determined the encoding. 479 If an encoding is found, use the file-pointer 480 reader functions from now on. */ 481 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 482 return error_ret(tok); 483 assert(tok->decoding_state != 0); 484 } 485 } 486 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 487 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 488 return error_ret(tok); 489 } 490 } 491#ifndef PGEN 492 /* The default encoding is ASCII, so make sure we don't have any 493 non-ASCII bytes in it. 
*/ 494 if (line && !tok->encoding) { 495 unsigned char *c; 496 for (c = (unsigned char *)line; *c; c++) 497 if (*c > 127) { 498 badchar = *c; 499 break; 500 } 501 } 502 if (badchar) { 503 char buf[500]; 504 /* Need to add 1 to the line number, since this line 505 has not been counted, yet. */ 506 sprintf(buf, 507 "Non-ASCII character '\\x%.2x' " 508 "in file %.200s on line %i, " 509 "but no encoding declared; " 510 "see http://www.python.org/peps/pep-0263.html for details", 511 badchar, tok->filename, tok->lineno + 1); 512 PyErr_SetString(PyExc_SyntaxError, buf); 513 return error_ret(tok); 514 } 515#endif 516 return line; 517} 518 519static int 520decoding_feof(struct tok_state *tok) 521{ 522 if (tok->decoding_state >= 0) { 523 return feof(tok->fp); 524 } else { 525 PyObject* buf = tok->decoding_buffer; 526 if (buf == NULL) { 527 buf = PyObject_CallObject(tok->decoding_readline, NULL); 528 if (buf == NULL) { 529 error_ret(tok); 530 return 1; 531 } else { 532 tok->decoding_buffer = buf; 533 } 534 } 535 return PyObject_Length(buf) == 0; 536 } 537} 538 539/* Fetch a byte from TOK, using the string buffer. */ 540 541static int 542buf_getc(struct tok_state *tok) { 543 return Py_CHARMASK(*tok->str++); 544} 545 546/* Unfetch a byte from TOK, using the string buffer. */ 547 548static void 549buf_ungetc(int c, struct tok_state *tok) { 550 tok->str--; 551 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 552} 553 554/* Set the readline function for TOK to ENC. For the string-based 555 tokenizer, this means to just record the encoding. */ 556 557static int 558buf_setreadl(struct tok_state *tok, const char* enc) { 559 tok->enc = enc; 560 return 1; 561} 562 563/* Return a UTF-8 encoding Python string object from the 564 C byte string STR, which is encoded with ENC. 
*/ 565 566#ifdef Py_USING_UNICODE 567static PyObject * 568translate_into_utf8(const char* str, const char* enc) { 569 PyObject *utf8; 570 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 571 if (buf == NULL) 572 return NULL; 573 utf8 = PyUnicode_AsUTF8String(buf); 574 Py_DECREF(buf); 575 return utf8; 576} 577#endif 578 579/* Decode a byte string STR for use as the buffer of TOK. 580 Look for encoding declarations inside STR, and record them 581 inside TOK. */ 582 583static const char * 584decode_str(const char *str, struct tok_state *tok) 585{ 586 PyObject* utf8 = NULL; 587 const char *s; 588 int lineno = 0; 589 tok->enc = NULL; 590 tok->str = str; 591 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 592 return error_ret(tok); 593 str = tok->str; /* string after BOM if any */ 594 assert(str); 595#ifdef Py_USING_UNICODE 596 if (tok->enc != NULL) { 597 utf8 = translate_into_utf8(str, tok->enc); 598 if (utf8 == NULL) 599 return error_ret(tok); 600 str = PyString_AsString(utf8); 601 } 602#endif 603 for (s = str;; s++) { 604 if (*s == '\0') break; 605 else if (*s == '\n') { 606 lineno++; 607 if (lineno == 2) break; 608 } 609 } 610 tok->enc = NULL; 611 if (!check_coding_spec(str, s - str, tok, buf_setreadl)) 612 return error_ret(tok); 613#ifdef Py_USING_UNICODE 614 if (tok->enc != NULL) { 615 assert(utf8 == NULL); 616 utf8 = translate_into_utf8(str, tok->enc); 617 if (utf8 == NULL) { 618 PyErr_Format(PyExc_SyntaxError, 619 "unknown encoding: %s", tok->enc); 620 return error_ret(tok); 621 } 622 str = PyString_AsString(utf8); 623 } 624#endif 625 assert(tok->decoding_buffer == NULL); 626 tok->decoding_buffer = utf8; /* CAUTION */ 627 return str; 628} 629 630#endif /* PGEN */ 631 632/* Set up tokenizer for string */ 633 634struct tok_state * 635PyTokenizer_FromString(const char *str) 636{ 637 struct tok_state *tok = tok_new(); 638 if (tok == NULL) 639 return NULL; 640 str = (char *)decode_str(str, tok); 641 if (str == NULL) { 642 PyTokenizer_Free(tok); 
643 return NULL; 644 } 645 646 /* XXX: constify members. */ 647 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 648 return tok; 649} 650 651 652/* Set up tokenizer for file */ 653 654struct tok_state * 655PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) 656{ 657 struct tok_state *tok = tok_new(); 658 if (tok == NULL) 659 return NULL; 660 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { 661 PyTokenizer_Free(tok); 662 return NULL; 663 } 664 tok->cur = tok->inp = tok->buf; 665 tok->end = tok->buf + BUFSIZ; 666 tok->fp = fp; 667 tok->prompt = ps1; 668 tok->nextprompt = ps2; 669 return tok; 670} 671 672 673/* Free a tok_state structure */ 674 675void 676PyTokenizer_Free(struct tok_state *tok) 677{ 678 if (tok->encoding != NULL) 679 PyMem_FREE(tok->encoding); 680#ifndef PGEN 681 Py_XDECREF(tok->decoding_readline); 682 Py_XDECREF(tok->decoding_buffer); 683#endif 684 if (tok->fp != NULL && tok->buf != NULL) 685 PyMem_FREE(tok->buf); 686 PyMem_FREE(tok); 687} 688 689#if !defined(PGEN) && defined(Py_USING_UNICODE) 690static int 691tok_stdin_decode(struct tok_state *tok, char **inp) 692{ 693 PyObject *enc, *sysstdin, *decoded, *utf8; 694 const char *encoding; 695 char *converted; 696 697 if (PySys_GetFile((char *)"stdin", NULL) != stdin) 698 return 0; 699 sysstdin = PySys_GetObject("stdin"); 700 if (sysstdin == NULL || !PyFile_Check(sysstdin)) 701 return 0; 702 703 enc = ((PyFileObject *)sysstdin)->f_encoding; 704 if (enc == NULL || !PyString_Check(enc)) 705 return 0; 706 Py_INCREF(enc); 707 708 encoding = PyString_AsString(enc); 709 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); 710 if (decoded == NULL) 711 goto error_clear; 712 713 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); 714 Py_DECREF(decoded); 715 if (utf8 == NULL) 716 goto error_clear; 717 718 assert(PyString_Check(utf8)); 719 converted = new_string(PyString_AS_STRING(utf8), 720 PyString_GET_SIZE(utf8)); 721 Py_DECREF(utf8); 722 if (converted == NULL) 723 goto 
error_nomem; 724 725 PyMem_FREE(*inp); 726 *inp = converted; 727 if (tok->encoding != NULL) 728 PyMem_FREE(tok->encoding); 729 tok->encoding = new_string(encoding, strlen(encoding)); 730 if (tok->encoding == NULL) 731 goto error_nomem; 732 733 Py_DECREF(enc); 734 return 0; 735 736error_nomem: 737 Py_DECREF(enc); 738 tok->done = E_NOMEM; 739 return -1; 740 741error_clear: 742 /* Fallback to iso-8859-1: for backward compatibility */ 743 Py_DECREF(enc); 744 PyErr_Clear(); 745 return 0; 746} 747#endif 748 749/* Get next char, updating state; error code goes into tok->done */ 750 751static int 752tok_nextc(register struct tok_state *tok) 753{ 754 for (;;) { 755 if (tok->cur != tok->inp) { 756 return Py_CHARMASK(*tok->cur++); /* Fast path */ 757 } 758 if (tok->done != E_OK) 759 return EOF; 760 if (tok->fp == NULL) { 761 char *end = strchr(tok->inp, '\n'); 762 if (end != NULL) 763 end++; 764 else { 765 end = strchr(tok->inp, '\0'); 766 if (end == tok->inp) { 767 tok->done = E_EOF; 768 return EOF; 769 } 770 } 771 if (tok->start == NULL) 772 tok->buf = tok->cur; 773 tok->line_start = tok->cur; 774 tok->lineno++; 775 tok->inp = end; 776 return Py_CHARMASK(*tok->cur++); 777 } 778 if (tok->prompt != NULL) { 779 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); 780 if (tok->nextprompt != NULL) 781 tok->prompt = tok->nextprompt; 782 if (newtok == NULL) 783 tok->done = E_INTR; 784 else if (*newtok == '\0') { 785 PyMem_FREE(newtok); 786 tok->done = E_EOF; 787 } 788#if !defined(PGEN) && defined(Py_USING_UNICODE) 789 else if (tok_stdin_decode(tok, &newtok) != 0) 790 PyMem_FREE(newtok); 791#endif 792 else if (tok->start != NULL) { 793 size_t start = tok->start - tok->buf; 794 size_t oldlen = tok->cur - tok->buf; 795 size_t newlen = oldlen + strlen(newtok); 796 char *buf = tok->buf; 797 buf = (char *)PyMem_REALLOC(buf, newlen+1); 798 tok->lineno++; 799 if (buf == NULL) { 800 PyMem_FREE(tok->buf); 801 tok->buf = NULL; 802 PyMem_FREE(newtok); 803 tok->done = E_NOMEM; 804 return 
EOF; 805 } 806 tok->buf = buf; 807 tok->cur = tok->buf + oldlen; 808 tok->line_start = tok->cur; 809 strcpy(tok->buf + oldlen, newtok); 810 PyMem_FREE(newtok); 811 tok->inp = tok->buf + newlen; 812 tok->end = tok->inp + 1; 813 tok->start = tok->buf + start; 814 } 815 else { 816 tok->lineno++; 817 if (tok->buf != NULL) 818 PyMem_FREE(tok->buf); 819 tok->buf = newtok; 820 tok->line_start = tok->buf; 821 tok->cur = tok->buf; 822 tok->line_start = tok->buf; 823 tok->inp = strchr(tok->buf, '\0'); 824 tok->end = tok->inp + 1; 825 } 826 } 827 else { 828 int done = 0; 829 Py_ssize_t cur = 0; 830 char *pt; 831 if (tok->start == NULL) { 832 if (tok->buf == NULL) { 833 tok->buf = (char *) 834 PyMem_MALLOC(BUFSIZ); 835 if (tok->buf == NULL) { 836 tok->done = E_NOMEM; 837 return EOF; 838 } 839 tok->end = tok->buf + BUFSIZ; 840 } 841 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 842 tok) == NULL) { 843 tok->done = E_EOF; 844 done = 1; 845 } 846 else { 847 tok->done = E_OK; 848 tok->inp = strchr(tok->buf, '\0'); 849 done = tok->inp[-1] == '\n'; 850 } 851 } 852 else { 853 cur = tok->cur - tok->buf; 854 if (decoding_feof(tok)) { 855 tok->done = E_EOF; 856 done = 1; 857 } 858 else 859 tok->done = E_OK; 860 } 861 tok->lineno++; 862 /* Read until '\n' or EOF */ 863 while (!done) { 864 Py_ssize_t curstart = tok->start == NULL ? -1 : 865 tok->start - tok->buf; 866 Py_ssize_t curvalid = tok->inp - tok->buf; 867 Py_ssize_t newsize = curvalid + BUFSIZ; 868 char *newbuf = tok->buf; 869 newbuf = (char *)PyMem_REALLOC(newbuf, 870 newsize); 871 if (newbuf == NULL) { 872 tok->done = E_NOMEM; 873 tok->cur = tok->inp; 874 return EOF; 875 } 876 tok->buf = newbuf; 877 tok->inp = tok->buf + curvalid; 878 tok->end = tok->buf + newsize; 879 tok->start = curstart < 0 ? 
						NULL :
						tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Break out early on decoding
					   errors, as tok->buf will be NULL
					*/
					if (tok->decoding_erred)
						return EOF;
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			if (tok->buf != NULL) {
				/* Restore the current position, which the
				   reallocations above may have moved. */
				tok->cur = tok->buf + cur;
				tok->line_start = tok->cur;
				/* replace "\r\n" with "\n" */
				/* For Mac leave the \r, giving a syntax error */
				pt = tok->inp - 2;
				if (pt >= tok->buf && *pt == '\r') {
					*pt++ = '\n';
					*pt = '\0';
					tok->inp = pt;
				}
			}
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}


/* Back-up one character.
   Moves tok->cur back one byte so C is re-read by the next tok_nextc
   call; a no-op for EOF.  Fatal error if already at the start of the
   buffer.  The write of C is skipped when the byte already matches,
   since tok->cur may point into a read-only segment. */

static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		if (*tok->cur != c)
			*tok->cur = c;
	}
}


/* Return the token corresponding to a single character;
   OP for any character that is not a recognized punctuation token. */

int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	case '@':	return AT;
	default:	return OP;
	}
966} 967 968 969int 970PyToken_TwoChars(int c1, int c2) 971{ 972 switch (c1) { 973 case '=': 974 switch (c2) { 975 case '=': return EQEQUAL; 976 } 977 break; 978 case '!': 979 switch (c2) { 980 case '=': return NOTEQUAL; 981 } 982 break; 983 case '<': 984 switch (c2) { 985 case '>': return NOTEQUAL; 986 case '=': return LESSEQUAL; 987 case '<': return LEFTSHIFT; 988 } 989 break; 990 case '>': 991 switch (c2) { 992 case '=': return GREATEREQUAL; 993 case '>': return RIGHTSHIFT; 994 } 995 break; 996 case '+': 997 switch (c2) { 998 case '=': return PLUSEQUAL; 999 } 1000 break; 1001 case '-': 1002 switch (c2) { 1003 case '=': return MINEQUAL; 1004 } 1005 break; 1006 case '*': 1007 switch (c2) { 1008 case '*': return DOUBLESTAR; 1009 case '=': return STAREQUAL; 1010 } 1011 break; 1012 case '/': 1013 switch (c2) { 1014 case '/': return DOUBLESLASH; 1015 case '=': return SLASHEQUAL; 1016 } 1017 break; 1018 case '|': 1019 switch (c2) { 1020 case '=': return VBAREQUAL; 1021 } 1022 break; 1023 case '%': 1024 switch (c2) { 1025 case '=': return PERCENTEQUAL; 1026 } 1027 break; 1028 case '&': 1029 switch (c2) { 1030 case '=': return AMPEREQUAL; 1031 } 1032 break; 1033 case '^': 1034 switch (c2) { 1035 case '=': return CIRCUMFLEXEQUAL; 1036 } 1037 break; 1038 } 1039 return OP; 1040} 1041 1042int 1043PyToken_ThreeChars(int c1, int c2, int c3) 1044{ 1045 switch (c1) { 1046 case '<': 1047 switch (c2) { 1048 case '<': 1049 switch (c3) { 1050 case '=': 1051 return LEFTSHIFTEQUAL; 1052 } 1053 break; 1054 } 1055 break; 1056 case '>': 1057 switch (c2) { 1058 case '>': 1059 switch (c3) { 1060 case '=': 1061 return RIGHTSHIFTEQUAL; 1062 } 1063 break; 1064 } 1065 break; 1066 case '*': 1067 switch (c2) { 1068 case '*': 1069 switch (c3) { 1070 case '=': 1071 return DOUBLESTAREQUAL; 1072 } 1073 break; 1074 } 1075 break; 1076 case '/': 1077 switch (c2) { 1078 case '/': 1079 switch (c3) { 1080 case '=': 1081 return DOUBLESLASHEQUAL; 1082 } 1083 break; 1084 } 1085 break; 1086 } 1087 return OP; 
}

/* Report an inconsistent tab/space indentation problem.
   Returns 1 (and sets tok->done = E_TABSPACE) when alternate-tab
   checking is in error mode; otherwise prints at most one warning
   to stderr and returns 0. */
static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                                  "in indentation\n", tok->filename);
		/* Warn only once per tokenizer. */
		tok->altwarning = 0;
	}
	return 0;
}


/* Get next token, after space stripping etc. */

/* On success, *p_start/*p_end delimit the token text inside the
   tokenizer's buffer; the token type is the return value. */
static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
	register int c;
	int blankline;

	*p_start = *p_end = NULL;
  nextline:
	tok->start = NULL;
	blankline = 0;

	/* Get indentation level */
	if (tok->atbol) {
		/* col counts with the configurable tab size; altcol counts
		   with the alternate tab size (alttabsize) so tab/space
		   inconsistencies can be detected by comparing the two. */
		register int col = 0;
		register int altcol = 0;
		tok->atbol = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == ' ')
				col++, altcol++;
			else if (c == '\t') {
				col = (col/tok->tabsize + 1) * tok->tabsize;
				altcol = (altcol/tok->alttabsize + 1)
					* tok->alttabsize;
			}
			else if (c == '\014') /* Control-L (formfeed) */
				col = altcol = 0; /* For Emacs users */
			else
				break;
		}
		/* c is the first non-indentation character; re-read it. */
		tok_backup(tok, c);
		if (c == '#' || c == '\n') {
			/* Lines with only whitespace and/or comments
			   shouldn't affect the indentation and are
			   not passed to the parser as NEWLINE tokens,
			   except *totally* empty lines in interactive
			   mode, which signal the end of a command group.
*/ 1146 if (col == 0 && c == '\n' && tok->prompt != NULL) 1147 blankline = 0; /* Let it through */ 1148 else 1149 blankline = 1; /* Ignore completely */ 1150 /* We can't jump back right here since we still 1151 may need to skip to the end of a comment */ 1152 } 1153 if (!blankline && tok->level == 0) { 1154 if (col == tok->indstack[tok->indent]) { 1155 /* No change */ 1156 if (altcol != tok->altindstack[tok->indent]) { 1157 if (indenterror(tok)) 1158 return ERRORTOKEN; 1159 } 1160 } 1161 else if (col > tok->indstack[tok->indent]) { 1162 /* Indent -- always one */ 1163 if (tok->indent+1 >= MAXINDENT) { 1164 tok->done = E_TOODEEP; 1165 tok->cur = tok->inp; 1166 return ERRORTOKEN; 1167 } 1168 if (altcol <= tok->altindstack[tok->indent]) { 1169 if (indenterror(tok)) 1170 return ERRORTOKEN; 1171 } 1172 tok->pendin++; 1173 tok->indstack[++tok->indent] = col; 1174 tok->altindstack[tok->indent] = altcol; 1175 } 1176 else /* col < tok->indstack[tok->indent] */ { 1177 /* Dedent -- any number, must be consistent */ 1178 while (tok->indent > 0 && 1179 col < tok->indstack[tok->indent]) { 1180 tok->pendin--; 1181 tok->indent--; 1182 } 1183 if (col != tok->indstack[tok->indent]) { 1184 tok->done = E_DEDENT; 1185 tok->cur = tok->inp; 1186 return ERRORTOKEN; 1187 } 1188 if (altcol != tok->altindstack[tok->indent]) { 1189 if (indenterror(tok)) 1190 return ERRORTOKEN; 1191 } 1192 } 1193 } 1194 } 1195 1196 tok->start = tok->cur; 1197 1198 /* Return pending indents/dedents */ 1199 if (tok->pendin != 0) { 1200 if (tok->pendin < 0) { 1201 tok->pendin++; 1202 return DEDENT; 1203 } 1204 else { 1205 tok->pendin--; 1206 return INDENT; 1207 } 1208 } 1209 1210 again: 1211 tok->start = NULL; 1212 /* Skip spaces */ 1213 do { 1214 c = tok_nextc(tok); 1215 } while (c == ' ' || c == '\t' || c == '\014'); 1216 1217 /* Set start of current token */ 1218 tok->start = tok->cur - 1; 1219 1220 /* Skip comment, while looking for tab-setting magic */ 1221 if (c == '#') { 1222 static char *tabforms[] = { 
1223 "tab-width:", /* Emacs */ 1224 ":tabstop=", /* vim, full form */ 1225 ":ts=", /* vim, abbreviated form */ 1226 "set tabsize=", /* will vi never die? */ 1227 /* more templates can be added here to support other editors */ 1228 }; 1229 char cbuf[80]; 1230 char *tp, **cp; 1231 tp = cbuf; 1232 do { 1233 *tp++ = c = tok_nextc(tok); 1234 } while (c != EOF && c != '\n' && 1235 (size_t)(tp - cbuf + 1) < sizeof(cbuf)); 1236 *tp = '\0'; 1237 for (cp = tabforms; 1238 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); 1239 cp++) { 1240 if ((tp = strstr(cbuf, *cp))) { 1241 int newsize = atoi(tp + strlen(*cp)); 1242 1243 if (newsize >= 1 && newsize <= 40) { 1244 tok->tabsize = newsize; 1245 if (Py_VerboseFlag) 1246 PySys_WriteStderr( 1247 "Tab size set to %d\n", 1248 newsize); 1249 } 1250 } 1251 } 1252 while (c != EOF && c != '\n') 1253 c = tok_nextc(tok); 1254 } 1255 1256 /* Check for EOF and errors now */ 1257 if (c == EOF) { 1258 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 1259 } 1260 1261 /* Identifier (most frequent token!) */ 1262 if (isalpha(c) || c == '_') { 1263 /* Process r"", u"" and ur"" */ 1264 switch (c) { 1265 case 'r': 1266 case 'R': 1267 c = tok_nextc(tok); 1268 if (c == '"' || c == '\'') 1269 goto letter_quote; 1270 break; 1271 case 'u': 1272 case 'U': 1273 c = tok_nextc(tok); 1274 if (c == 'r' || c == 'R') 1275 c = tok_nextc(tok); 1276 if (c == '"' || c == '\'') 1277 goto letter_quote; 1278 break; 1279 } 1280 while (isalnum(c) || c == '_') { 1281 c = tok_nextc(tok); 1282 } 1283 tok_backup(tok, c); 1284 *p_start = tok->start; 1285 *p_end = tok->cur; 1286 return NAME; 1287 } 1288 1289 /* Newline */ 1290 if (c == '\n') { 1291 tok->atbol = 1; 1292 if (blankline || tok->level > 0) 1293 goto nextline; 1294 *p_start = tok->start; 1295 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 1296 tok->cont_line = 0; 1297 return NEWLINE; 1298 } 1299 1300 /* Period or number starting with period? 
*/ 1301 if (c == '.') { 1302 c = tok_nextc(tok); 1303 if (isdigit(c)) { 1304 goto fraction; 1305 } 1306 else { 1307 tok_backup(tok, c); 1308 *p_start = tok->start; 1309 *p_end = tok->cur; 1310 return DOT; 1311 } 1312 } 1313 1314 /* Number */ 1315 if (isdigit(c)) { 1316 if (c == '0') { 1317 /* Hex or octal -- maybe. */ 1318 c = tok_nextc(tok); 1319 if (c == '.') 1320 goto fraction; 1321#ifndef WITHOUT_COMPLEX 1322 if (c == 'j' || c == 'J') 1323 goto imaginary; 1324#endif 1325 if (c == 'x' || c == 'X') { 1326 /* Hex */ 1327 do { 1328 c = tok_nextc(tok); 1329 } while (isxdigit(c)); 1330 } 1331 else { 1332 int found_decimal = 0; 1333 /* Octal; c is first char of it */ 1334 /* There's no 'isoctdigit' macro, sigh */ 1335 while ('0' <= c && c < '8') { 1336 c = tok_nextc(tok); 1337 } 1338 if (isdigit(c)) { 1339 found_decimal = 1; 1340 do { 1341 c = tok_nextc(tok); 1342 } while (isdigit(c)); 1343 } 1344 if (c == '.') 1345 goto fraction; 1346 else if (c == 'e' || c == 'E') 1347 goto exponent; 1348#ifndef WITHOUT_COMPLEX 1349 else if (c == 'j' || c == 'J') 1350 goto imaginary; 1351#endif 1352 else if (found_decimal) { 1353 tok->done = E_TOKEN; 1354 tok_backup(tok, c); 1355 return ERRORTOKEN; 1356 } 1357 } 1358 if (c == 'l' || c == 'L') 1359 c = tok_nextc(tok); 1360 } 1361 else { 1362 /* Decimal */ 1363 do { 1364 c = tok_nextc(tok); 1365 } while (isdigit(c)); 1366 if (c == 'l' || c == 'L') 1367 c = tok_nextc(tok); 1368 else { 1369 /* Accept floating point numbers. 
*/ 1370 if (c == '.') { 1371 fraction: 1372 /* Fraction */ 1373 do { 1374 c = tok_nextc(tok); 1375 } while (isdigit(c)); 1376 } 1377 if (c == 'e' || c == 'E') { 1378 exponent: 1379 /* Exponent part */ 1380 c = tok_nextc(tok); 1381 if (c == '+' || c == '-') 1382 c = tok_nextc(tok); 1383 if (!isdigit(c)) { 1384 tok->done = E_TOKEN; 1385 tok_backup(tok, c); 1386 return ERRORTOKEN; 1387 } 1388 do { 1389 c = tok_nextc(tok); 1390 } while (isdigit(c)); 1391 } 1392#ifndef WITHOUT_COMPLEX 1393 if (c == 'j' || c == 'J') 1394 /* Imaginary part */ 1395 imaginary: 1396 c = tok_nextc(tok); 1397#endif 1398 } 1399 } 1400 tok_backup(tok, c); 1401 *p_start = tok->start; 1402 *p_end = tok->cur; 1403 return NUMBER; 1404 } 1405 1406 letter_quote: 1407 /* String */ 1408 if (c == '\'' || c == '"') { 1409 Py_ssize_t quote2 = tok->cur - tok->start + 1; 1410 int quote = c; 1411 int triple = 0; 1412 int tripcount = 0; 1413 for (;;) { 1414 c = tok_nextc(tok); 1415 if (c == '\n') { 1416 if (!triple) { 1417 tok->done = E_EOLS; 1418 tok_backup(tok, c); 1419 return ERRORTOKEN; 1420 } 1421 tripcount = 0; 1422 tok->cont_line = 1; /* multiline string. 
*/ 1423 } 1424 else if (c == EOF) { 1425 if (triple) 1426 tok->done = E_EOFS; 1427 else 1428 tok->done = E_EOLS; 1429 tok->cur = tok->inp; 1430 return ERRORTOKEN; 1431 } 1432 else if (c == quote) { 1433 tripcount++; 1434 if (tok->cur - tok->start == quote2) { 1435 c = tok_nextc(tok); 1436 if (c == quote) { 1437 triple = 1; 1438 tripcount = 0; 1439 continue; 1440 } 1441 tok_backup(tok, c); 1442 } 1443 if (!triple || tripcount == 3) 1444 break; 1445 } 1446 else if (c == '\\') { 1447 tripcount = 0; 1448 c = tok_nextc(tok); 1449 if (c == EOF) { 1450 tok->done = E_EOLS; 1451 tok->cur = tok->inp; 1452 return ERRORTOKEN; 1453 } 1454 } 1455 else 1456 tripcount = 0; 1457 } 1458 *p_start = tok->start; 1459 *p_end = tok->cur; 1460 return STRING; 1461 } 1462 1463 /* Line continuation */ 1464 if (c == '\\') { 1465 c = tok_nextc(tok); 1466 if (c != '\n') { 1467 tok->done = E_LINECONT; 1468 tok->cur = tok->inp; 1469 return ERRORTOKEN; 1470 } 1471 tok->cont_line = 1; 1472 goto again; /* Read next line */ 1473 } 1474 1475 /* Check for two-character token */ 1476 { 1477 int c2 = tok_nextc(tok); 1478 int token = PyToken_TwoChars(c, c2); 1479 if (token != OP) { 1480 int c3 = tok_nextc(tok); 1481 int token3 = PyToken_ThreeChars(c, c2, c3); 1482 if (token3 != OP) { 1483 token = token3; 1484 } else { 1485 tok_backup(tok, c3); 1486 } 1487 *p_start = tok->start; 1488 *p_end = tok->cur; 1489 return token; 1490 } 1491 tok_backup(tok, c2); 1492 } 1493 1494 /* Keep track of parentheses nesting level */ 1495 switch (c) { 1496 case '(': 1497 case '[': 1498 case '{': 1499 tok->level++; 1500 break; 1501 case ')': 1502 case ']': 1503 case '}': 1504 tok->level--; 1505 break; 1506 } 1507 1508 /* Punctuation character */ 1509 *p_start = tok->start; 1510 *p_end = tok->cur; 1511 return PyToken_OneChar(c); 1512} 1513 1514int 1515PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1516{ 1517 int result = tok_get(tok, p_start, p_end); 1518 if (tok->decoding_erred) { 1519 result = 
ERRORTOKEN; 1520 tok->done = E_DECODE; 1521 } 1522 return result; 1523} 1524 1525/* This function is only called from parsetok. However, it cannot live 1526 there, as it must be empty for PGEN, and we can check for PGEN only 1527 in this file. */ 1528 1529#ifdef PGEN 1530char* 1531PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) 1532{ 1533 return NULL; 1534} 1535#else 1536static PyObject * 1537dec_utf8(const char *enc, const char *text, size_t len) { 1538 PyObject *ret = NULL; 1539 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); 1540 if (unicode_text) { 1541 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); 1542 Py_DECREF(unicode_text); 1543 } 1544 if (!ret) { 1545 PyErr_Clear(); 1546 } 1547 return ret; 1548} 1549 1550char * 1551PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) 1552{ 1553 char *text = NULL; 1554 if (tok->encoding) { 1555 /* convert source to original encondig */ 1556 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); 1557 if (lineobj != NULL) { 1558 int linelen = PyString_Size(lineobj); 1559 const char *line = PyString_AsString(lineobj); 1560 text = PyObject_MALLOC(linelen + 1); 1561 if (text != NULL && line != NULL) { 1562 if (linelen) 1563 strncpy(text, line, linelen); 1564 text[linelen] = '\0'; 1565 } 1566 Py_DECREF(lineobj); 1567 1568 /* adjust error offset */ 1569 if (*offset > 1) { 1570 PyObject *offsetobj = dec_utf8(tok->encoding, 1571 tok->buf, *offset-1); 1572 if (offsetobj) { 1573 *offset = PyString_Size(offsetobj) + 1; 1574 Py_DECREF(offsetobj); 1575 } 1576 } 1577 1578 } 1579 } 1580 return text; 1581 1582} 1583#endif 1584 1585 1586 1587#ifdef Py_DEBUG 1588 1589void 1590tok_dump(int type, char *start, char *end) 1591{ 1592 printf("%s", _PyParser_TokenNames[type]); 1593 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1594 printf("(%.*s)", (int)(end - start), start); 1595} 1596 1597#endif 1598