tokenizer.c revision dee2fd54481b311ad831ac455a9192bdc0f147e3
1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "stringobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#endif /* PGEN */ 20 21extern char *PyOS_Readline(FILE *, FILE *, char *); 22/* Return malloc'ed string including trailing \n; 23 empty malloc'ed string for EOF; 24 NULL if interrupted */ 25 26/* Don't ever change this -- it would break the portability of Python code */ 27#define TABSIZE 8 28 29/* Convert a possibly signed character to a nonnegative int */ 30/* XXX This assumes characters are 8 bits wide */ 31#ifdef __CHAR_UNSIGNED__ 32#define Py_CHARMASK(c) (c) 33#else 34#define Py_CHARMASK(c) ((c) & 0xff) 35#endif 36 37/* Forward */ 38static struct tok_state *tok_new(void); 39static int tok_nextc(struct tok_state *tok); 40static void tok_backup(struct tok_state *tok, int c); 41 42/* Token names */ 43 44char *_PyParser_TokenNames[] = { 45 "ENDMARKER", 46 "NAME", 47 "NUMBER", 48 "STRING", 49 "NEWLINE", 50 "INDENT", 51 "DEDENT", 52 "LPAR", 53 "RPAR", 54 "LSQB", 55 "RSQB", 56 "COLON", 57 "COMMA", 58 "SEMI", 59 "PLUS", 60 "MINUS", 61 "STAR", 62 "SLASH", 63 "VBAR", 64 "AMPER", 65 "LESS", 66 "GREATER", 67 "EQUAL", 68 "DOT", 69 "PERCENT", 70 "BACKQUOTE", 71 "LBRACE", 72 "RBRACE", 73 "EQEQUAL", 74 "NOTEQUAL", 75 "LESSEQUAL", 76 "GREATEREQUAL", 77 "TILDE", 78 "CIRCUMFLEX", 79 "LEFTSHIFT", 80 "RIGHTSHIFT", 81 "DOUBLESTAR", 82 "PLUSEQUAL", 83 "MINEQUAL", 84 "STAREQUAL", 85 "SLASHEQUAL", 86 "PERCENTEQUAL", 87 "AMPEREQUAL", 88 "VBAREQUAL", 89 "CIRCUMFLEXEQUAL", 90 "LEFTSHIFTEQUAL", 91 "RIGHTSHIFTEQUAL", 92 "DOUBLESTAREQUAL", 93 "DOUBLESLASH", 94 "DOUBLESLASHEQUAL", 95 "AT", 96 /* This table must match the #defines in token.h! 
*/ 97 "OP", 98 "<ERRORTOKEN>", 99 "<N_TOKENS>" 100}; 101 102 103/* Create and initialize a new tok_state structure */ 104 105static struct tok_state * 106tok_new(void) 107{ 108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1); 109 if (tok == NULL) 110 return NULL; 111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 112 tok->done = E_OK; 113 tok->fp = NULL; 114 tok->tabsize = TABSIZE; 115 tok->indent = 0; 116 tok->indstack[0] = 0; 117 tok->atbol = 1; 118 tok->pendin = 0; 119 tok->prompt = tok->nextprompt = NULL; 120 tok->lineno = 0; 121 tok->level = 0; 122 tok->filename = NULL; 123 tok->altwarning = 0; 124 tok->alterror = 0; 125 tok->alttabsize = 1; 126 tok->altindstack[0] = 0; 127 tok->decoding_state = 0; 128 tok->decoding_erred = 0; 129 tok->read_coding_spec = 0; 130 tok->issued_encoding_warning = 0; 131 tok->encoding = NULL; 132 tok->cont_line = 0; 133#ifndef PGEN 134 tok->decoding_readline = NULL; 135 tok->decoding_buffer = NULL; 136#endif 137 return tok; 138} 139 140#ifdef PGEN 141 142static char * 143decoding_fgets(char *s, int size, struct tok_state *tok) 144{ 145 return fgets(s, size, tok->fp); 146} 147 148static int 149decoding_feof(struct tok_state *tok) 150{ 151 return feof(tok->fp); 152} 153 154static const char * 155decode_str(const char *str, struct tok_state *tok) 156{ 157 return str; 158} 159 160#else /* PGEN */ 161 162static char * 163error_ret(struct tok_state *tok) /* XXX */ 164{ 165 tok->decoding_erred = 1; 166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 167 PyMem_DEL(tok->buf); 168 tok->buf = NULL; 169 return NULL; /* as if it were EOF */ 170} 171 172static char * 173new_string(const char *s, int len) 174{ 175 char* result = PyMem_NEW(char, len + 1); 176 if (result != NULL) { 177 memcpy(result, s, len); 178 result[len] = '\0'; 179 } 180 return result; 181} 182 183static char * 184get_normal_name(char *s) /* for utf-8 and latin-1 */ 185{ 186 char buf[13]; 187 int i; 188 for (i = 0; i < 12; i++) { 189 
int c = s[i]; 190 if (c == '\0') break; 191 else if (c == '_') buf[i] = '-'; 192 else buf[i] = tolower(c); 193 } 194 buf[i] = '\0'; 195 if (strcmp(buf, "utf-8") == 0 || 196 strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; 197 else if (strcmp(buf, "latin-1") == 0 || 198 strcmp(buf, "iso-8859-1") == 0 || 199 strcmp(buf, "iso-latin-1") == 0 || 200 strncmp(buf, "latin-1-", 8) == 0 || 201 strncmp(buf, "iso-8859-1-", 11) == 0 || 202 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; 203 else return s; 204} 205 206/* Return the coding spec in S, or NULL if none is found. */ 207 208static char * 209get_coding_spec(const char *s, int size) 210{ 211 int i; 212 /* Coding spec must be in a comment, and that comment must be 213 * the only statement on the source code line. */ 214 for (i = 0; i < size - 6; i++) { 215 if (s[i] == '#') 216 break; 217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 218 return NULL; 219 } 220 for (; i < size - 6; i++) { /* XXX inefficient search */ 221 const char* t = s + i; 222 if (strncmp(t, "coding", 6) == 0) { 223 const char* begin = NULL; 224 t += 6; 225 if (t[0] != ':' && t[0] != '=') 226 continue; 227 do { 228 t++; 229 } while (t[0] == '\x20' || t[0] == '\t'); 230 231 begin = t; 232 while (isalnum((int)t[0]) || 233 t[0] == '-' || t[0] == '_' || t[0] == '.') 234 t++; 235 236 if (begin < t) { 237 char* r = new_string(begin, t - begin); 238 char* q = get_normal_name(r); 239 if (r != q) { 240 PyMem_DEL(r); 241 r = new_string(q, strlen(q)); 242 } 243 return r; 244 } 245 } 246 } 247 return NULL; 248} 249 250/* Check whether the line contains a coding spec. If it does, 251 invoke the set_readline function for the new encoding. 252 This function receives the tok_state and the new encoding. 253 Return 1 on success, 0 on failure. 
*/ 254 255static int 256check_coding_spec(const char* line, int size, struct tok_state *tok, 257 int set_readline(struct tok_state *, const char *)) 258{ 259 char * cs; 260 int r = 1; 261 262 if (tok->cont_line) 263 /* It's a continuation line, so it can't be a coding spec. */ 264 return 1; 265 cs = get_coding_spec(line, size); 266 if (cs != NULL) { 267 tok->read_coding_spec = 1; 268 if (tok->encoding == NULL) { 269 assert(tok->decoding_state == 1); /* raw */ 270 if (strcmp(cs, "utf-8") == 0 || 271 strcmp(cs, "iso-8859-1") == 0) { 272 tok->encoding = cs; 273 } else { 274#ifdef Py_USING_UNICODE 275 r = set_readline(tok, cs); 276 if (r) { 277 tok->encoding = cs; 278 tok->decoding_state = -1; 279 } 280 else 281 PyMem_DEL(cs); 282#else 283 /* Without Unicode support, we cannot 284 process the coding spec. Since there 285 won't be any Unicode literals, that 286 won't matter. */ 287 PyMem_DEL(cs); 288#endif 289 } 290 } else { /* then, compare cs with BOM */ 291 r = (strcmp(tok->encoding, cs) == 0); 292 PyMem_DEL(cs); 293 } 294 } 295 return r; 296} 297 298/* See whether the file starts with a BOM. If it does, 299 invoke the set_readline function with the new encoding. 300 Return 1 on success, 0 on failure. */ 301 302static int 303check_bom(int get_char(struct tok_state *), 304 void unget_char(int, struct tok_state *), 305 int set_readline(struct tok_state *, const char *), 306 struct tok_state *tok) 307{ 308 int ch = get_char(tok); 309 tok->decoding_state = 1; 310 if (ch == EOF) { 311 return 1; 312 } else if (ch == 0xEF) { 313 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM; 314 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM; 315#if 0 316 /* Disable support for UTF-16 BOMs until a decision 317 is made whether this needs to be supported. 
*/ 318 } else if (ch == 0xFE) { 319 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; 320 if (!set_readline(tok, "utf-16-be")) return 0; 321 tok->decoding_state = -1; 322 } else if (ch == 0xFF) { 323 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; 324 if (!set_readline(tok, "utf-16-le")) return 0; 325 tok->decoding_state = -1; 326#endif 327 } else { 328 unget_char(ch, tok); 329 return 1; 330 } 331 if (tok->encoding != NULL) 332 PyMem_DEL(tok->encoding); 333 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 334 return 1; 335 NON_BOM: 336 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ 337 unget_char(0xFF, tok); /* XXX this will cause a syntax error */ 338 return 1; 339} 340 341/* Read a line of text from TOK into S, using the stream in TOK. 342 Return NULL on failure, else S. 343 344 On entry, tok->decoding_buffer will be one of: 345 1) NULL: need to call tok->decoding_readline to get a new line 346 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 347 stored the result in tok->decoding_buffer 348 3) PyStringObject *: previous call to fp_readl did not have enough room 349 (in the s buffer) to copy entire contents of the line read 350 by tok->decoding_readline. tok->decoding_buffer has the overflow. 351 In this case, fp_readl is called in a loop (with an expanded buffer) 352 until the buffer ends with a '\n' (or until the end of the file is 353 reached): see tok_nextc and its calls to decoding_fgets. 354*/ 355 356static char * 357fp_readl(char *s, int size, struct tok_state *tok) 358{ 359#ifndef Py_USING_UNICODE 360 /* In a non-Unicode built, this should never be called. 
*/ 361 Py_FatalError("fp_readl should not be called in this build."); 362 return NULL; /* Keep compiler happy (not reachable) */ 363#else 364 PyObject* utf8 = NULL; 365 PyObject* buf = tok->decoding_buffer; 366 char *str; 367 int utf8len; 368 369 /* Ask for one less byte so we can terminate it */ 370 assert(size > 0); 371 size--; 372 373 if (buf == NULL) { 374 buf = PyObject_CallObject(tok->decoding_readline, NULL); 375 if (buf == NULL) 376 return error_ret(tok); 377 } else { 378 tok->decoding_buffer = NULL; 379 if (PyString_CheckExact(buf)) 380 utf8 = buf; 381 } 382 if (utf8 == NULL) { 383 utf8 = PyUnicode_AsUTF8String(buf); 384 Py_DECREF(buf); 385 if (utf8 == NULL) 386 return error_ret(tok); 387 } 388 str = PyString_AsString(utf8); 389 utf8len = PyString_GET_SIZE(utf8); 390 if (utf8len > size) { 391 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); 392 if (tok->decoding_buffer == NULL) { 393 Py_DECREF(utf8); 394 return error_ret(tok); 395 } 396 utf8len = size; 397 } 398 memcpy(s, str, utf8len); 399 s[utf8len] = '\0'; 400 Py_DECREF(utf8); 401 if (utf8len == 0) return NULL; /* EOF */ 402 return s; 403#endif 404} 405 406/* Set the readline function for TOK to a StreamReader's 407 readline function. The StreamReader is named ENC. 408 409 This function is called from check_bom and check_coding_spec. 410 411 ENC is usually identical to the future value of tok->encoding, 412 except for the (currently unsupported) case of UTF-16. 413 414 Return 1 on success, 0 on failure. */ 415 416static int 417fp_setreadl(struct tok_state *tok, const char* enc) 418{ 419 PyObject *reader, *stream, *readline; 420 421 /* XXX: constify filename argument. 
*/ 422 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); 423 if (stream == NULL) 424 return 0; 425 426 reader = PyCodec_StreamReader(enc, stream, NULL); 427 Py_DECREF(stream); 428 if (reader == NULL) 429 return 0; 430 431 readline = PyObject_GetAttrString(reader, "readline"); 432 Py_DECREF(reader); 433 if (readline == NULL) 434 return 0; 435 436 tok->decoding_readline = readline; 437 return 1; 438} 439 440/* Fetch the next byte from TOK. */ 441 442static int fp_getc(struct tok_state *tok) { 443 return getc(tok->fp); 444} 445 446/* Unfetch the last byte back into TOK. */ 447 448static void fp_ungetc(int c, struct tok_state *tok) { 449 ungetc(c, tok->fp); 450} 451 452/* Read a line of input from TOK. Determine encoding 453 if necessary. */ 454 455static char * 456decoding_fgets(char *s, int size, struct tok_state *tok) 457{ 458 char *line = NULL; 459 int warn = 0, badchar = 0; 460 for (;;) { 461 if (tok->decoding_state < 0) { 462 /* We already have a codec associated with 463 this input. */ 464 line = fp_readl(s, size, tok); 465 break; 466 } else if (tok->decoding_state > 0) { 467 /* We want a 'raw' read. */ 468 line = Py_UniversalNewlineFgets(s, size, 469 tok->fp, NULL); 470 warn = 1; 471 break; 472 } else { 473 /* We have not yet determined the encoding. 474 If an encoding is found, use the file-pointer 475 reader functions from now on. 
*/ 476 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 477 return error_ret(tok); 478 assert(tok->decoding_state != 0); 479 } 480 } 481 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 482 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 483 return error_ret(tok); 484 } 485 } 486#ifndef PGEN 487 if (warn && line && !tok->issued_encoding_warning && !tok->encoding) { 488 unsigned char *c; 489 for (c = (unsigned char *)line; *c; c++) 490 if (*c > 127) { 491 badchar = *c; 492 break; 493 } 494 } 495 if (badchar) { 496 char buf[500]; 497 /* Need to add 1 to the line number, since this line 498 has not been counted, yet. */ 499 sprintf(buf, 500 "Non-ASCII character '\\x%.2x' " 501 "in file %.200s on line %i, " 502 "but no encoding declared; " 503 "see http://www.python.org/peps/pep-0263.html for details", 504 badchar, tok->filename, tok->lineno + 1); 505 /* We don't use PyErr_WarnExplicit() here because 506 printing the line in question to e.g. a log file 507 could result in sensitive information being 508 exposed. */ 509 PyErr_Warn(PyExc_DeprecationWarning, buf); 510 tok->issued_encoding_warning = 1; 511 } 512#endif 513 return line; 514} 515 516static int 517decoding_feof(struct tok_state *tok) 518{ 519 if (tok->decoding_state >= 0) { 520 return feof(tok->fp); 521 } else { 522 PyObject* buf = tok->decoding_buffer; 523 if (buf == NULL) { 524 buf = PyObject_CallObject(tok->decoding_readline, NULL); 525 if (buf == NULL) { 526 error_ret(tok); 527 return 1; 528 } else { 529 tok->decoding_buffer = buf; 530 } 531 } 532 return PyObject_Length(buf) == 0; 533 } 534} 535 536/* Fetch a byte from TOK, using the string buffer. */ 537 538static int buf_getc(struct tok_state *tok) { 539 return Py_CHARMASK(*tok->str++); 540} 541 542/* Unfetch a byte from TOK, using the string buffer. 
*/ 543 544static void buf_ungetc(int c, struct tok_state *tok) { 545 tok->str--; 546 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 547} 548 549/* Set the readline function for TOK to ENC. For the string-based 550 tokenizer, this means to just record the encoding. */ 551 552static int buf_setreadl(struct tok_state *tok, const char* enc) { 553 tok->enc = enc; 554 return 1; 555} 556 557/* Return a UTF-8 encoding Python string object from the 558 C byte string STR, which is encoded with ENC. */ 559 560#ifdef Py_USING_UNICODE 561static PyObject * 562translate_into_utf8(const char* str, const char* enc) { 563 PyObject *utf8; 564 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 565 if (buf == NULL) 566 return NULL; 567 utf8 = PyUnicode_AsUTF8String(buf); 568 Py_DECREF(buf); 569 return utf8; 570} 571#endif 572 573/* Decode a byte string STR for use as the buffer of TOK. 574 Look for encoding declarations inside STR, and record them 575 inside TOK. 
*/ 576 577static const char * 578decode_str(const char *str, struct tok_state *tok) 579{ 580 PyObject* utf8 = NULL; 581 const char *s; 582 int lineno = 0; 583 tok->enc = NULL; 584 tok->str = str; 585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 586 return error_ret(tok); 587 str = tok->str; /* string after BOM if any */ 588 assert(str); 589#ifdef Py_USING_UNICODE 590 if (tok->enc != NULL) { 591 utf8 = translate_into_utf8(str, tok->enc); 592 if (utf8 == NULL) 593 return error_ret(tok); 594 str = PyString_AsString(utf8); 595 } 596#endif 597 for (s = str;; s++) { 598 if (*s == '\0') break; 599 else if (*s == '\n') { 600 lineno++; 601 if (lineno == 2) break; 602 } 603 } 604 tok->enc = NULL; 605 if (!check_coding_spec(str, s - str, tok, buf_setreadl)) 606 return error_ret(tok); 607#ifdef Py_USING_UNICODE 608 if (tok->enc != NULL) { 609 assert(utf8 == NULL); 610 utf8 = translate_into_utf8(str, tok->enc); 611 if (utf8 == NULL) { 612 PyErr_Format(PyExc_SyntaxError, 613 "unknown encoding: %s", tok->enc); 614 return error_ret(tok); 615 } 616 str = PyString_AsString(utf8); 617 } 618#endif 619 assert(tok->decoding_buffer == NULL); 620 tok->decoding_buffer = utf8; /* CAUTION */ 621 return str; 622} 623 624#endif /* PGEN */ 625 626/* Set up tokenizer for string */ 627 628struct tok_state * 629PyTokenizer_FromString(const char *str) 630{ 631 struct tok_state *tok = tok_new(); 632 if (tok == NULL) 633 return NULL; 634 str = (char *)decode_str(str, tok); 635 if (str == NULL) { 636 PyTokenizer_Free(tok); 637 return NULL; 638 } 639 640 /* XXX: constify members. 
*/ 641 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 642 return tok; 643} 644 645 646/* Set up tokenizer for file */ 647 648struct tok_state * 649PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) 650{ 651 struct tok_state *tok = tok_new(); 652 if (tok == NULL) 653 return NULL; 654 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) { 655 PyTokenizer_Free(tok); 656 return NULL; 657 } 658 tok->cur = tok->inp = tok->buf; 659 tok->end = tok->buf + BUFSIZ; 660 tok->fp = fp; 661 tok->prompt = ps1; 662 tok->nextprompt = ps2; 663 return tok; 664} 665 666 667/* Free a tok_state structure */ 668 669void 670PyTokenizer_Free(struct tok_state *tok) 671{ 672 if (tok->encoding != NULL) 673 PyMem_DEL(tok->encoding); 674#ifndef PGEN 675 Py_XDECREF(tok->decoding_readline); 676 Py_XDECREF(tok->decoding_buffer); 677#endif 678 if (tok->fp != NULL && tok->buf != NULL) 679 PyMem_DEL(tok->buf); 680 PyMem_DEL(tok); 681} 682 683#if !defined(PGEN) && defined(Py_USING_UNICODE) 684static int 685tok_stdin_decode(struct tok_state *tok, char **inp) 686{ 687 PyObject *enc, *sysstdin, *decoded, *utf8; 688 const char *encoding; 689 char *converted; 690 691 if (PySys_GetFile((char *)"stdin", NULL) != stdin) 692 return 0; 693 sysstdin = PySys_GetObject("stdin"); 694 if (sysstdin == NULL || !PyFile_Check(sysstdin)) 695 return 0; 696 697 enc = ((PyFileObject *)sysstdin)->f_encoding; 698 if (enc == NULL || !PyString_Check(enc)) 699 return 0; 700 Py_INCREF(enc); 701 702 encoding = PyString_AsString(enc); 703 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); 704 if (decoded == NULL) 705 goto error_clear; 706 707 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); 708 Py_DECREF(decoded); 709 if (utf8 == NULL) 710 goto error_clear; 711 712 converted = new_string(PyString_AsString(utf8), PyString_Size(utf8)); 713 Py_DECREF(utf8); 714 if (converted == NULL) 715 goto error_nomem; 716 717 PyMem_FREE(*inp); 718 *inp = converted; 719 if (tok->encoding != NULL) 720 
PyMem_DEL(tok->encoding); 721 tok->encoding = new_string(encoding, strlen(encoding)); 722 if (tok->encoding == NULL) 723 goto error_nomem; 724 725 Py_DECREF(enc); 726 return 0; 727 728error_nomem: 729 Py_DECREF(enc); 730 tok->done = E_NOMEM; 731 return -1; 732 733error_clear: 734 /* Fallback to iso-8859-1: for backward compatibility */ 735 Py_DECREF(enc); 736 PyErr_Clear(); 737 return 0; 738} 739#endif 740 741/* Get next char, updating state; error code goes into tok->done */ 742 743static int 744tok_nextc(register struct tok_state *tok) 745{ 746 for (;;) { 747 if (tok->cur != tok->inp) { 748 return Py_CHARMASK(*tok->cur++); /* Fast path */ 749 } 750 if (tok->done != E_OK) 751 return EOF; 752 if (tok->fp == NULL) { 753 char *end = strchr(tok->inp, '\n'); 754 if (end != NULL) 755 end++; 756 else { 757 end = strchr(tok->inp, '\0'); 758 if (end == tok->inp) { 759 tok->done = E_EOF; 760 return EOF; 761 } 762 } 763 if (tok->start == NULL) 764 tok->buf = tok->cur; 765 tok->lineno++; 766 tok->inp = end; 767 return Py_CHARMASK(*tok->cur++); 768 } 769 if (tok->prompt != NULL) { 770 char *new = PyOS_Readline(stdin, stdout, tok->prompt); 771 if (tok->nextprompt != NULL) 772 tok->prompt = tok->nextprompt; 773 if (new == NULL) 774 tok->done = E_INTR; 775 else if (*new == '\0') { 776 PyMem_FREE(new); 777 tok->done = E_EOF; 778 } 779#if !defined(PGEN) && defined(Py_USING_UNICODE) 780 else if (tok_stdin_decode(tok, &new) != 0) 781 PyMem_FREE(new); 782#endif 783 else if (tok->start != NULL) { 784 size_t start = tok->start - tok->buf; 785 size_t oldlen = tok->cur - tok->buf; 786 size_t newlen = oldlen + strlen(new); 787 char *buf = tok->buf; 788 PyMem_RESIZE(buf, char, newlen+1); 789 tok->lineno++; 790 if (buf == NULL) { 791 PyMem_DEL(tok->buf); 792 tok->buf = NULL; 793 PyMem_FREE(new); 794 tok->done = E_NOMEM; 795 return EOF; 796 } 797 tok->buf = buf; 798 tok->cur = tok->buf + oldlen; 799 strcpy(tok->buf + oldlen, new); 800 PyMem_FREE(new); 801 tok->inp = tok->buf + newlen; 802 
tok->end = tok->inp + 1; 803 tok->start = tok->buf + start; 804 } 805 else { 806 tok->lineno++; 807 if (tok->buf != NULL) 808 PyMem_DEL(tok->buf); 809 tok->buf = new; 810 tok->cur = tok->buf; 811 tok->inp = strchr(tok->buf, '\0'); 812 tok->end = tok->inp + 1; 813 } 814 } 815 else { 816 int done = 0; 817 int cur = 0; 818 char *pt; 819 if (tok->start == NULL) { 820 if (tok->buf == NULL) { 821 tok->buf = PyMem_NEW(char, BUFSIZ); 822 if (tok->buf == NULL) { 823 tok->done = E_NOMEM; 824 return EOF; 825 } 826 tok->end = tok->buf + BUFSIZ; 827 } 828 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 829 tok) == NULL) { 830 tok->done = E_EOF; 831 done = 1; 832 } 833 else { 834 tok->done = E_OK; 835 tok->inp = strchr(tok->buf, '\0'); 836 done = tok->inp[-1] == '\n'; 837 } 838 } 839 else { 840 cur = tok->cur - tok->buf; 841 if (decoding_feof(tok)) { 842 tok->done = E_EOF; 843 done = 1; 844 } 845 else 846 tok->done = E_OK; 847 } 848 tok->lineno++; 849 /* Read until '\n' or EOF */ 850 while (!done) { 851 int curstart = tok->start == NULL ? -1 : 852 tok->start - tok->buf; 853 int curvalid = tok->inp - tok->buf; 854 int newsize = curvalid + BUFSIZ; 855 char *newbuf = tok->buf; 856 PyMem_RESIZE(newbuf, char, newsize); 857 if (newbuf == NULL) { 858 tok->done = E_NOMEM; 859 tok->cur = tok->inp; 860 return EOF; 861 } 862 tok->buf = newbuf; 863 tok->inp = tok->buf + curvalid; 864 tok->end = tok->buf + newsize; 865 tok->start = curstart < 0 ? 
NULL : 866 tok->buf + curstart; 867 if (decoding_fgets(tok->inp, 868 (int)(tok->end - tok->inp), 869 tok) == NULL) { 870 /* Last line does not end in \n, 871 fake one */ 872 strcpy(tok->inp, "\n"); 873 } 874 tok->inp = strchr(tok->inp, '\0'); 875 done = tok->inp[-1] == '\n'; 876 } 877 tok->cur = tok->buf + cur; 878 /* replace "\r\n" with "\n" */ 879 /* For Mac we leave the \r, giving a syntax error */ 880 pt = tok->inp - 2; 881 if (pt >= tok->buf && *pt == '\r') { 882 *pt++ = '\n'; 883 *pt = '\0'; 884 tok->inp = pt; 885 } 886 } 887 if (tok->done != E_OK) { 888 if (tok->prompt != NULL) 889 PySys_WriteStderr("\n"); 890 tok->cur = tok->inp; 891 return EOF; 892 } 893 } 894 /*NOTREACHED*/ 895} 896 897 898/* Back-up one character */ 899 900static void 901tok_backup(register struct tok_state *tok, register int c) 902{ 903 if (c != EOF) { 904 if (--tok->cur < tok->buf) 905 Py_FatalError("tok_backup: begin of buffer"); 906 if (*tok->cur != c) 907 *tok->cur = c; 908 } 909} 910 911 912/* Return the token corresponding to a single character */ 913 914int 915PyToken_OneChar(int c) 916{ 917 switch (c) { 918 case '(': return LPAR; 919 case ')': return RPAR; 920 case '[': return LSQB; 921 case ']': return RSQB; 922 case ':': return COLON; 923 case ',': return COMMA; 924 case ';': return SEMI; 925 case '+': return PLUS; 926 case '-': return MINUS; 927 case '*': return STAR; 928 case '/': return SLASH; 929 case '|': return VBAR; 930 case '&': return AMPER; 931 case '<': return LESS; 932 case '>': return GREATER; 933 case '=': return EQUAL; 934 case '.': return DOT; 935 case '%': return PERCENT; 936 case '`': return BACKQUOTE; 937 case '{': return LBRACE; 938 case '}': return RBRACE; 939 case '^': return CIRCUMFLEX; 940 case '~': return TILDE; 941 case '@': return AT; 942 default: return OP; 943 } 944} 945 946 947int 948PyToken_TwoChars(int c1, int c2) 949{ 950 switch (c1) { 951 case '=': 952 switch (c2) { 953 case '=': return EQEQUAL; 954 } 955 break; 956 case '!': 957 switch (c2) { 
958 case '=': return NOTEQUAL; 959 } 960 break; 961 case '<': 962 switch (c2) { 963 case '>': return NOTEQUAL; 964 case '=': return LESSEQUAL; 965 case '<': return LEFTSHIFT; 966 } 967 break; 968 case '>': 969 switch (c2) { 970 case '=': return GREATEREQUAL; 971 case '>': return RIGHTSHIFT; 972 } 973 break; 974 case '+': 975 switch (c2) { 976 case '=': return PLUSEQUAL; 977 } 978 break; 979 case '-': 980 switch (c2) { 981 case '=': return MINEQUAL; 982 } 983 break; 984 case '*': 985 switch (c2) { 986 case '*': return DOUBLESTAR; 987 case '=': return STAREQUAL; 988 } 989 break; 990 case '/': 991 switch (c2) { 992 case '/': return DOUBLESLASH; 993 case '=': return SLASHEQUAL; 994 } 995 break; 996 case '|': 997 switch (c2) { 998 case '=': return VBAREQUAL; 999 } 1000 break; 1001 case '%': 1002 switch (c2) { 1003 case '=': return PERCENTEQUAL; 1004 } 1005 break; 1006 case '&': 1007 switch (c2) { 1008 case '=': return AMPEREQUAL; 1009 } 1010 break; 1011 case '^': 1012 switch (c2) { 1013 case '=': return CIRCUMFLEXEQUAL; 1014 } 1015 break; 1016 } 1017 return OP; 1018} 1019 1020int 1021PyToken_ThreeChars(int c1, int c2, int c3) 1022{ 1023 switch (c1) { 1024 case '<': 1025 switch (c2) { 1026 case '<': 1027 switch (c3) { 1028 case '=': 1029 return LEFTSHIFTEQUAL; 1030 } 1031 break; 1032 } 1033 break; 1034 case '>': 1035 switch (c2) { 1036 case '>': 1037 switch (c3) { 1038 case '=': 1039 return RIGHTSHIFTEQUAL; 1040 } 1041 break; 1042 } 1043 break; 1044 case '*': 1045 switch (c2) { 1046 case '*': 1047 switch (c3) { 1048 case '=': 1049 return DOUBLESTAREQUAL; 1050 } 1051 break; 1052 } 1053 break; 1054 case '/': 1055 switch (c2) { 1056 case '/': 1057 switch (c3) { 1058 case '=': 1059 return DOUBLESLASHEQUAL; 1060 } 1061 break; 1062 } 1063 break; 1064 } 1065 return OP; 1066} 1067 1068static int 1069indenterror(struct tok_state *tok) 1070{ 1071 if (tok->alterror) { 1072 tok->done = E_TABSPACE; 1073 tok->cur = tok->inp; 1074 return 1; 1075 } 1076 if (tok->altwarning) { 1077 
PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 1078 "in indentation\n", tok->filename); 1079 tok->altwarning = 0; 1080 } 1081 return 0; 1082} 1083 1084 1085/* Get next token, after space stripping etc. */ 1086 1087static int 1088tok_get(register struct tok_state *tok, char **p_start, char **p_end) 1089{ 1090 register int c; 1091 int blankline; 1092 1093 *p_start = *p_end = NULL; 1094 nextline: 1095 tok->start = NULL; 1096 blankline = 0; 1097 1098 /* Get indentation level */ 1099 if (tok->atbol) { 1100 register int col = 0; 1101 register int altcol = 0; 1102 tok->atbol = 0; 1103 for (;;) { 1104 c = tok_nextc(tok); 1105 if (c == ' ') 1106 col++, altcol++; 1107 else if (c == '\t') { 1108 col = (col/tok->tabsize + 1) * tok->tabsize; 1109 altcol = (altcol/tok->alttabsize + 1) 1110 * tok->alttabsize; 1111 } 1112 else if (c == '\014') /* Control-L (formfeed) */ 1113 col = altcol = 0; /* For Emacs users */ 1114 else 1115 break; 1116 } 1117 tok_backup(tok, c); 1118 if (c == '#' || c == '\n') { 1119 /* Lines with only whitespace and/or comments 1120 shouldn't affect the indentation and are 1121 not passed to the parser as NEWLINE tokens, 1122 except *totally* empty lines in interactive 1123 mode, which signal the end of a command group. 
*/ 1124 if (col == 0 && c == '\n' && tok->prompt != NULL) 1125 blankline = 0; /* Let it through */ 1126 else 1127 blankline = 1; /* Ignore completely */ 1128 /* We can't jump back right here since we still 1129 may need to skip to the end of a comment */ 1130 } 1131 if (!blankline && tok->level == 0) { 1132 if (col == tok->indstack[tok->indent]) { 1133 /* No change */ 1134 if (altcol != tok->altindstack[tok->indent]) { 1135 if (indenterror(tok)) 1136 return ERRORTOKEN; 1137 } 1138 } 1139 else if (col > tok->indstack[tok->indent]) { 1140 /* Indent -- always one */ 1141 if (tok->indent+1 >= MAXINDENT) { 1142 tok->done = E_TOODEEP; 1143 tok->cur = tok->inp; 1144 return ERRORTOKEN; 1145 } 1146 if (altcol <= tok->altindstack[tok->indent]) { 1147 if (indenterror(tok)) 1148 return ERRORTOKEN; 1149 } 1150 tok->pendin++; 1151 tok->indstack[++tok->indent] = col; 1152 tok->altindstack[tok->indent] = altcol; 1153 } 1154 else /* col < tok->indstack[tok->indent] */ { 1155 /* Dedent -- any number, must be consistent */ 1156 while (tok->indent > 0 && 1157 col < tok->indstack[tok->indent]) { 1158 tok->pendin--; 1159 tok->indent--; 1160 } 1161 if (col != tok->indstack[tok->indent]) { 1162 tok->done = E_DEDENT; 1163 tok->cur = tok->inp; 1164 return ERRORTOKEN; 1165 } 1166 if (altcol != tok->altindstack[tok->indent]) { 1167 if (indenterror(tok)) 1168 return ERRORTOKEN; 1169 } 1170 } 1171 } 1172 } 1173 1174 tok->start = tok->cur; 1175 1176 /* Return pending indents/dedents */ 1177 if (tok->pendin != 0) { 1178 if (tok->pendin < 0) { 1179 tok->pendin++; 1180 return DEDENT; 1181 } 1182 else { 1183 tok->pendin--; 1184 return INDENT; 1185 } 1186 } 1187 1188 again: 1189 tok->start = NULL; 1190 /* Skip spaces */ 1191 do { 1192 c = tok_nextc(tok); 1193 } while (c == ' ' || c == '\t' || c == '\014'); 1194 1195 /* Set start of current token */ 1196 tok->start = tok->cur - 1; 1197 1198 /* Skip comment, while looking for tab-setting magic */ 1199 if (c == '#') { 1200 static char *tabforms[] = { 
1201 "tab-width:", /* Emacs */ 1202 ":tabstop=", /* vim, full form */ 1203 ":ts=", /* vim, abbreviated form */ 1204 "set tabsize=", /* will vi never die? */ 1205 /* more templates can be added here to support other editors */ 1206 }; 1207 char cbuf[80]; 1208 char *tp, **cp; 1209 tp = cbuf; 1210 do { 1211 *tp++ = c = tok_nextc(tok); 1212 } while (c != EOF && c != '\n' && 1213 tp - cbuf + 1 < sizeof(cbuf)); 1214 *tp = '\0'; 1215 for (cp = tabforms; 1216 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); 1217 cp++) { 1218 if ((tp = strstr(cbuf, *cp))) { 1219 int newsize = atoi(tp + strlen(*cp)); 1220 1221 if (newsize >= 1 && newsize <= 40) { 1222 tok->tabsize = newsize; 1223 if (Py_VerboseFlag) 1224 PySys_WriteStderr( 1225 "Tab size set to %d\n", 1226 newsize); 1227 } 1228 } 1229 } 1230 while (c != EOF && c != '\n') 1231 c = tok_nextc(tok); 1232 } 1233 1234 /* Check for EOF and errors now */ 1235 if (c == EOF) { 1236 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 1237 } 1238 1239 /* Identifier (most frequent token!) */ 1240 if (isalpha(c) || c == '_') { 1241 /* Process r"", u"" and ur"" */ 1242 switch (c) { 1243 case 'r': 1244 case 'R': 1245 c = tok_nextc(tok); 1246 if (c == '"' || c == '\'') 1247 goto letter_quote; 1248 break; 1249 case 'u': 1250 case 'U': 1251 c = tok_nextc(tok); 1252 if (c == 'r' || c == 'R') 1253 c = tok_nextc(tok); 1254 if (c == '"' || c == '\'') 1255 goto letter_quote; 1256 break; 1257 } 1258 while (isalnum(c) || c == '_') { 1259 c = tok_nextc(tok); 1260 } 1261 tok_backup(tok, c); 1262 *p_start = tok->start; 1263 *p_end = tok->cur; 1264 return NAME; 1265 } 1266 1267 /* Newline */ 1268 if (c == '\n') { 1269 tok->atbol = 1; 1270 if (blankline || tok->level > 0) 1271 goto nextline; 1272 *p_start = tok->start; 1273 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 1274 tok->cont_line = 0; 1275 return NEWLINE; 1276 } 1277 1278 /* Period or number starting with period? 
*/ 1279 if (c == '.') { 1280 c = tok_nextc(tok); 1281 if (isdigit(c)) { 1282 goto fraction; 1283 } 1284 else { 1285 tok_backup(tok, c); 1286 *p_start = tok->start; 1287 *p_end = tok->cur; 1288 return DOT; 1289 } 1290 } 1291 1292 /* Number */ 1293 if (isdigit(c)) { 1294 if (c == '0') { 1295 /* Hex or octal -- maybe. */ 1296 c = tok_nextc(tok); 1297 if (c == '.') 1298 goto fraction; 1299#ifndef WITHOUT_COMPLEX 1300 if (c == 'j' || c == 'J') 1301 goto imaginary; 1302#endif 1303 if (c == 'x' || c == 'X') { 1304 /* Hex */ 1305 do { 1306 c = tok_nextc(tok); 1307 } while (isxdigit(c)); 1308 } 1309 else { 1310 int found_decimal = 0; 1311 /* Octal; c is first char of it */ 1312 /* There's no 'isoctdigit' macro, sigh */ 1313 while ('0' <= c && c < '8') { 1314 c = tok_nextc(tok); 1315 } 1316 if (isdigit(c)) { 1317 found_decimal = 1; 1318 do { 1319 c = tok_nextc(tok); 1320 } while (isdigit(c)); 1321 } 1322 if (c == '.') 1323 goto fraction; 1324 else if (c == 'e' || c == 'E') 1325 goto exponent; 1326#ifndef WITHOUT_COMPLEX 1327 else if (c == 'j' || c == 'J') 1328 goto imaginary; 1329#endif 1330 else if (found_decimal) { 1331 tok->done = E_TOKEN; 1332 tok_backup(tok, c); 1333 return ERRORTOKEN; 1334 } 1335 } 1336 if (c == 'l' || c == 'L') 1337 c = tok_nextc(tok); 1338 } 1339 else { 1340 /* Decimal */ 1341 do { 1342 c = tok_nextc(tok); 1343 } while (isdigit(c)); 1344 if (c == 'l' || c == 'L') 1345 c = tok_nextc(tok); 1346 else { 1347 /* Accept floating point numbers. 
 */
			/* The fraction/exponent/imaginary labels below are
			   also goto targets from the leading-zero branch
			   and the leading-period case above. */
			if (c == '.') {
	fraction:
				/* Fraction */
				do {
					c = tok_nextc(tok);
				} while (isdigit(c));
			}
			if (c == 'e' || c == 'E') {
	exponent:
				/* Exponent part */
				c = tok_nextc(tok);
				if (c == '+' || c == '-')
					c = tok_nextc(tok);
				/* An exponent marker must be followed by
				   at least one digit. */
				if (!isdigit(c)) {
					tok->done = E_TOKEN;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
				do {
					c = tok_nextc(tok);
				} while (isdigit(c));
			}
#ifndef WITHOUT_COMPLEX
			if (c == 'j' || c == 'J')
				/* Imaginary part */
	imaginary:
				c = tok_nextc(tok);
#endif
		}
	}
	/* Push back the first char past the literal. */
	tok_backup(tok, c);
	*p_start = tok->start;
	*p_end = tok->cur;
	return NUMBER;
}

  letter_quote:
/* String */
if (c == '\'' || c == '"') {
	/* quote2 is the offset (from tok->start) just past where a
	   second quote char would sit if the opening is doubled;
	   tripcount counts consecutive closing-quote chars seen. */
	int quote2 = tok->cur - tok->start + 1;
	int quote = c;
	int triple = 0;
	int tripcount = 0;
	for (;;) {
		c = tok_nextc(tok);
		if (c == '\n') {
			/* Bare newline ends a single-quoted string with
			   an error; triple-quoted strings may span
			   lines. */
			if (!triple) {
				tok->done = E_EOLS;
				tok_backup(tok, c);
				return ERRORTOKEN;
			}
			tripcount = 0;
			tok->cont_line = 1; /* multiline string.
 */
		}
		else if (c == EOF) {
			/* EOF inside a string: report unterminated
			   triple-quoted (E_EOFS) or single-quoted
			   (E_EOLS) string. */
			if (triple)
				tok->done = E_EOFS;
			else
				tok->done = E_EOLS;
			tok->cur = tok->inp;
			return ERRORTOKEN;
		}
		else if (c == quote) {
			tripcount++;
			/* A second quote right after the opening one
			   means either an empty string or the start of
			   a triple-quoted string -- peek one more. */
			if (tok->cur - tok->start == quote2) {
				c = tok_nextc(tok);
				if (c == quote) {
					triple = 1;
					tripcount = 0;
					continue;
				}
				tok_backup(tok, c);
			}
			if (!triple || tripcount == 3)
				break;
		}
		else if (c == '\\') {
			/* Backslash escapes the next char (which may be
			   a quote), so it can't close the string. */
			tripcount = 0;
			c = tok_nextc(tok);
			if (c == EOF) {
				tok->done = E_EOLS;
				tok->cur = tok->inp;
				return ERRORTOKEN;
			}
		}
		else
			tripcount = 0;
	}
	*p_start = tok->start;
	*p_end = tok->cur;
	return STRING;
}

/* Line continuation */
if (c == '\\') {
	c = tok_nextc(tok);
	/* A backslash is only legal immediately before the newline. */
	if (c != '\n') {
		tok->done = E_LINECONT;
		tok->cur = tok->inp;
		return ERRORTOKEN;
	}
	tok->cont_line = 1;
	goto again; /* Read next line */
}

/* Check for two-character token */
{
	int c2 = tok_nextc(tok);
	int token = PyToken_TwoChars(c, c2);
	if (token != OP) {
		/* A two-char operator matched; see whether a third char
		   extends it (e.g. '**' -> '**='). */
		int c3 = tok_nextc(tok);
		int token3 = PyToken_ThreeChars(c, c2, c3);
		if (token3 != OP) {
			token = token3;
		} else {
			tok_backup(tok, c3);
		}
		*p_start = tok->start;
		*p_end = tok->cur;
		return token;
	}
	tok_backup(tok, c2);
}

/* Keep track of parentheses nesting level */
switch (c) {
case '(':
case '[':
case '{':
	tok->level++;
	break;
case ')':
case ']':
case '}':
	tok->level--;
	break;
}

/* Punctuation character */
*p_start = tok->start;
*p_end = tok->cur;
return PyToken_OneChar(c);
}

/* Public entry point: fetch the next token via tok_get(), but if the
   input decoder has failed, force an ERRORTOKEN with done = E_DECODE.
   *p_start/*p_end delimit the token text inside the tokenizer's
   buffer. */
int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
	int result = tok_get(tok, p_start, p_end);
	if (tok->decoding_erred) {
		result =
			ERRORTOKEN;
		tok->done = E_DECODE;
	}
	return result;
}

#ifdef Py_DEBUG

/* Debug-build helper: print a token's name to stdout and, for tokens
   whose text matters (NAME, NUMBER, STRING, OP), the token text
   delimited by [start, end). */
void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif