/* tokenizer.c -- revision 42d63847c32fda10b61c1f420402a09ddbbe95eb */
1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "stringobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#include "pydebug.h" 20#endif /* PGEN */ 21 22extern char *PyOS_Readline(FILE *, FILE *, char *); 23/* Return malloc'ed string including trailing \n; 24 empty malloc'ed string for EOF; 25 NULL if interrupted */ 26 27/* Don't ever change this -- it would break the portability of Python code */ 28#define TABSIZE 8 29 30/* Forward */ 31static struct tok_state *tok_new(void); 32static int tok_nextc(struct tok_state *tok); 33static void tok_backup(struct tok_state *tok, int c); 34 35/* Token names */ 36 37char *_PyParser_TokenNames[] = { 38 "ENDMARKER", 39 "NAME", 40 "NUMBER", 41 "STRING", 42 "NEWLINE", 43 "INDENT", 44 "DEDENT", 45 "LPAR", 46 "RPAR", 47 "LSQB", 48 "RSQB", 49 "COLON", 50 "COMMA", 51 "SEMI", 52 "PLUS", 53 "MINUS", 54 "STAR", 55 "SLASH", 56 "VBAR", 57 "AMPER", 58 "LESS", 59 "GREATER", 60 "EQUAL", 61 "DOT", 62 "PERCENT", 63 "BACKQUOTE", 64 "LBRACE", 65 "RBRACE", 66 "EQEQUAL", 67 "NOTEQUAL", 68 "LESSEQUAL", 69 "GREATEREQUAL", 70 "TILDE", 71 "CIRCUMFLEX", 72 "LEFTSHIFT", 73 "RIGHTSHIFT", 74 "DOUBLESTAR", 75 "PLUSEQUAL", 76 "MINEQUAL", 77 "STAREQUAL", 78 "SLASHEQUAL", 79 "PERCENTEQUAL", 80 "AMPEREQUAL", 81 "VBAREQUAL", 82 "CIRCUMFLEXEQUAL", 83 "LEFTSHIFTEQUAL", 84 "RIGHTSHIFTEQUAL", 85 "DOUBLESTAREQUAL", 86 "DOUBLESLASH", 87 "DOUBLESLASHEQUAL", 88 "AT", 89 /* This table must match the #defines in token.h! 
*/ 90 "OP", 91 "<ERRORTOKEN>", 92 "<N_TOKENS>" 93}; 94 95 96/* Create and initialize a new tok_state structure */ 97 98static struct tok_state * 99tok_new(void) 100{ 101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 102 sizeof(struct tok_state)); 103 if (tok == NULL) 104 return NULL; 105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 106 tok->done = E_OK; 107 tok->fp = NULL; 108 tok->input = NULL; 109 tok->tabsize = TABSIZE; 110 tok->indent = 0; 111 tok->indstack[0] = 0; 112 tok->atbol = 1; 113 tok->pendin = 0; 114 tok->prompt = tok->nextprompt = NULL; 115 tok->lineno = 0; 116 tok->level = 0; 117 tok->filename = NULL; 118 tok->altwarning = 0; 119 tok->alterror = 0; 120 tok->alttabsize = 1; 121 tok->altindstack[0] = 0; 122 tok->decoding_state = 0; 123 tok->decoding_erred = 0; 124 tok->read_coding_spec = 0; 125 tok->encoding = NULL; 126 tok->cont_line = 0; 127#ifndef PGEN 128 tok->decoding_readline = NULL; 129 tok->decoding_buffer = NULL; 130#endif 131 return tok; 132} 133 134static char * 135new_string(const char *s, Py_ssize_t len) 136{ 137 char* result = (char *)PyMem_MALLOC(len + 1); 138 if (result != NULL) { 139 memcpy(result, s, len); 140 result[len] = '\0'; 141 } 142 return result; 143} 144 145#ifdef PGEN 146 147static char * 148decoding_fgets(char *s, int size, struct tok_state *tok) 149{ 150 return fgets(s, size, tok->fp); 151} 152 153static int 154decoding_feof(struct tok_state *tok) 155{ 156 return feof(tok->fp); 157} 158 159static char * 160decode_str(const char *str, int exec_input, struct tok_state *tok) 161{ 162 return new_string(str, strlen(str)); 163} 164 165#else /* PGEN */ 166 167static char * 168error_ret(struct tok_state *tok) /* XXX */ 169{ 170 tok->decoding_erred = 1; 171 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 172 PyMem_FREE(tok->buf); 173 tok->buf = NULL; 174 return NULL; /* as if it were EOF */ 175} 176 177 178static char * 179get_normal_name(char *s) /* for utf-8 and latin-1 */ 180{ 
181 char buf[13]; 182 int i; 183 for (i = 0; i < 12; i++) { 184 int c = s[i]; 185 if (c == '\0') 186 break; 187 else if (c == '_') 188 buf[i] = '-'; 189 else 190 buf[i] = tolower(c); 191 } 192 buf[i] = '\0'; 193 if (strcmp(buf, "utf-8") == 0 || 194 strncmp(buf, "utf-8-", 6) == 0) 195 return "utf-8"; 196 else if (strcmp(buf, "latin-1") == 0 || 197 strcmp(buf, "iso-8859-1") == 0 || 198 strcmp(buf, "iso-latin-1") == 0 || 199 strncmp(buf, "latin-1-", 8) == 0 || 200 strncmp(buf, "iso-8859-1-", 11) == 0 || 201 strncmp(buf, "iso-latin-1-", 12) == 0) 202 return "iso-8859-1"; 203 else 204 return s; 205} 206 207/* Return the coding spec in S, or NULL if none is found. */ 208 209static char * 210get_coding_spec(const char *s, Py_ssize_t size) 211{ 212 Py_ssize_t i; 213 /* Coding spec must be in a comment, and that comment must be 214 * the only statement on the source code line. */ 215 for (i = 0; i < size - 6; i++) { 216 if (s[i] == '#') 217 break; 218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 219 return NULL; 220 } 221 for (; i < size - 6; i++) { /* XXX inefficient search */ 222 const char* t = s + i; 223 if (strncmp(t, "coding", 6) == 0) { 224 const char* begin = NULL; 225 t += 6; 226 if (t[0] != ':' && t[0] != '=') 227 continue; 228 do { 229 t++; 230 } while (t[0] == '\x20' || t[0] == '\t'); 231 232 begin = t; 233 while (isalnum(Py_CHARMASK(t[0])) || 234 t[0] == '-' || t[0] == '_' || t[0] == '.') 235 t++; 236 237 if (begin < t) { 238 char* r = new_string(begin, t - begin); 239 char* q = get_normal_name(r); 240 if (r != q) { 241 PyMem_FREE(r); 242 r = new_string(q, strlen(q)); 243 } 244 return r; 245 } 246 } 247 } 248 return NULL; 249} 250 251/* Check whether the line contains a coding spec. If it does, 252 invoke the set_readline function for the new encoding. 253 This function receives the tok_state and the new encoding. 254 Return 1 on success, 0 on failure. 
*/ 255 256static int 257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, 258 int set_readline(struct tok_state *, const char *)) 259{ 260 char * cs; 261 int r = 1; 262 263 if (tok->cont_line) 264 /* It's a continuation line, so it can't be a coding spec. */ 265 return 1; 266 cs = get_coding_spec(line, size); 267 if (cs != NULL) { 268 tok->read_coding_spec = 1; 269 if (tok->encoding == NULL) { 270 assert(tok->decoding_state == 1); /* raw */ 271 if (strcmp(cs, "utf-8") == 0 || 272 strcmp(cs, "iso-8859-1") == 0) { 273 tok->encoding = cs; 274 } else { 275#ifdef Py_USING_UNICODE 276 r = set_readline(tok, cs); 277 if (r) { 278 tok->encoding = cs; 279 tok->decoding_state = -1; 280 } 281 else 282 PyMem_FREE(cs); 283#else 284 /* Without Unicode support, we cannot 285 process the coding spec. Since there 286 won't be any Unicode literals, that 287 won't matter. */ 288 PyMem_FREE(cs); 289#endif 290 } 291 } else { /* then, compare cs with BOM */ 292 r = (strcmp(tok->encoding, cs) == 0); 293 PyMem_FREE(cs); 294 } 295 } 296 if (!r) { 297 cs = tok->encoding; 298 if (!cs) 299 cs = "with BOM"; 300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); 301 } 302 return r; 303} 304 305/* See whether the file starts with a BOM. If it does, 306 invoke the set_readline function with the new encoding. 307 Return 1 on success, 0 on failure. */ 308 309static int 310check_bom(int get_char(struct tok_state *), 311 void unget_char(int, struct tok_state *), 312 int set_readline(struct tok_state *, const char *), 313 struct tok_state *tok) 314{ 315 int ch = get_char(tok); 316 tok->decoding_state = 1; 317 if (ch == EOF) { 318 return 1; 319 } else if (ch == 0xEF) { 320 ch = get_char(tok); 321 if (ch != 0xBB) 322 goto NON_BOM; 323 ch = get_char(tok); 324 if (ch != 0xBF) 325 goto NON_BOM; 326#if 0 327 /* Disable support for UTF-16 BOMs until a decision 328 is made whether this needs to be supported. 
*/ 329 } else if (ch == 0xFE) { 330 ch = get_char(tok); 331 if (ch != 0xFF) 332 goto NON_BOM; 333 if (!set_readline(tok, "utf-16-be")) 334 return 0; 335 tok->decoding_state = -1; 336 } else if (ch == 0xFF) { 337 ch = get_char(tok); 338 if (ch != 0xFE) 339 goto NON_BOM; 340 if (!set_readline(tok, "utf-16-le")) 341 return 0; 342 tok->decoding_state = -1; 343#endif 344 } else { 345 unget_char(ch, tok); 346 return 1; 347 } 348 if (tok->encoding != NULL) 349 PyMem_FREE(tok->encoding); 350 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 351 return 1; 352 NON_BOM: 353 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ 354 unget_char(0xFF, tok); /* XXX this will cause a syntax error */ 355 return 1; 356} 357 358/* Read a line of text from TOK into S, using the stream in TOK. 359 Return NULL on failure, else S. 360 361 On entry, tok->decoding_buffer will be one of: 362 1) NULL: need to call tok->decoding_readline to get a new line 363 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 364 stored the result in tok->decoding_buffer 365 3) PyStringObject *: previous call to fp_readl did not have enough room 366 (in the s buffer) to copy entire contents of the line read 367 by tok->decoding_readline. tok->decoding_buffer has the overflow. 368 In this case, fp_readl is called in a loop (with an expanded buffer) 369 until the buffer ends with a '\n' (or until the end of the file is 370 reached): see tok_nextc and its calls to decoding_fgets. 371*/ 372 373static char * 374fp_readl(char *s, int size, struct tok_state *tok) 375{ 376#ifndef Py_USING_UNICODE 377 /* In a non-Unicode built, this should never be called. 
*/ 378 Py_FatalError("fp_readl should not be called in this build."); 379 return NULL; /* Keep compiler happy (not reachable) */ 380#else 381 PyObject* utf8 = NULL; 382 PyObject* buf = tok->decoding_buffer; 383 char *str; 384 Py_ssize_t utf8len; 385 386 /* Ask for one less byte so we can terminate it */ 387 assert(size > 0); 388 size--; 389 390 if (buf == NULL) { 391 buf = PyObject_CallObject(tok->decoding_readline, NULL); 392 if (buf == NULL) 393 return error_ret(tok); 394 } else { 395 tok->decoding_buffer = NULL; 396 if (PyString_CheckExact(buf)) 397 utf8 = buf; 398 } 399 if (utf8 == NULL) { 400 utf8 = PyUnicode_AsUTF8String(buf); 401 Py_DECREF(buf); 402 if (utf8 == NULL) 403 return error_ret(tok); 404 } 405 str = PyString_AsString(utf8); 406 utf8len = PyString_GET_SIZE(utf8); 407 if (utf8len > size) { 408 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); 409 if (tok->decoding_buffer == NULL) { 410 Py_DECREF(utf8); 411 return error_ret(tok); 412 } 413 utf8len = size; 414 } 415 memcpy(s, str, utf8len); 416 s[utf8len] = '\0'; 417 Py_DECREF(utf8); 418 if (utf8len == 0) 419 return NULL; /* EOF */ 420 return s; 421#endif 422} 423 424/* Set the readline function for TOK to a StreamReader's 425 readline function. The StreamReader is named ENC. 426 427 This function is called from check_bom and check_coding_spec. 428 429 ENC is usually identical to the future value of tok->encoding, 430 except for the (currently unsupported) case of UTF-16. 431 432 Return 1 on success, 0 on failure. */ 433 434static int 435fp_setreadl(struct tok_state *tok, const char* enc) 436{ 437 PyObject *reader, *stream, *readline; 438 439 /* XXX: constify filename argument. 
*/ 440 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); 441 if (stream == NULL) 442 return 0; 443 444 reader = PyCodec_StreamReader(enc, stream, NULL); 445 Py_DECREF(stream); 446 if (reader == NULL) 447 return 0; 448 449 readline = PyObject_GetAttrString(reader, "readline"); 450 Py_DECREF(reader); 451 if (readline == NULL) 452 return 0; 453 454 tok->decoding_readline = readline; 455 return 1; 456} 457 458/* Fetch the next byte from TOK. */ 459 460static int fp_getc(struct tok_state *tok) { 461 return getc(tok->fp); 462} 463 464/* Unfetch the last byte back into TOK. */ 465 466static void fp_ungetc(int c, struct tok_state *tok) { 467 ungetc(c, tok->fp); 468} 469 470/* Read a line of input from TOK. Determine encoding 471 if necessary. */ 472 473static char * 474decoding_fgets(char *s, int size, struct tok_state *tok) 475{ 476 char *line = NULL; 477 int badchar = 0; 478 for (;;) { 479 if (tok->decoding_state < 0) { 480 /* We already have a codec associated with 481 this input. */ 482 line = fp_readl(s, size, tok); 483 break; 484 } else if (tok->decoding_state > 0) { 485 /* We want a 'raw' read. */ 486 line = Py_UniversalNewlineFgets(s, size, 487 tok->fp, NULL); 488 break; 489 } else { 490 /* We have not yet determined the encoding. 491 If an encoding is found, use the file-pointer 492 reader functions from now on. */ 493 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 494 return error_ret(tok); 495 assert(tok->decoding_state != 0); 496 } 497 } 498 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 499 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 500 return error_ret(tok); 501 } 502 } 503#ifndef PGEN 504 /* The default encoding is ASCII, so make sure we don't have any 505 non-ASCII bytes in it. 
*/ 506 if (line && !tok->encoding) { 507 unsigned char *c; 508 for (c = (unsigned char *)line; *c; c++) 509 if (*c > 127) { 510 badchar = *c; 511 break; 512 } 513 } 514 if (badchar) { 515 char buf[500]; 516 /* Need to add 1 to the line number, since this line 517 has not been counted, yet. */ 518 sprintf(buf, 519 "Non-ASCII character '\\x%.2x' " 520 "in file %.200s on line %i, " 521 "but no encoding declared; " 522 "see http://www.python.org/peps/pep-0263.html for details", 523 badchar, tok->filename, tok->lineno + 1); 524 PyErr_SetString(PyExc_SyntaxError, buf); 525 return error_ret(tok); 526 } 527#endif 528 return line; 529} 530 531static int 532decoding_feof(struct tok_state *tok) 533{ 534 if (tok->decoding_state >= 0) { 535 return feof(tok->fp); 536 } else { 537 PyObject* buf = tok->decoding_buffer; 538 if (buf == NULL) { 539 buf = PyObject_CallObject(tok->decoding_readline, NULL); 540 if (buf == NULL) { 541 error_ret(tok); 542 return 1; 543 } else { 544 tok->decoding_buffer = buf; 545 } 546 } 547 return PyObject_Length(buf) == 0; 548 } 549} 550 551/* Fetch a byte from TOK, using the string buffer. */ 552 553static int 554buf_getc(struct tok_state *tok) { 555 return Py_CHARMASK(*tok->str++); 556} 557 558/* Unfetch a byte from TOK, using the string buffer. */ 559 560static void 561buf_ungetc(int c, struct tok_state *tok) { 562 tok->str--; 563 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 564} 565 566/* Set the readline function for TOK to ENC. For the string-based 567 tokenizer, this means to just record the encoding. */ 568 569static int 570buf_setreadl(struct tok_state *tok, const char* enc) { 571 tok->enc = enc; 572 return 1; 573} 574 575/* Return a UTF-8 encoding Python string object from the 576 C byte string STR, which is encoded with ENC. 
*/ 577 578#ifdef Py_USING_UNICODE 579static PyObject * 580translate_into_utf8(const char* str, const char* enc) { 581 PyObject *utf8; 582 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 583 if (buf == NULL) 584 return NULL; 585 utf8 = PyUnicode_AsUTF8String(buf); 586 Py_DECREF(buf); 587 return utf8; 588} 589#endif 590 591 592static char * 593translate_newlines(const char *s, int exec_input, struct tok_state *tok) { 594 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length; 595 char *buf, *current; 596 char c = '\0'; 597 buf = PyMem_MALLOC(needed_length); 598 if (buf == NULL) { 599 tok->done = E_NOMEM; 600 return NULL; 601 } 602 for (current = buf; *s; s++, current++) { 603 c = *s; 604 if (skip_next_lf) { 605 skip_next_lf = 0; 606 if (c == '\n') { 607 c = *++s; 608 if (!c) 609 break; 610 } 611 } 612 if (c == '\r') { 613 skip_next_lf = 1; 614 c = '\n'; 615 } 616 *current = c; 617 } 618 /* If this is exec input, add a newline to the end of the string if 619 there isn't one already. */ 620 if (exec_input && c != '\n') { 621 *current = '\n'; 622 current++; 623 } 624 *current = '\0'; 625 final_length = current - buf + 1; 626 if (final_length < needed_length && final_length) 627 /* should never fail */ 628 buf = PyMem_REALLOC(buf, final_length); 629 return buf; 630} 631 632/* Decode a byte string STR for use as the buffer of TOK. 633 Look for encoding declarations inside STR, and record them 634 inside TOK. 
*/ 635 636static const char * 637decode_str(const char *input, int single, struct tok_state *tok) 638{ 639 PyObject* utf8 = NULL; 640 const char *str; 641 const char *s; 642 const char *newl[2] = {NULL, NULL}; 643 int lineno = 0; 644 tok->input = str = translate_newlines(input, single, tok); 645 if (str == NULL) 646 return NULL; 647 tok->enc = NULL; 648 tok->str = str; 649 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 650 return error_ret(tok); 651 str = tok->str; /* string after BOM if any */ 652 assert(str); 653#ifdef Py_USING_UNICODE 654 if (tok->enc != NULL) { 655 utf8 = translate_into_utf8(str, tok->enc); 656 if (utf8 == NULL) 657 return error_ret(tok); 658 str = PyString_AsString(utf8); 659 } 660#endif 661 for (s = str;; s++) { 662 if (*s == '\0') break; 663 else if (*s == '\n') { 664 assert(lineno < 2); 665 newl[lineno] = s; 666 lineno++; 667 if (lineno == 2) break; 668 } 669 } 670 tok->enc = NULL; 671 /* need to check line 1 and 2 separately since check_coding_spec 672 assumes a single line as input */ 673 if (newl[0]) { 674 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) 675 return error_ret(tok); 676 if (tok->enc == NULL && newl[1]) { 677 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], 678 tok, buf_setreadl)) 679 return error_ret(tok); 680 } 681 } 682#ifdef Py_USING_UNICODE 683 if (tok->enc != NULL) { 684 assert(utf8 == NULL); 685 utf8 = translate_into_utf8(str, tok->enc); 686 if (utf8 == NULL) 687 return error_ret(tok); 688 str = PyString_AsString(utf8); 689 } 690#endif 691 assert(tok->decoding_buffer == NULL); 692 tok->decoding_buffer = utf8; /* CAUTION */ 693 return str; 694} 695 696#endif /* PGEN */ 697 698/* Set up tokenizer for string */ 699 700struct tok_state * 701PyTokenizer_FromString(const char *str, int exec_input) 702{ 703 struct tok_state *tok = tok_new(); 704 if (tok == NULL) 705 return NULL; 706 str = (char *)decode_str(str, exec_input, tok); 707 if (str == NULL) { 708 PyTokenizer_Free(tok); 709 return NULL; 
710 } 711 712 /* XXX: constify members. */ 713 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 714 return tok; 715} 716 717 718/* Set up tokenizer for file */ 719 720struct tok_state * 721PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) 722{ 723 struct tok_state *tok = tok_new(); 724 if (tok == NULL) 725 return NULL; 726 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { 727 PyTokenizer_Free(tok); 728 return NULL; 729 } 730 tok->cur = tok->inp = tok->buf; 731 tok->end = tok->buf + BUFSIZ; 732 tok->fp = fp; 733 tok->prompt = ps1; 734 tok->nextprompt = ps2; 735 return tok; 736} 737 738 739/* Free a tok_state structure */ 740 741void 742PyTokenizer_Free(struct tok_state *tok) 743{ 744 if (tok->encoding != NULL) 745 PyMem_FREE(tok->encoding); 746#ifndef PGEN 747 Py_XDECREF(tok->decoding_readline); 748 Py_XDECREF(tok->decoding_buffer); 749#endif 750 if (tok->fp != NULL && tok->buf != NULL) 751 PyMem_FREE(tok->buf); 752 if (tok->input) 753 PyMem_FREE((char *)tok->input); 754 PyMem_FREE(tok); 755} 756 757#if !defined(PGEN) && defined(Py_USING_UNICODE) 758static int 759tok_stdin_decode(struct tok_state *tok, char **inp) 760{ 761 PyObject *enc, *sysstdin, *decoded, *utf8; 762 const char *encoding; 763 char *converted; 764 765 if (PySys_GetFile((char *)"stdin", NULL) != stdin) 766 return 0; 767 sysstdin = PySys_GetObject("stdin"); 768 if (sysstdin == NULL || !PyFile_Check(sysstdin)) 769 return 0; 770 771 enc = ((PyFileObject *)sysstdin)->f_encoding; 772 if (enc == NULL || !PyString_Check(enc)) 773 return 0; 774 Py_INCREF(enc); 775 776 encoding = PyString_AsString(enc); 777 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); 778 if (decoded == NULL) 779 goto error_clear; 780 781 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); 782 Py_DECREF(decoded); 783 if (utf8 == NULL) 784 goto error_clear; 785 786 assert(PyString_Check(utf8)); 787 converted = new_string(PyString_AS_STRING(utf8), 788 PyString_GET_SIZE(utf8)); 789 Py_DECREF(utf8); 790 
if (converted == NULL) 791 goto error_nomem; 792 793 PyMem_FREE(*inp); 794 *inp = converted; 795 if (tok->encoding != NULL) 796 PyMem_FREE(tok->encoding); 797 tok->encoding = new_string(encoding, strlen(encoding)); 798 if (tok->encoding == NULL) 799 goto error_nomem; 800 801 Py_DECREF(enc); 802 return 0; 803 804error_nomem: 805 Py_DECREF(enc); 806 tok->done = E_NOMEM; 807 return -1; 808 809error_clear: 810 /* Fallback to iso-8859-1: for backward compatibility */ 811 Py_DECREF(enc); 812 PyErr_Clear(); 813 return 0; 814} 815#endif 816 817/* Get next char, updating state; error code goes into tok->done */ 818 819static int 820tok_nextc(register struct tok_state *tok) 821{ 822 for (;;) { 823 if (tok->cur != tok->inp) { 824 return Py_CHARMASK(*tok->cur++); /* Fast path */ 825 } 826 if (tok->done != E_OK) 827 return EOF; 828 if (tok->fp == NULL) { 829 char *end = strchr(tok->inp, '\n'); 830 if (end != NULL) 831 end++; 832 else { 833 end = strchr(tok->inp, '\0'); 834 if (end == tok->inp) { 835 tok->done = E_EOF; 836 return EOF; 837 } 838 } 839 if (tok->start == NULL) 840 tok->buf = tok->cur; 841 tok->line_start = tok->cur; 842 tok->lineno++; 843 tok->inp = end; 844 return Py_CHARMASK(*tok->cur++); 845 } 846 if (tok->prompt != NULL) { 847 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); 848 if (tok->nextprompt != NULL) 849 tok->prompt = tok->nextprompt; 850 if (newtok == NULL) 851 tok->done = E_INTR; 852 else if (*newtok == '\0') { 853 PyMem_FREE(newtok); 854 tok->done = E_EOF; 855 } 856#if !defined(PGEN) && defined(Py_USING_UNICODE) 857 else if (tok_stdin_decode(tok, &newtok) != 0) 858 PyMem_FREE(newtok); 859#endif 860 else if (tok->start != NULL) { 861 size_t start = tok->start - tok->buf; 862 size_t oldlen = tok->cur - tok->buf; 863 size_t newlen = oldlen + strlen(newtok); 864 char *buf = tok->buf; 865 buf = (char *)PyMem_REALLOC(buf, newlen+1); 866 tok->lineno++; 867 if (buf == NULL) { 868 PyMem_FREE(tok->buf); 869 tok->buf = NULL; 870 PyMem_FREE(newtok); 871 
tok->done = E_NOMEM; 872 return EOF; 873 } 874 tok->buf = buf; 875 tok->cur = tok->buf + oldlen; 876 tok->line_start = tok->cur; 877 strcpy(tok->buf + oldlen, newtok); 878 PyMem_FREE(newtok); 879 tok->inp = tok->buf + newlen; 880 tok->end = tok->inp + 1; 881 tok->start = tok->buf + start; 882 } 883 else { 884 tok->lineno++; 885 if (tok->buf != NULL) 886 PyMem_FREE(tok->buf); 887 tok->buf = newtok; 888 tok->line_start = tok->buf; 889 tok->cur = tok->buf; 890 tok->line_start = tok->buf; 891 tok->inp = strchr(tok->buf, '\0'); 892 tok->end = tok->inp + 1; 893 } 894 } 895 else { 896 int done = 0; 897 Py_ssize_t cur = 0; 898 char *pt; 899 if (tok->start == NULL) { 900 if (tok->buf == NULL) { 901 tok->buf = (char *) 902 PyMem_MALLOC(BUFSIZ); 903 if (tok->buf == NULL) { 904 tok->done = E_NOMEM; 905 return EOF; 906 } 907 tok->end = tok->buf + BUFSIZ; 908 } 909 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 910 tok) == NULL) { 911 tok->done = E_EOF; 912 done = 1; 913 } 914 else { 915 tok->done = E_OK; 916 tok->inp = strchr(tok->buf, '\0'); 917 done = tok->inp[-1] == '\n'; 918 } 919 } 920 else { 921 cur = tok->cur - tok->buf; 922 if (decoding_feof(tok)) { 923 tok->done = E_EOF; 924 done = 1; 925 } 926 else 927 tok->done = E_OK; 928 } 929 tok->lineno++; 930 /* Read until '\n' or EOF */ 931 while (!done) { 932 Py_ssize_t curstart = tok->start == NULL ? -1 : 933 tok->start - tok->buf; 934 Py_ssize_t curvalid = tok->inp - tok->buf; 935 Py_ssize_t newsize = curvalid + BUFSIZ; 936 char *newbuf = tok->buf; 937 newbuf = (char *)PyMem_REALLOC(newbuf, 938 newsize); 939 if (newbuf == NULL) { 940 tok->done = E_NOMEM; 941 tok->cur = tok->inp; 942 return EOF; 943 } 944 tok->buf = newbuf; 945 tok->inp = tok->buf + curvalid; 946 tok->end = tok->buf + newsize; 947 tok->start = curstart < 0 ? 
NULL : 948 tok->buf + curstart; 949 if (decoding_fgets(tok->inp, 950 (int)(tok->end - tok->inp), 951 tok) == NULL) { 952 /* Break out early on decoding 953 errors, as tok->buf will be NULL 954 */ 955 if (tok->decoding_erred) 956 return EOF; 957 /* Last line does not end in \n, 958 fake one */ 959 strcpy(tok->inp, "\n"); 960 } 961 tok->inp = strchr(tok->inp, '\0'); 962 done = tok->inp[-1] == '\n'; 963 } 964 if (tok->buf != NULL) { 965 tok->cur = tok->buf + cur; 966 tok->line_start = tok->cur; 967 /* replace "\r\n" with "\n" */ 968 /* For Mac leave the \r, giving a syntax error */ 969 pt = tok->inp - 2; 970 if (pt >= tok->buf && *pt == '\r') { 971 *pt++ = '\n'; 972 *pt = '\0'; 973 tok->inp = pt; 974 } 975 } 976 } 977 if (tok->done != E_OK) { 978 if (tok->prompt != NULL) 979 PySys_WriteStderr("\n"); 980 tok->cur = tok->inp; 981 return EOF; 982 } 983 } 984 /*NOTREACHED*/ 985} 986 987 988/* Back-up one character */ 989 990static void 991tok_backup(register struct tok_state *tok, register int c) 992{ 993 if (c != EOF) { 994 if (--tok->cur < tok->buf) 995 Py_FatalError("tok_backup: beginning of buffer"); 996 if (*tok->cur != c) 997 *tok->cur = c; 998 } 999} 1000 1001 1002/* Return the token corresponding to a single character */ 1003 1004int 1005PyToken_OneChar(int c) 1006{ 1007 switch (c) { 1008 case '(': return LPAR; 1009 case ')': return RPAR; 1010 case '[': return LSQB; 1011 case ']': return RSQB; 1012 case ':': return COLON; 1013 case ',': return COMMA; 1014 case ';': return SEMI; 1015 case '+': return PLUS; 1016 case '-': return MINUS; 1017 case '*': return STAR; 1018 case '/': return SLASH; 1019 case '|': return VBAR; 1020 case '&': return AMPER; 1021 case '<': return LESS; 1022 case '>': return GREATER; 1023 case '=': return EQUAL; 1024 case '.': return DOT; 1025 case '%': return PERCENT; 1026 case '`': return BACKQUOTE; 1027 case '{': return LBRACE; 1028 case '}': return RBRACE; 1029 case '^': return CIRCUMFLEX; 1030 case '~': return TILDE; 1031 case '@': return 
AT; 1032 default: return OP; 1033 } 1034} 1035 1036 1037int 1038PyToken_TwoChars(int c1, int c2) 1039{ 1040 switch (c1) { 1041 case '=': 1042 switch (c2) { 1043 case '=': return EQEQUAL; 1044 } 1045 break; 1046 case '!': 1047 switch (c2) { 1048 case '=': return NOTEQUAL; 1049 } 1050 break; 1051 case '<': 1052 switch (c2) { 1053 case '>': return NOTEQUAL; 1054 case '=': return LESSEQUAL; 1055 case '<': return LEFTSHIFT; 1056 } 1057 break; 1058 case '>': 1059 switch (c2) { 1060 case '=': return GREATEREQUAL; 1061 case '>': return RIGHTSHIFT; 1062 } 1063 break; 1064 case '+': 1065 switch (c2) { 1066 case '=': return PLUSEQUAL; 1067 } 1068 break; 1069 case '-': 1070 switch (c2) { 1071 case '=': return MINEQUAL; 1072 } 1073 break; 1074 case '*': 1075 switch (c2) { 1076 case '*': return DOUBLESTAR; 1077 case '=': return STAREQUAL; 1078 } 1079 break; 1080 case '/': 1081 switch (c2) { 1082 case '/': return DOUBLESLASH; 1083 case '=': return SLASHEQUAL; 1084 } 1085 break; 1086 case '|': 1087 switch (c2) { 1088 case '=': return VBAREQUAL; 1089 } 1090 break; 1091 case '%': 1092 switch (c2) { 1093 case '=': return PERCENTEQUAL; 1094 } 1095 break; 1096 case '&': 1097 switch (c2) { 1098 case '=': return AMPEREQUAL; 1099 } 1100 break; 1101 case '^': 1102 switch (c2) { 1103 case '=': return CIRCUMFLEXEQUAL; 1104 } 1105 break; 1106 } 1107 return OP; 1108} 1109 1110int 1111PyToken_ThreeChars(int c1, int c2, int c3) 1112{ 1113 switch (c1) { 1114 case '<': 1115 switch (c2) { 1116 case '<': 1117 switch (c3) { 1118 case '=': 1119 return LEFTSHIFTEQUAL; 1120 } 1121 break; 1122 } 1123 break; 1124 case '>': 1125 switch (c2) { 1126 case '>': 1127 switch (c3) { 1128 case '=': 1129 return RIGHTSHIFTEQUAL; 1130 } 1131 break; 1132 } 1133 break; 1134 case '*': 1135 switch (c2) { 1136 case '*': 1137 switch (c3) { 1138 case '=': 1139 return DOUBLESTAREQUAL; 1140 } 1141 break; 1142 } 1143 break; 1144 case '/': 1145 switch (c2) { 1146 case '/': 1147 switch (c3) { 1148 case '=': 1149 return 
DOUBLESLASHEQUAL; 1150 } 1151 break; 1152 } 1153 break; 1154 } 1155 return OP; 1156} 1157 1158static int 1159indenterror(struct tok_state *tok) 1160{ 1161 if (tok->alterror) { 1162 tok->done = E_TABSPACE; 1163 tok->cur = tok->inp; 1164 return 1; 1165 } 1166 if (tok->altwarning) { 1167 PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 1168 "in indentation\n", tok->filename); 1169 tok->altwarning = 0; 1170 } 1171 return 0; 1172} 1173 1174 1175/* Get next token, after space stripping etc. */ 1176 1177static int 1178tok_get(register struct tok_state *tok, char **p_start, char **p_end) 1179{ 1180 register int c; 1181 int blankline; 1182 1183 *p_start = *p_end = NULL; 1184 nextline: 1185 tok->start = NULL; 1186 blankline = 0; 1187 1188 /* Get indentation level */ 1189 if (tok->atbol) { 1190 register int col = 0; 1191 register int altcol = 0; 1192 tok->atbol = 0; 1193 for (;;) { 1194 c = tok_nextc(tok); 1195 if (c == ' ') 1196 col++, altcol++; 1197 else if (c == '\t') { 1198 col = (col/tok->tabsize + 1) * tok->tabsize; 1199 altcol = (altcol/tok->alttabsize + 1) 1200 * tok->alttabsize; 1201 } 1202 else if (c == '\014') /* Control-L (formfeed) */ 1203 col = altcol = 0; /* For Emacs users */ 1204 else 1205 break; 1206 } 1207 tok_backup(tok, c); 1208 if (c == '#' || c == '\n') { 1209 /* Lines with only whitespace and/or comments 1210 shouldn't affect the indentation and are 1211 not passed to the parser as NEWLINE tokens, 1212 except *totally* empty lines in interactive 1213 mode, which signal the end of a command group. 
*/ 1214 if (col == 0 && c == '\n' && tok->prompt != NULL) 1215 blankline = 0; /* Let it through */ 1216 else 1217 blankline = 1; /* Ignore completely */ 1218 /* We can't jump back right here since we still 1219 may need to skip to the end of a comment */ 1220 } 1221 if (!blankline && tok->level == 0) { 1222 if (col == tok->indstack[tok->indent]) { 1223 /* No change */ 1224 if (altcol != tok->altindstack[tok->indent]) { 1225 if (indenterror(tok)) 1226 return ERRORTOKEN; 1227 } 1228 } 1229 else if (col > tok->indstack[tok->indent]) { 1230 /* Indent -- always one */ 1231 if (tok->indent+1 >= MAXINDENT) { 1232 tok->done = E_TOODEEP; 1233 tok->cur = tok->inp; 1234 return ERRORTOKEN; 1235 } 1236 if (altcol <= tok->altindstack[tok->indent]) { 1237 if (indenterror(tok)) 1238 return ERRORTOKEN; 1239 } 1240 tok->pendin++; 1241 tok->indstack[++tok->indent] = col; 1242 tok->altindstack[tok->indent] = altcol; 1243 } 1244 else /* col < tok->indstack[tok->indent] */ { 1245 /* Dedent -- any number, must be consistent */ 1246 while (tok->indent > 0 && 1247 col < tok->indstack[tok->indent]) { 1248 tok->pendin--; 1249 tok->indent--; 1250 } 1251 if (col != tok->indstack[tok->indent]) { 1252 tok->done = E_DEDENT; 1253 tok->cur = tok->inp; 1254 return ERRORTOKEN; 1255 } 1256 if (altcol != tok->altindstack[tok->indent]) { 1257 if (indenterror(tok)) 1258 return ERRORTOKEN; 1259 } 1260 } 1261 } 1262 } 1263 1264 tok->start = tok->cur; 1265 1266 /* Return pending indents/dedents */ 1267 if (tok->pendin != 0) { 1268 if (tok->pendin < 0) { 1269 tok->pendin++; 1270 return DEDENT; 1271 } 1272 else { 1273 tok->pendin--; 1274 return INDENT; 1275 } 1276 } 1277 1278 again: 1279 tok->start = NULL; 1280 /* Skip spaces */ 1281 do { 1282 c = tok_nextc(tok); 1283 } while (c == ' ' || c == '\t' || c == '\014'); 1284 1285 /* Set start of current token */ 1286 tok->start = tok->cur - 1; 1287 1288 /* Skip comment, while looking for tab-setting magic */ 1289 if (c == '#') { 1290 static char *tabforms[] = { 
1291 "tab-width:", /* Emacs */ 1292 ":tabstop=", /* vim, full form */ 1293 ":ts=", /* vim, abbreviated form */ 1294 "set tabsize=", /* will vi never die? */ 1295 /* more templates can be added here to support other editors */ 1296 }; 1297 char cbuf[80]; 1298 char *tp, **cp; 1299 tp = cbuf; 1300 do { 1301 *tp++ = c = tok_nextc(tok); 1302 } while (c != EOF && c != '\n' && 1303 (size_t)(tp - cbuf + 1) < sizeof(cbuf)); 1304 *tp = '\0'; 1305 for (cp = tabforms; 1306 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); 1307 cp++) { 1308 if ((tp = strstr(cbuf, *cp))) { 1309 int newsize = atoi(tp + strlen(*cp)); 1310 1311 if (newsize >= 1 && newsize <= 40) { 1312 tok->tabsize = newsize; 1313 if (Py_VerboseFlag) 1314 PySys_WriteStderr( 1315 "Tab size set to %d\n", 1316 newsize); 1317 } 1318 } 1319 } 1320 while (c != EOF && c != '\n') 1321 c = tok_nextc(tok); 1322 } 1323 1324 /* Check for EOF and errors now */ 1325 if (c == EOF) { 1326 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 1327 } 1328 1329 /* Identifier (most frequent token!) 
*/ 1330 if (isalpha(c) || c == '_') { 1331 /* Process r"", u"" and ur"" */ 1332 switch (c) { 1333 case 'b': 1334 case 'B': 1335 c = tok_nextc(tok); 1336 if (c == 'r' || c == 'R') 1337 c = tok_nextc(tok); 1338 if (c == '"' || c == '\'') 1339 goto letter_quote; 1340 break; 1341 case 'r': 1342 case 'R': 1343 c = tok_nextc(tok); 1344 if (c == '"' || c == '\'') 1345 goto letter_quote; 1346 break; 1347 case 'u': 1348 case 'U': 1349 c = tok_nextc(tok); 1350 if (c == 'r' || c == 'R') 1351 c = tok_nextc(tok); 1352 if (c == '"' || c == '\'') 1353 goto letter_quote; 1354 break; 1355 } 1356 while (isalnum(c) || c == '_') { 1357 c = tok_nextc(tok); 1358 } 1359 tok_backup(tok, c); 1360 *p_start = tok->start; 1361 *p_end = tok->cur; 1362 return NAME; 1363 } 1364 1365 /* Newline */ 1366 if (c == '\n') { 1367 tok->atbol = 1; 1368 if (blankline || tok->level > 0) 1369 goto nextline; 1370 *p_start = tok->start; 1371 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 1372 tok->cont_line = 0; 1373 return NEWLINE; 1374 } 1375 1376 /* Period or number starting with period? */ 1377 if (c == '.') { 1378 c = tok_nextc(tok); 1379 if (isdigit(c)) { 1380 goto fraction; 1381 } 1382 else { 1383 tok_backup(tok, c); 1384 *p_start = tok->start; 1385 *p_end = tok->cur; 1386 return DOT; 1387 } 1388 } 1389 1390 /* Number */ 1391 if (isdigit(c)) { 1392 if (c == '0') { 1393 /* Hex, octal or binary -- maybe. 
*/ 1394 c = tok_nextc(tok); 1395 if (c == '.') 1396 goto fraction; 1397#ifndef WITHOUT_COMPLEX 1398 if (c == 'j' || c == 'J') 1399 goto imaginary; 1400#endif 1401 if (c == 'x' || c == 'X') { 1402 1403 /* Hex */ 1404 c = tok_nextc(tok); 1405 if (!isxdigit(c)) { 1406 tok->done = E_TOKEN; 1407 tok_backup(tok, c); 1408 return ERRORTOKEN; 1409 } 1410 do { 1411 c = tok_nextc(tok); 1412 } while (isxdigit(c)); 1413 } 1414 else if (c == 'o' || c == 'O') { 1415 /* Octal */ 1416 c = tok_nextc(tok); 1417 if (c < '0' || c >= '8') { 1418 tok->done = E_TOKEN; 1419 tok_backup(tok, c); 1420 return ERRORTOKEN; 1421 } 1422 do { 1423 c = tok_nextc(tok); 1424 } while ('0' <= c && c < '8'); 1425 } 1426 else if (c == 'b' || c == 'B') { 1427 /* Binary */ 1428 c = tok_nextc(tok); 1429 if (c != '0' && c != '1') { 1430 tok->done = E_TOKEN; 1431 tok_backup(tok, c); 1432 return ERRORTOKEN; 1433 } 1434 do { 1435 c = tok_nextc(tok); 1436 } while (c == '0' || c == '1'); 1437 } 1438 else { 1439 int found_decimal = 0; 1440 /* Octal; c is first char of it */ 1441 /* There's no 'isoctdigit' macro, sigh */ 1442 while ('0' <= c && c < '8') { 1443 c = tok_nextc(tok); 1444 } 1445 if (isdigit(c)) { 1446 found_decimal = 1; 1447 do { 1448 c = tok_nextc(tok); 1449 } while (isdigit(c)); 1450 } 1451 if (c == '.') 1452 goto fraction; 1453 else if (c == 'e' || c == 'E') 1454 goto exponent; 1455#ifndef WITHOUT_COMPLEX 1456 else if (c == 'j' || c == 'J') 1457 goto imaginary; 1458#endif 1459 else if (found_decimal) { 1460 tok->done = E_TOKEN; 1461 tok_backup(tok, c); 1462 return ERRORTOKEN; 1463 } 1464 } 1465 if (c == 'l' || c == 'L') 1466 c = tok_nextc(tok); 1467 } 1468 else { 1469 /* Decimal */ 1470 do { 1471 c = tok_nextc(tok); 1472 } while (isdigit(c)); 1473 if (c == 'l' || c == 'L') 1474 c = tok_nextc(tok); 1475 else { 1476 /* Accept floating point numbers. 
*/ 1477 if (c == '.') { 1478 fraction: 1479 /* Fraction */ 1480 do { 1481 c = tok_nextc(tok); 1482 } while (isdigit(c)); 1483 } 1484 if (c == 'e' || c == 'E') { 1485 exponent: 1486 /* Exponent part */ 1487 c = tok_nextc(tok); 1488 if (c == '+' || c == '-') 1489 c = tok_nextc(tok); 1490 if (!isdigit(c)) { 1491 tok->done = E_TOKEN; 1492 tok_backup(tok, c); 1493 return ERRORTOKEN; 1494 } 1495 do { 1496 c = tok_nextc(tok); 1497 } while (isdigit(c)); 1498 } 1499#ifndef WITHOUT_COMPLEX 1500 if (c == 'j' || c == 'J') 1501 /* Imaginary part */ 1502 imaginary: 1503 c = tok_nextc(tok); 1504#endif 1505 } 1506 } 1507 tok_backup(tok, c); 1508 *p_start = tok->start; 1509 *p_end = tok->cur; 1510 return NUMBER; 1511 } 1512 1513 letter_quote: 1514 /* String */ 1515 if (c == '\'' || c == '"') { 1516 Py_ssize_t quote2 = tok->cur - tok->start + 1; 1517 int quote = c; 1518 int triple = 0; 1519 int tripcount = 0; 1520 for (;;) { 1521 c = tok_nextc(tok); 1522 if (c == '\n') { 1523 if (!triple) { 1524 tok->done = E_EOLS; 1525 tok_backup(tok, c); 1526 return ERRORTOKEN; 1527 } 1528 tripcount = 0; 1529 tok->cont_line = 1; /* multiline string. 
*/ 1530 } 1531 else if (c == EOF) { 1532 if (triple) 1533 tok->done = E_EOFS; 1534 else 1535 tok->done = E_EOLS; 1536 tok->cur = tok->inp; 1537 return ERRORTOKEN; 1538 } 1539 else if (c == quote) { 1540 tripcount++; 1541 if (tok->cur - tok->start == quote2) { 1542 c = tok_nextc(tok); 1543 if (c == quote) { 1544 triple = 1; 1545 tripcount = 0; 1546 continue; 1547 } 1548 tok_backup(tok, c); 1549 } 1550 if (!triple || tripcount == 3) 1551 break; 1552 } 1553 else if (c == '\\') { 1554 tripcount = 0; 1555 c = tok_nextc(tok); 1556 if (c == EOF) { 1557 tok->done = E_EOLS; 1558 tok->cur = tok->inp; 1559 return ERRORTOKEN; 1560 } 1561 } 1562 else 1563 tripcount = 0; 1564 } 1565 *p_start = tok->start; 1566 *p_end = tok->cur; 1567 return STRING; 1568 } 1569 1570 /* Line continuation */ 1571 if (c == '\\') { 1572 c = tok_nextc(tok); 1573 if (c != '\n') { 1574 tok->done = E_LINECONT; 1575 tok->cur = tok->inp; 1576 return ERRORTOKEN; 1577 } 1578 tok->cont_line = 1; 1579 goto again; /* Read next line */ 1580 } 1581 1582 /* Check for two-character token */ 1583 { 1584 int c2 = tok_nextc(tok); 1585 int token = PyToken_TwoChars(c, c2); 1586#ifndef PGEN 1587 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') { 1588 if (PyErr_WarnExplicit(PyExc_DeprecationWarning, 1589 "<> not supported in 3.x; use !=", 1590 tok->filename, tok->lineno, 1591 NULL, NULL)) { 1592 return ERRORTOKEN; 1593 } 1594 } 1595#endif 1596 if (token != OP) { 1597 int c3 = tok_nextc(tok); 1598 int token3 = PyToken_ThreeChars(c, c2, c3); 1599 if (token3 != OP) { 1600 token = token3; 1601 } else { 1602 tok_backup(tok, c3); 1603 } 1604 *p_start = tok->start; 1605 *p_end = tok->cur; 1606 return token; 1607 } 1608 tok_backup(tok, c2); 1609 } 1610 1611 /* Keep track of parentheses nesting level */ 1612 switch (c) { 1613 case '(': 1614 case '[': 1615 case '{': 1616 tok->level++; 1617 break; 1618 case ')': 1619 case ']': 1620 case '}': 1621 tok->level--; 1622 break; 1623 } 1624 1625 /* Punctuation character */ 1626 
*p_start = tok->start; 1627 *p_end = tok->cur; 1628 return PyToken_OneChar(c); 1629} 1630 1631int 1632PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1633{ 1634 int result = tok_get(tok, p_start, p_end); 1635 if (tok->decoding_erred) { 1636 result = ERRORTOKEN; 1637 tok->done = E_DECODE; 1638 } 1639 return result; 1640} 1641 1642/* This function is only called from parsetok. However, it cannot live 1643 there, as it must be empty for PGEN, and we can check for PGEN only 1644 in this file. */ 1645 1646#if defined(PGEN) || !defined(Py_USING_UNICODE) 1647char* 1648PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) 1649{ 1650 return NULL; 1651} 1652#else 1653#ifdef Py_USING_UNICODE 1654static PyObject * 1655dec_utf8(const char *enc, const char *text, size_t len) { 1656 PyObject *ret = NULL; 1657 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); 1658 if (unicode_text) { 1659 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); 1660 Py_DECREF(unicode_text); 1661 } 1662 if (!ret) { 1663 PyErr_Clear(); 1664 } 1665 return ret; 1666} 1667char * 1668PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) 1669{ 1670 char *text = NULL; 1671 if (tok->encoding) { 1672 /* convert source to original encondig */ 1673 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); 1674 if (lineobj != NULL) { 1675 int linelen = PyString_Size(lineobj); 1676 const char *line = PyString_AsString(lineobj); 1677 text = PyObject_MALLOC(linelen + 1); 1678 if (text != NULL && line != NULL) { 1679 if (linelen) 1680 strncpy(text, line, linelen); 1681 text[linelen] = '\0'; 1682 } 1683 Py_DECREF(lineobj); 1684 1685 /* adjust error offset */ 1686 if (*offset > 1) { 1687 PyObject *offsetobj = dec_utf8(tok->encoding, 1688 tok->buf, *offset-1); 1689 if (offsetobj) { 1690 *offset = PyString_Size(offsetobj) + 1; 1691 Py_DECREF(offsetobj); 1692 } 1693 } 1694 1695 } 1696 } 1697 return text; 1698 1699} 1700#endif /* 
defined(Py_USING_UNICODE) */ 1701#endif 1702 1703 1704#ifdef Py_DEBUG 1705 1706void 1707tok_dump(int type, char *start, char *end) 1708{ 1709 printf("%s", _PyParser_TokenNames[type]); 1710 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1711 printf("(%.*s)", (int)(end - start), start); 1712} 1713 1714#endif 1715