tokenizer.c revision 4ceeeb09d8ff445888b24aa324bc06175d141cb9
/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#include "pydebug.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

/* Printable token names, indexed by token number. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};


/* Ensure that the locale does not interfere with tokenization.
*/ 97 98static int 99ascii_isalpha(int c) 100{ 101 return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); 102} 103 104static int 105ascii_isalnum(int c) 106{ 107 return ascii_isalpha(c) || ('0' <= c && c <= '9'); 108} 109 110 111/* Create and initialize a new tok_state structure */ 112 113static struct tok_state * 114tok_new(void) 115{ 116 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 117 sizeof(struct tok_state)); 118 if (tok == NULL) 119 return NULL; 120 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 121 tok->done = E_OK; 122 tok->fp = NULL; 123 tok->input = NULL; 124 tok->tabsize = TABSIZE; 125 tok->indent = 0; 126 tok->indstack[0] = 0; 127 tok->atbol = 1; 128 tok->pendin = 0; 129 tok->prompt = tok->nextprompt = NULL; 130 tok->lineno = 0; 131 tok->level = 0; 132 tok->filename = NULL; 133 tok->altwarning = 0; 134 tok->alterror = 0; 135 tok->alttabsize = 1; 136 tok->altindstack[0] = 0; 137 tok->decoding_state = 0; 138 tok->decoding_erred = 0; 139 tok->read_coding_spec = 0; 140 tok->encoding = NULL; 141 tok->cont_line = 0; 142#ifndef PGEN 143 tok->decoding_readline = NULL; 144 tok->decoding_buffer = NULL; 145#endif 146 return tok; 147} 148 149static char * 150new_string(const char *s, Py_ssize_t len) 151{ 152 char* result = (char *)PyMem_MALLOC(len + 1); 153 if (result != NULL) { 154 memcpy(result, s, len); 155 result[len] = '\0'; 156 } 157 return result; 158} 159 160#ifdef PGEN 161 162static char * 163decoding_fgets(char *s, int size, struct tok_state *tok) 164{ 165 return fgets(s, size, tok->fp); 166} 167 168static int 169decoding_feof(struct tok_state *tok) 170{ 171 return feof(tok->fp); 172} 173 174static char * 175decode_str(const char *str, int exec_input, struct tok_state *tok) 176{ 177 return new_string(str, strlen(str)); 178} 179 180#else /* PGEN */ 181 182static char * 183error_ret(struct tok_state *tok) /* XXX */ 184{ 185 tok->decoding_erred = 1; 186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 187 
PyMem_FREE(tok->buf); 188 tok->buf = NULL; 189 return NULL; /* as if it were EOF */ 190} 191 192 193static char * 194get_normal_name(char *s) /* for utf-8 and latin-1 */ 195{ 196 char buf[13]; 197 int i; 198 for (i = 0; i < 12; i++) { 199 int c = s[i]; 200 if (c == '\0') 201 break; 202 else if (c == '_') 203 buf[i] = '-'; 204 else 205 buf[i] = tolower(c); 206 } 207 buf[i] = '\0'; 208 if (strcmp(buf, "utf-8") == 0 || 209 strncmp(buf, "utf-8-", 6) == 0) 210 return "utf-8"; 211 else if (strcmp(buf, "latin-1") == 0 || 212 strcmp(buf, "iso-8859-1") == 0 || 213 strcmp(buf, "iso-latin-1") == 0 || 214 strncmp(buf, "latin-1-", 8) == 0 || 215 strncmp(buf, "iso-8859-1-", 11) == 0 || 216 strncmp(buf, "iso-latin-1-", 12) == 0) 217 return "iso-8859-1"; 218 else 219 return s; 220} 221 222/* Return the coding spec in S, or NULL if none is found. */ 223 224static char * 225get_coding_spec(const char *s, Py_ssize_t size) 226{ 227 Py_ssize_t i; 228 /* Coding spec must be in a comment, and that comment must be 229 * the only statement on the source code line. */ 230 for (i = 0; i < size - 6; i++) { 231 if (s[i] == '#') 232 break; 233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 234 return NULL; 235 } 236 for (; i < size - 6; i++) { /* XXX inefficient search */ 237 const char* t = s + i; 238 if (strncmp(t, "coding", 6) == 0) { 239 const char* begin = NULL; 240 t += 6; 241 if (t[0] != ':' && t[0] != '=') 242 continue; 243 do { 244 t++; 245 } while (t[0] == '\x20' || t[0] == '\t'); 246 247 begin = t; 248 while (ascii_isalnum(Py_CHARMASK(t[0])) || 249 t[0] == '-' || t[0] == '_' || t[0] == '.') 250 t++; 251 252 if (begin < t) { 253 char* r = new_string(begin, t - begin); 254 char* q = get_normal_name(r); 255 if (r != q) { 256 PyMem_FREE(r); 257 r = new_string(q, strlen(q)); 258 } 259 return r; 260 } 261 } 262 } 263 return NULL; 264} 265 266/* Check whether the line contains a coding spec. If it does, 267 invoke the set_readline function for the new encoding. 
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                /* For these two encodings the raw bytes can be
                   tokenized directly; no readline wrapper needed. */
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    /* Negative decoding_state: decode via the
                       installed readline from now on. */
                    tok->decoding_state = -1;
                }
                else
                    PyMem_FREE(cs);
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            /* An encoding was already set (from a BOM); the declared
               spec must agree with it. */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.
 */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = 1;    /* assume a raw read until proven otherwise */
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM (EF BB BF); push bytes back if it isn't. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = -1;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record the encoding. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
    return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        /* Case 1 above: pull a fresh line from the decoder. */
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;     /* case 3: overflow is already UTF-8 bytes */
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line longer than S: stash the remainder for the next call. */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure.
 */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    /* Only the first two lines may carry a coding declaration. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://www.python.org/peps/pep-0263.html for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}

static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            /* Peek one line ahead so EOF can be detected; the peeked
               line stays in tok->decoding_buffer for fp_readl (case 2
               in the fp_readl comment). */
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC. */
*/ 602 603#ifdef Py_USING_UNICODE 604static PyObject * 605translate_into_utf8(const char* str, const char* enc) { 606 PyObject *utf8; 607 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 608 if (buf == NULL) 609 return NULL; 610 utf8 = PyUnicode_AsUTF8String(buf); 611 Py_DECREF(buf); 612 return utf8; 613} 614#endif 615 616 617static char * 618translate_newlines(const char *s, int exec_input, struct tok_state *tok) { 619 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length; 620 char *buf, *current; 621 char c = '\0'; 622 buf = PyMem_MALLOC(needed_length); 623 if (buf == NULL) { 624 tok->done = E_NOMEM; 625 return NULL; 626 } 627 for (current = buf; *s; s++, current++) { 628 c = *s; 629 if (skip_next_lf) { 630 skip_next_lf = 0; 631 if (c == '\n') { 632 c = *++s; 633 if (!c) 634 break; 635 } 636 } 637 if (c == '\r') { 638 skip_next_lf = 1; 639 c = '\n'; 640 } 641 *current = c; 642 } 643 /* If this is exec input, add a newline to the end of the string if 644 there isn't one already. */ 645 if (exec_input && c != '\n') { 646 *current = '\n'; 647 current++; 648 } 649 *current = '\0'; 650 final_length = current - buf + 1; 651 if (final_length < needed_length && final_length) 652 /* should never fail */ 653 buf = PyMem_REALLOC(buf, final_length); 654 return buf; 655} 656 657/* Decode a byte string STR for use as the buffer of TOK. 658 Look for encoding declarations inside STR, and record them 659 inside TOK. 

static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A BOM selected an encoding; re-encode the buffer as UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    /* Find the ends of the first two lines (coding declarations may
       only appear there). */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A coding spec selected an encoding; re-encode as UTF-8. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    /* The returned pointer aliases utf8's internal buffer; keep the
       object alive in tok->decoding_buffer. */
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

#endif /* PGEN */

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, exec_input, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}


/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;      /* ps1/ps2: primary and continuation prompts */
    tok->nextprompt = ps2;
    return tok;
}


/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    /* For string input tok->buf aliases tok->input; only free it for
       file input (see error_ret). */
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}

#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive stdin line (*inp) from sys.stdin's encoding
   to UTF-8, replacing *inp in place.  Returns 0 on success or harmless
   fallback, -1 on error (tok->done is set). */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: advance inp to the end of the next line. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input: read one line via PyOS_Readline. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                /* A token straddles the line break (e.g. a continuation
                   line or triple-quoted string): grow the buffer and
                   append, then re-anchor all pointers into it. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* Fresh line: the readline result becomes the buffer. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* Non-interactive file input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                /* A token is in progress; keep its buffer and append. */
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                           (int)(tok->end - tok->inp),
                           tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                    */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}


/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':           return LPAR;
    case ')':           return RPAR;
    case '[':           return LSQB;
    case ']':           return RSQB;
    case ':':           return COLON;
    case ',':           return COMMA;
    case ';':           return SEMI;
    case '+':           return PLUS;
    case '-':           return MINUS;
    case '*':           return STAR;
    case '/':           return SLASH;
    case '|':           return VBAR;
    case '&':           return AMPER;
    case '<':           return LESS;
    case '>':           return GREATER;
    case '=':           return EQUAL;
    case '.':           return DOT;
    case '%':           return PERCENT;
    case '`':           return BACKQUOTE;
    case '{':           return LBRACE;
    case '}':           return RBRACE;
    case '^':           return CIRCUMFLEX;
    case '~':           return
 TILDE;
    case '@':           return AT;
    default:            return OP;
    }
}


/* Return the token for a two-character operator, or OP if the pair is
   not a recognized operator. */
int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':               return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':               return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':               return NOTEQUAL;
        case '=':               return LESSEQUAL;
        case '<':               return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':               return GREATEREQUAL;
        case '>':               return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':               return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':               return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*':               return DOUBLESTAR;
        case '=':               return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/':               return DOUBLESLASH;
        case '=':               return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':               return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':               return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':               return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':               return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}

/* Return the token for a three-character operator, or OP if the triple
   is not a recognized operator. */
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    }
    return OP;
}

/* Report inconsistent use of tabs and spaces in indentation.
   Returns 1 (with tok->done = E_TABSPACE) when -tt made it a hard
   error, 0 when only a one-time warning is emitted. */
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;    /* warn only once per file */
    }
    return 0;
}

/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        /* altcol tracks indentation under the alternate tab size to
           detect tab/space inconsistency (see indenterror). */
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group.
           */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            /* Compare the new column against the top of the indent
               stack; the alt* stack re-measures with a different tab
               size to catch tab/space ambiguity. */
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    /* New column matches no outer indentation level */
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents (one token per call;
       pendin counts how many are still owed) */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",                       /* Emacs */
            ":tabstop=",                        /* vim, full form */
            ":ts=",                             /* vim, abbreviated form */
            "set tabsize=",                     /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf)); /* keep room for '\0' */
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        /* Discard the rest of the comment line */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (ascii_isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (ascii_isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    /* NOTE(review): digit tests below use the locale ctype isdigit()/
       isxdigit(), unlike the ascii_* helpers defined above for
       identifiers -- presumably safe since digits don't vary by
       locale, but worth confirming. */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    /* "0x" with no digits is an error */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    /* 8 or 9 after a leading 0: only legal if this
                       turns out to be a float/imaginary literal */
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
    fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
    exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    if (!isdigit(c)) {
                        /* exponent marker with no digits */
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
    imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

 letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2: offset from tok->start at which a quote char means
           the literal closed immediately (possible empty string or
           start of a triple quote) */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0; /* consecutive closing-quote chars seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* escape: the next char never closes the string */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* Try to extend to a three-character token */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
*p_start = tok->start; 1655 *p_end = tok->cur; 1656 return PyToken_OneChar(c); 1657} 1658 1659int 1660PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1661{ 1662 int result = tok_get(tok, p_start, p_end); 1663 if (tok->decoding_erred) { 1664 result = ERRORTOKEN; 1665 tok->done = E_DECODE; 1666 } 1667 return result; 1668} 1669 1670/* This function is only called from parsetok. However, it cannot live 1671 there, as it must be empty for PGEN, and we can check for PGEN only 1672 in this file. */ 1673 1674#if defined(PGEN) || !defined(Py_USING_UNICODE) 1675char* 1676PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) 1677{ 1678 return NULL; 1679} 1680#else 1681#ifdef Py_USING_UNICODE 1682static PyObject * 1683dec_utf8(const char *enc, const char *text, size_t len) { 1684 PyObject *ret = NULL; 1685 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); 1686 if (unicode_text) { 1687 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); 1688 Py_DECREF(unicode_text); 1689 } 1690 if (!ret) { 1691 PyErr_Clear(); 1692 } 1693 return ret; 1694} 1695char * 1696PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) 1697{ 1698 char *text = NULL; 1699 if (tok->encoding) { 1700 /* convert source to original encondig */ 1701 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); 1702 if (lineobj != NULL) { 1703 int linelen = PyString_Size(lineobj); 1704 const char *line = PyString_AsString(lineobj); 1705 text = PyObject_MALLOC(linelen + 1); 1706 if (text != NULL && line != NULL) { 1707 if (linelen) 1708 strncpy(text, line, linelen); 1709 text[linelen] = '\0'; 1710 } 1711 Py_DECREF(lineobj); 1712 1713 /* adjust error offset */ 1714 if (*offset > 1) { 1715 PyObject *offsetobj = dec_utf8(tok->encoding, 1716 tok->buf, *offset-1); 1717 if (offsetobj) { 1718 *offset = PyString_Size(offsetobj) + 1; 1719 Py_DECREF(offsetobj); 1720 } 1721 } 1722 1723 } 1724 } 1725 return text; 1726 1727} 1728#endif /* 
defined(Py_USING_UNICODE) */ 1729#endif 1730 1731 1732#ifdef Py_DEBUG 1733 1734void 1735tok_dump(int type, char *start, char *end) 1736{ 1737 printf("%s", _PyParser_TokenNames[type]); 1738 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1739 printf("(%.*s)", (int)(end - start), start); 1740} 1741 1742#endif 1743