/* tokenizer.c -- CPython tokenizer implementation
   (upstream revision 9ff19b54346d39d15cdcf75e9d66ab46ea6064d6) */
1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "stringobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#include "pydebug.h" 20#endif /* PGEN */ 21 22extern char *PyOS_Readline(FILE *, FILE *, char *); 23/* Return malloc'ed string including trailing \n; 24 empty malloc'ed string for EOF; 25 NULL if interrupted */ 26 27/* Don't ever change this -- it would break the portability of Python code */ 28#define TABSIZE 8 29 30/* Convert a possibly signed character to a nonnegative int */ 31/* XXX This assumes characters are 8 bits wide */ 32#ifdef __CHAR_UNSIGNED__ 33#define Py_CHARMASK(c) (c) 34#else 35#define Py_CHARMASK(c) ((c) & 0xff) 36#endif 37 38/* Forward */ 39static struct tok_state *tok_new(void); 40static int tok_nextc(struct tok_state *tok); 41static void tok_backup(struct tok_state *tok, int c); 42 43/* Token names */ 44 45char *_PyParser_TokenNames[] = { 46 "ENDMARKER", 47 "NAME", 48 "NUMBER", 49 "STRING", 50 "NEWLINE", 51 "INDENT", 52 "DEDENT", 53 "LPAR", 54 "RPAR", 55 "LSQB", 56 "RSQB", 57 "COLON", 58 "COMMA", 59 "SEMI", 60 "PLUS", 61 "MINUS", 62 "STAR", 63 "SLASH", 64 "VBAR", 65 "AMPER", 66 "LESS", 67 "GREATER", 68 "EQUAL", 69 "DOT", 70 "PERCENT", 71 "BACKQUOTE", 72 "LBRACE", 73 "RBRACE", 74 "EQEQUAL", 75 "NOTEQUAL", 76 "LESSEQUAL", 77 "GREATEREQUAL", 78 "TILDE", 79 "CIRCUMFLEX", 80 "LEFTSHIFT", 81 "RIGHTSHIFT", 82 "DOUBLESTAR", 83 "PLUSEQUAL", 84 "MINEQUAL", 85 "STAREQUAL", 86 "SLASHEQUAL", 87 "PERCENTEQUAL", 88 "AMPEREQUAL", 89 "VBAREQUAL", 90 "CIRCUMFLEXEQUAL", 91 "LEFTSHIFTEQUAL", 92 "RIGHTSHIFTEQUAL", 93 "DOUBLESTAREQUAL", 94 "DOUBLESLASH", 95 "DOUBLESLASHEQUAL", 96 "AT", 97 /* This table must match the #defines in token.h! 
*/ 98 "OP", 99 "<ERRORTOKEN>", 100 "<N_TOKENS>" 101}; 102 103 104/* Create and initialize a new tok_state structure */ 105 106static struct tok_state * 107tok_new(void) 108{ 109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 110 sizeof(struct tok_state)); 111 if (tok == NULL) 112 return NULL; 113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 114 tok->done = E_OK; 115 tok->fp = NULL; 116 tok->tabsize = TABSIZE; 117 tok->indent = 0; 118 tok->indstack[0] = 0; 119 tok->atbol = 1; 120 tok->pendin = 0; 121 tok->prompt = tok->nextprompt = NULL; 122 tok->lineno = 0; 123 tok->level = 0; 124 tok->filename = NULL; 125 tok->altwarning = 0; 126 tok->alterror = 0; 127 tok->alttabsize = 1; 128 tok->altindstack[0] = 0; 129 tok->decoding_state = 0; 130 tok->decoding_erred = 0; 131 tok->read_coding_spec = 0; 132 tok->encoding = NULL; 133 tok->cont_line = 0; 134#ifndef PGEN 135 tok->decoding_readline = NULL; 136 tok->decoding_buffer = NULL; 137#endif 138 return tok; 139} 140 141#ifdef PGEN 142 143static char * 144decoding_fgets(char *s, int size, struct tok_state *tok) 145{ 146 return fgets(s, size, tok->fp); 147} 148 149static int 150decoding_feof(struct tok_state *tok) 151{ 152 return feof(tok->fp); 153} 154 155static const char * 156decode_str(const char *str, struct tok_state *tok) 157{ 158 return str; 159} 160 161#else /* PGEN */ 162 163static char * 164error_ret(struct tok_state *tok) /* XXX */ 165{ 166 tok->decoding_erred = 1; 167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 168 PyMem_FREE(tok->buf); 169 tok->buf = NULL; 170 return NULL; /* as if it were EOF */ 171} 172 173static char * 174new_string(const char *s, Py_ssize_t len) 175{ 176 char* result = (char *)PyMem_MALLOC(len + 1); 177 if (result != NULL) { 178 memcpy(result, s, len); 179 result[len] = '\0'; 180 } 181 return result; 182} 183 184static char * 185get_normal_name(char *s) /* for utf-8 and latin-1 */ 186{ 187 char buf[13]; 188 int i; 189 for (i = 0; i < 12; 
i++) { 190 int c = s[i]; 191 if (c == '\0') break; 192 else if (c == '_') buf[i] = '-'; 193 else buf[i] = tolower(c); 194 } 195 buf[i] = '\0'; 196 if (strcmp(buf, "utf-8") == 0 || 197 strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; 198 else if (strcmp(buf, "latin-1") == 0 || 199 strcmp(buf, "iso-8859-1") == 0 || 200 strcmp(buf, "iso-latin-1") == 0 || 201 strncmp(buf, "latin-1-", 8) == 0 || 202 strncmp(buf, "iso-8859-1-", 11) == 0 || 203 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; 204 else return s; 205} 206 207/* Return the coding spec in S, or NULL if none is found. */ 208 209static char * 210get_coding_spec(const char *s, Py_ssize_t size) 211{ 212 Py_ssize_t i; 213 /* Coding spec must be in a comment, and that comment must be 214 * the only statement on the source code line. */ 215 for (i = 0; i < size - 6; i++) { 216 if (s[i] == '#') 217 break; 218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 219 return NULL; 220 } 221 for (; i < size - 6; i++) { /* XXX inefficient search */ 222 const char* t = s + i; 223 if (strncmp(t, "coding", 6) == 0) { 224 const char* begin = NULL; 225 t += 6; 226 if (t[0] != ':' && t[0] != '=') 227 continue; 228 do { 229 t++; 230 } while (t[0] == '\x20' || t[0] == '\t'); 231 232 begin = t; 233 while (isalnum(Py_CHARMASK(t[0])) || 234 t[0] == '-' || t[0] == '_' || t[0] == '.') 235 t++; 236 237 if (begin < t) { 238 char* r = new_string(begin, t - begin); 239 char* q = get_normal_name(r); 240 if (r != q) { 241 PyMem_FREE(r); 242 r = new_string(q, strlen(q)); 243 } 244 return r; 245 } 246 } 247 } 248 return NULL; 249} 250 251/* Check whether the line contains a coding spec. If it does, 252 invoke the set_readline function for the new encoding. 253 This function receives the tok_state and the new encoding. 254 Return 1 on success, 0 on failure. 
*/

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                /* These two encodings need no codec: the raw bytes
                   are usable as-is, so just remember the name. */
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;       /* ownership of cs moves to tok */
                    tok->decoding_state = -1; /* switch to codec reads */
                }
                else
                    PyMem_FREE(cs);
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            /* An encoding was already set from the BOM; the declared
               coding spec must agree with it. */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.
   get_char/unget_char abstract the input source (stdio or string),
   so this works for both file and string tokenizers. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch = get_char(tok);
    tok->decoding_state = 1;    /* default: raw reads */
    if (ch == EOF) {
        return 1;
    } else if (ch == 0xEF) {
        /* Possible UTF-8 BOM: EF BB BF. */
        ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
        ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported.  */
    } else if (ch == 0xFE) {
        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
        if (!set_readline(tok, "utf-16-be")) return 0;
        tok->decoding_state = -1;
    } else if (ch == 0xFF) {
        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
        if (!set_readline(tok, "utf-16-le")) return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
    return 1;
  NON_BOM:
    /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
    unget_char(0xFF, tok);      /* XXX this will cause a syntax error */
    return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* Consume the buffered object; case 3 above (overflow from a
           previous call) is already a UTF-8 byte string. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line doesn't fit in S: stash the overflow for the next call. */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0) return NULL; /* EOF */
    return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    /* tok takes ownership of the new reference; released in
       PyTokenizer_Free. */
    tok->decoding_readline = readline;
    return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Read a line of input from TOK.  Determine encoding
   if necessary (BOM first, then a PEP 263 coding spec on the
   first two lines).  Returns S on success, NULL on EOF/error. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        sprintf(buf,
            "Non-ASCII character '\\x%.2x' "
            "in file %.200s on line %i, "
            "but no encoding declared; "
            "see http://www.python.org/peps/pep-0263.html for details",
            badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}

/* Return true at end of input.  For codec-based reads this may have to
   call readline once and buffer the result for the next fp_readl. */

static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
*/

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  On success returns the (possibly re-encoded) buffer;
   on failure returns NULL via error_ret. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* BOM present: decode from that encoding into UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    /* Find the end of the first two lines; a coding spec may only
       appear there. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A coding spec was found (and there was no BOM, or utf8
           would already be set): re-decode the whole buffer. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL) {
            PyErr_Format(PyExc_SyntaxError,
                         "unknown encoding: %s", tok->enc);
            return error_ret(tok);
        }
        str = PyString_AsString(utf8);
    }
#endif
    /* The returned pointer aliases utf8's internal buffer, so the
       object is parked on tok to keep it alive. */
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

#endif /* PGEN */

/* Set up tokenizer for string.
   Returns a new tok_state (caller frees with PyTokenizer_Free),
   or NULL on error. */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}


/* Set up tokenizer for file.  PS1/PS2 are the interactive prompts
   (NULL for non-interactive input).  Caller frees with
   PyTokenizer_Free; the FILE* remains owned by the caller. */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}


/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    /* The buffer is only owned when reading from a file; for string
       input tok->buf aliases the caller's data (see
       PyTokenizer_FromString). */
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    PyMem_FREE(tok);
}

#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive input line (*inp, from sys.stdin's
   encoding) into UTF-8, replacing *inp in place.  Returns 0 on
   success or harmless fallback, -1 on memory error (tok->done set). */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    /* Only translate when we are actually reading the real stdin. */
    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc);
    PyErr_Clear();
    return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done.
   Three input modes, chosen by tok state:
     - string input (tok->fp == NULL): advance through the buffer;
     - interactive (tok->prompt != NULL): PyOS_Readline per line;
     - file input: decoding_fgets with a growable buffer. */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: expose the next line of the buffer. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line to the
                   existing buffer, preserving start/cur as offsets
                   across the realloc. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line replaces the
                   buffer outright; tok takes ownership of newtok. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf; /* NOTE(review): duplicate of the
                                               assignment two lines up */
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* File input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                                   tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF, growing the buffer and
               re-anchoring start/cur offsets across each realloc. */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                                   (int)(tok->end - tok->inp),
                                   tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}


/* Back-up one character (only meaningful for the character most
   recently returned by tok_nextc; EOF is a no-op). */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: begin of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':   return LPAR;
    case ')':   return RPAR;
    case '[':   return LSQB;
    case ']':   return RSQB;
    case ':':   return COLON;
    case ',':   return COMMA;
    case ';':   return SEMI;
    case '+':   return PLUS;
    case '-':   return MINUS;
    case '*':   return STAR;
    case '/':   return SLASH;
    case '|':   return VBAR;
    case '&':   return AMPER;
    case '<':   return LESS;
    case '>':   return GREATER;
    case '=':   return EQUAL;
    case '.':   return DOT;
    case '%':   return PERCENT;
    case '`':   return BACKQUOTE;
    case '{':   return LBRACE;
    case '}':   return RBRACE;
    case '^':   return CIRCUMFLEX;
    case '~':   return TILDE;
    case '@':   return AT;
    default:    return OP;
    }
}


/* Return the token corresponding to a two-character operator,
   or OP if the pair is not a recognized operator. */

int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':   return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':   return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':   return NOTEQUAL;
        case '=':   return LESSEQUAL;
        case '<':   return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':   return GREATEREQUAL;
        case '>':   return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':   return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':   return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*':   return DOUBLESTAR;
        case '=':   return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/':   return DOUBLESLASH;
        case '=':   return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':   return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':   return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':   return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':   return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}

/* Return the token corresponding to a three-character operator,
   or OP if the triple is not a recognized operator. */

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2)
{ 1071 case '>': 1072 switch (c3) { 1073 case '=': 1074 return RIGHTSHIFTEQUAL; 1075 } 1076 break; 1077 } 1078 break; 1079 case '*': 1080 switch (c2) { 1081 case '*': 1082 switch (c3) { 1083 case '=': 1084 return DOUBLESTAREQUAL; 1085 } 1086 break; 1087 } 1088 break; 1089 case '/': 1090 switch (c2) { 1091 case '/': 1092 switch (c3) { 1093 case '=': 1094 return DOUBLESLASHEQUAL; 1095 } 1096 break; 1097 } 1098 break; 1099 } 1100 return OP; 1101} 1102 1103static int 1104indenterror(struct tok_state *tok) 1105{ 1106 if (tok->alterror) { 1107 tok->done = E_TABSPACE; 1108 tok->cur = tok->inp; 1109 return 1; 1110 } 1111 if (tok->altwarning) { 1112 PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 1113 "in indentation\n", tok->filename); 1114 tok->altwarning = 0; 1115 } 1116 return 0; 1117} 1118 1119 1120/* Get next token, after space stripping etc. */ 1121 1122static int 1123tok_get(register struct tok_state *tok, char **p_start, char **p_end) 1124{ 1125 register int c; 1126 int blankline; 1127 1128 *p_start = *p_end = NULL; 1129 nextline: 1130 tok->start = NULL; 1131 blankline = 0; 1132 1133 /* Get indentation level */ 1134 if (tok->atbol) { 1135 register int col = 0; 1136 register int altcol = 0; 1137 tok->atbol = 0; 1138 for (;;) { 1139 c = tok_nextc(tok); 1140 if (c == ' ') 1141 col++, altcol++; 1142 else if (c == '\t') { 1143 col = (col/tok->tabsize + 1) * tok->tabsize; 1144 altcol = (altcol/tok->alttabsize + 1) 1145 * tok->alttabsize; 1146 } 1147 else if (c == '\014') /* Control-L (formfeed) */ 1148 col = altcol = 0; /* For Emacs users */ 1149 else 1150 break; 1151 } 1152 tok_backup(tok, c); 1153 if (c == '#' || c == '\n') { 1154 /* Lines with only whitespace and/or comments 1155 shouldn't affect the indentation and are 1156 not passed to the parser as NEWLINE tokens, 1157 except *totally* empty lines in interactive 1158 mode, which signal the end of a command group. 
*/ 1159 if (col == 0 && c == '\n' && tok->prompt != NULL) 1160 blankline = 0; /* Let it through */ 1161 else 1162 blankline = 1; /* Ignore completely */ 1163 /* We can't jump back right here since we still 1164 may need to skip to the end of a comment */ 1165 } 1166 if (!blankline && tok->level == 0) { 1167 if (col == tok->indstack[tok->indent]) { 1168 /* No change */ 1169 if (altcol != tok->altindstack[tok->indent]) { 1170 if (indenterror(tok)) 1171 return ERRORTOKEN; 1172 } 1173 } 1174 else if (col > tok->indstack[tok->indent]) { 1175 /* Indent -- always one */ 1176 if (tok->indent+1 >= MAXINDENT) { 1177 tok->done = E_TOODEEP; 1178 tok->cur = tok->inp; 1179 return ERRORTOKEN; 1180 } 1181 if (altcol <= tok->altindstack[tok->indent]) { 1182 if (indenterror(tok)) 1183 return ERRORTOKEN; 1184 } 1185 tok->pendin++; 1186 tok->indstack[++tok->indent] = col; 1187 tok->altindstack[tok->indent] = altcol; 1188 } 1189 else /* col < tok->indstack[tok->indent] */ { 1190 /* Dedent -- any number, must be consistent */ 1191 while (tok->indent > 0 && 1192 col < tok->indstack[tok->indent]) { 1193 tok->pendin--; 1194 tok->indent--; 1195 } 1196 if (col != tok->indstack[tok->indent]) { 1197 tok->done = E_DEDENT; 1198 tok->cur = tok->inp; 1199 return ERRORTOKEN; 1200 } 1201 if (altcol != tok->altindstack[tok->indent]) { 1202 if (indenterror(tok)) 1203 return ERRORTOKEN; 1204 } 1205 } 1206 } 1207 } 1208 1209 tok->start = tok->cur; 1210 1211 /* Return pending indents/dedents */ 1212 if (tok->pendin != 0) { 1213 if (tok->pendin < 0) { 1214 tok->pendin++; 1215 return DEDENT; 1216 } 1217 else { 1218 tok->pendin--; 1219 return INDENT; 1220 } 1221 } 1222 1223 again: 1224 tok->start = NULL; 1225 /* Skip spaces */ 1226 do { 1227 c = tok_nextc(tok); 1228 } while (c == ' ' || c == '\t' || c == '\014'); 1229 1230 /* Set start of current token */ 1231 tok->start = tok->cur - 1; 1232 1233 /* Skip comment, while looking for tab-setting magic */ 1234 if (c == '#') { 1235 static char *tabforms[] = { 
"tab-width:",                   /* Emacs */
            ":tabstop=",                    /* vim, full form */
            ":ts=",                         /* vim, abbreviated form */
            "set tabsize=",                 /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        /* Copy the rest of the comment line into cbuf (bounded) so it
           can be scanned for an editor tab-size directive below. */
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        /* Look for any of the known tab-size templates; the integer that
           follows a match (accepted only in 1..40) becomes tok->tabsize. */
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        /* Consume whatever is left of the comment line. */
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        /* A b/B, r/R, or u/U prefix (u/b optionally followed by r/R)
           immediately followed by a quote starts a string literal. */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        /* Plain identifier: letters, digits and underscores. */
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Inside brackets (or on a blank line) a newline is not a token. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    /* "0x" with no digits following is an error. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c > '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    /* Digits 8/9 after a leading 0: only legal if the
                       literal turns out to be a float or imaginary. */
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            /* Optional long suffix. */
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    if (!isdigit(c)) {
                        /* 'e' must be followed by at least one digit. */
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 == (chars consumed so far) + 1; it equals
           tok->cur - tok->start again exactly when one character has been
           read past the opening quote, i.e. a second quote immediately
           follows the first (empty string or triple-quote opener). */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;   /* run of consecutive closing quotes seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    /* Bare newline ends a single-quoted string: error. */
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                /* Single-quoted strings close on one quote, triple-quoted
                   on a run of three. */
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Skip the escaped character. */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            /* Backslash must be the last character on the line. */
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* See whether it extends to a three-character token. */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start =
tok->start; 1572 *p_end = tok->cur; 1573 return PyToken_OneChar(c); 1574} 1575 1576int 1577PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1578{ 1579 int result = tok_get(tok, p_start, p_end); 1580 if (tok->decoding_erred) { 1581 result = ERRORTOKEN; 1582 tok->done = E_DECODE; 1583 } 1584 return result; 1585} 1586 1587/* This function is only called from parsetok. However, it cannot live 1588 there, as it must be empty for PGEN, and we can check for PGEN only 1589 in this file. */ 1590 1591#if defined(PGEN) || !defined(Py_USING_UNICODE) 1592char* 1593PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) 1594{ 1595 return NULL; 1596} 1597#else 1598#ifdef Py_USING_UNICODE 1599static PyObject * 1600dec_utf8(const char *enc, const char *text, size_t len) { 1601 PyObject *ret = NULL; 1602 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); 1603 if (unicode_text) { 1604 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); 1605 Py_DECREF(unicode_text); 1606 } 1607 if (!ret) { 1608 PyErr_Clear(); 1609 } 1610 return ret; 1611} 1612char * 1613PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) 1614{ 1615 char *text = NULL; 1616 if (tok->encoding) { 1617 /* convert source to original encondig */ 1618 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); 1619 if (lineobj != NULL) { 1620 int linelen = PyString_Size(lineobj); 1621 const char *line = PyString_AsString(lineobj); 1622 text = PyObject_MALLOC(linelen + 1); 1623 if (text != NULL && line != NULL) { 1624 if (linelen) 1625 strncpy(text, line, linelen); 1626 text[linelen] = '\0'; 1627 } 1628 Py_DECREF(lineobj); 1629 1630 /* adjust error offset */ 1631 if (*offset > 1) { 1632 PyObject *offsetobj = dec_utf8(tok->encoding, 1633 tok->buf, *offset-1); 1634 if (offsetobj) { 1635 *offset = PyString_Size(offsetobj) + 1; 1636 Py_DECREF(offsetobj); 1637 } 1638 } 1639 1640 } 1641 } 1642 return text; 1643 1644} 1645#endif /* 
defined(Py_USING_UNICODE) */ 1646#endif 1647 1648 1649#ifdef Py_DEBUG 1650 1651void 1652tok_dump(int type, char *start, char *end) 1653{ 1654 printf("%s", _PyParser_TokenNames[type]); 1655 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1656 printf("(%.*s)", (int)(end - start), start); 1657} 1658 1659#endif 1660