1 2/* Tokenizer implementation */ 3 4#include "Python.h" 5#include "pgenheaders.h" 6 7#include <ctype.h> 8#include <assert.h> 9 10#include "tokenizer.h" 11#include "errcode.h" 12 13#ifndef PGEN 14#include "unicodeobject.h" 15#include "bytesobject.h" 16#include "fileobject.h" 17#include "codecs.h" 18#include "abstract.h" 19#endif /* PGEN */ 20 21#define is_potential_identifier_start(c) (\ 22 (c >= 'a' && c <= 'z')\ 23 || (c >= 'A' && c <= 'Z')\ 24 || c == '_'\ 25 || (c >= 128)) 26 27#define is_potential_identifier_char(c) (\ 28 (c >= 'a' && c <= 'z')\ 29 || (c >= 'A' && c <= 'Z')\ 30 || (c >= '0' && c <= '9')\ 31 || c == '_'\ 32 || (c >= 128)) 33 34extern char *PyOS_Readline(FILE *, FILE *, const char *); 35/* Return malloc'ed string including trailing \n; 36 empty malloc'ed string for EOF; 37 NULL if interrupted */ 38 39/* Don't ever change this -- it would break the portability of Python code */ 40#define TABSIZE 8 41 42/* Forward */ 43static struct tok_state *tok_new(void); 44static int tok_nextc(struct tok_state *tok); 45static void tok_backup(struct tok_state *tok, int c); 46 47 48/* Token names */ 49 50const char *_PyParser_TokenNames[] = { 51 "ENDMARKER", 52 "NAME", 53 "NUMBER", 54 "STRING", 55 "NEWLINE", 56 "INDENT", 57 "DEDENT", 58 "LPAR", 59 "RPAR", 60 "LSQB", 61 "RSQB", 62 "COLON", 63 "COMMA", 64 "SEMI", 65 "PLUS", 66 "MINUS", 67 "STAR", 68 "SLASH", 69 "VBAR", 70 "AMPER", 71 "LESS", 72 "GREATER", 73 "EQUAL", 74 "DOT", 75 "PERCENT", 76 "LBRACE", 77 "RBRACE", 78 "EQEQUAL", 79 "NOTEQUAL", 80 "LESSEQUAL", 81 "GREATEREQUAL", 82 "TILDE", 83 "CIRCUMFLEX", 84 "LEFTSHIFT", 85 "RIGHTSHIFT", 86 "DOUBLESTAR", 87 "PLUSEQUAL", 88 "MINEQUAL", 89 "STAREQUAL", 90 "SLASHEQUAL", 91 "PERCENTEQUAL", 92 "AMPEREQUAL", 93 "VBAREQUAL", 94 "CIRCUMFLEXEQUAL", 95 "LEFTSHIFTEQUAL", 96 "RIGHTSHIFTEQUAL", 97 "DOUBLESTAREQUAL", 98 "DOUBLESLASH", 99 "DOUBLESLASHEQUAL", 100 "AT", 101 "ATEQUAL", 102 "RARROW", 103 "ELLIPSIS", 104 /* This table must match the #defines in token.h! 
*/ 105 "OP", 106 "AWAIT", 107 "ASYNC", 108 "<ERRORTOKEN>", 109 "<N_TOKENS>" 110}; 111 112 113/* Create and initialize a new tok_state structure */ 114 115static struct tok_state * 116tok_new(void) 117{ 118 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 119 sizeof(struct tok_state)); 120 if (tok == NULL) 121 return NULL; 122 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 123 tok->done = E_OK; 124 tok->fp = NULL; 125 tok->input = NULL; 126 tok->tabsize = TABSIZE; 127 tok->indent = 0; 128 tok->indstack[0] = 0; 129 130 tok->atbol = 1; 131 tok->pendin = 0; 132 tok->prompt = tok->nextprompt = NULL; 133 tok->lineno = 0; 134 tok->level = 0; 135 tok->altwarning = 1; 136 tok->alterror = 1; 137 tok->alttabsize = 1; 138 tok->altindstack[0] = 0; 139 tok->decoding_state = STATE_INIT; 140 tok->decoding_erred = 0; 141 tok->read_coding_spec = 0; 142 tok->enc = NULL; 143 tok->encoding = NULL; 144 tok->cont_line = 0; 145#ifndef PGEN 146 tok->filename = NULL; 147 tok->decoding_readline = NULL; 148 tok->decoding_buffer = NULL; 149#endif 150 151 tok->async_def = 0; 152 tok->async_def_indent = 0; 153 tok->async_def_nl = 0; 154 155 return tok; 156} 157 158static char * 159new_string(const char *s, Py_ssize_t len, struct tok_state *tok) 160{ 161 char* result = (char *)PyMem_MALLOC(len + 1); 162 if (!result) { 163 tok->done = E_NOMEM; 164 return NULL; 165 } 166 memcpy(result, s, len); 167 result[len] = '\0'; 168 return result; 169} 170 171#ifdef PGEN 172 173static char * 174decoding_fgets(char *s, int size, struct tok_state *tok) 175{ 176 return fgets(s, size, tok->fp); 177} 178 179static int 180decoding_feof(struct tok_state *tok) 181{ 182 return feof(tok->fp); 183} 184 185static char * 186decode_str(const char *str, int exec_input, struct tok_state *tok) 187{ 188 return new_string(str, strlen(str), tok); 189} 190 191#else /* PGEN */ 192 193static char * 194error_ret(struct tok_state *tok) /* XXX */ 195{ 196 tok->decoding_erred = 1; 197 if (tok->fp != NULL && 
tok->buf != NULL) /* see PyTokenizer_Free */ 198 PyMem_FREE(tok->buf); 199 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 200 tok->done = E_DECODE; 201 return NULL; /* as if it were EOF */ 202} 203 204 205static const char * 206get_normal_name(const char *s) /* for utf-8 and latin-1 */ 207{ 208 char buf[13]; 209 int i; 210 for (i = 0; i < 12; i++) { 211 int c = s[i]; 212 if (c == '\0') 213 break; 214 else if (c == '_') 215 buf[i] = '-'; 216 else 217 buf[i] = tolower(c); 218 } 219 buf[i] = '\0'; 220 if (strcmp(buf, "utf-8") == 0 || 221 strncmp(buf, "utf-8-", 6) == 0) 222 return "utf-8"; 223 else if (strcmp(buf, "latin-1") == 0 || 224 strcmp(buf, "iso-8859-1") == 0 || 225 strcmp(buf, "iso-latin-1") == 0 || 226 strncmp(buf, "latin-1-", 8) == 0 || 227 strncmp(buf, "iso-8859-1-", 11) == 0 || 228 strncmp(buf, "iso-latin-1-", 12) == 0) 229 return "iso-8859-1"; 230 else 231 return s; 232} 233 234/* Return the coding spec in S, or NULL if none is found. */ 235 236static int 237get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) 238{ 239 Py_ssize_t i; 240 *spec = NULL; 241 /* Coding spec must be in a comment, and that comment must be 242 * the only statement on the source code line. 
*/ 243 for (i = 0; i < size - 6; i++) { 244 if (s[i] == '#') 245 break; 246 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 247 return 1; 248 } 249 for (; i < size - 6; i++) { /* XXX inefficient search */ 250 const char* t = s + i; 251 if (strncmp(t, "coding", 6) == 0) { 252 const char* begin = NULL; 253 t += 6; 254 if (t[0] != ':' && t[0] != '=') 255 continue; 256 do { 257 t++; 258 } while (t[0] == '\x20' || t[0] == '\t'); 259 260 begin = t; 261 while (Py_ISALNUM(t[0]) || 262 t[0] == '-' || t[0] == '_' || t[0] == '.') 263 t++; 264 265 if (begin < t) { 266 char* r = new_string(begin, t - begin, tok); 267 const char* q; 268 if (!r) 269 return 0; 270 q = get_normal_name(r); 271 if (r != q) { 272 PyMem_FREE(r); 273 r = new_string(q, strlen(q), tok); 274 if (!r) 275 return 0; 276 } 277 *spec = r; 278 break; 279 } 280 } 281 } 282 return 1; 283} 284 285/* Check whether the line contains a coding spec. If it does, 286 invoke the set_readline function for the new encoding. 287 This function receives the tok_state and the new encoding. 288 Return 1 on success, 0 on failure. */ 289 290static int 291check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, 292 int set_readline(struct tok_state *, const char *)) 293{ 294 char *cs; 295 int r = 1; 296 297 if (tok->cont_line) { 298 /* It's a continuation line, so it can't be a coding spec. */ 299 tok->read_coding_spec = 1; 300 return 1; 301 } 302 if (!get_coding_spec(line, &cs, size, tok)) 303 return 0; 304 if (!cs) { 305 Py_ssize_t i; 306 for (i = 0; i < size; i++) { 307 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') 308 break; 309 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { 310 /* Stop checking coding spec after a line containing 311 * anything except a comment. 
*/ 312 tok->read_coding_spec = 1; 313 break; 314 } 315 } 316 return 1; 317 } 318 tok->read_coding_spec = 1; 319 if (tok->encoding == NULL) { 320 assert(tok->decoding_state == STATE_RAW); 321 if (strcmp(cs, "utf-8") == 0) { 322 tok->encoding = cs; 323 } else { 324 r = set_readline(tok, cs); 325 if (r) { 326 tok->encoding = cs; 327 tok->decoding_state = STATE_NORMAL; 328 } 329 else { 330 PyErr_Format(PyExc_SyntaxError, 331 "encoding problem: %s", cs); 332 PyMem_FREE(cs); 333 } 334 } 335 } else { /* then, compare cs with BOM */ 336 r = (strcmp(tok->encoding, cs) == 0); 337 if (!r) 338 PyErr_Format(PyExc_SyntaxError, 339 "encoding problem: %s with BOM", cs); 340 PyMem_FREE(cs); 341 } 342 return r; 343} 344 345/* See whether the file starts with a BOM. If it does, 346 invoke the set_readline function with the new encoding. 347 Return 1 on success, 0 on failure. */ 348 349static int 350check_bom(int get_char(struct tok_state *), 351 void unget_char(int, struct tok_state *), 352 int set_readline(struct tok_state *, const char *), 353 struct tok_state *tok) 354{ 355 int ch1, ch2, ch3; 356 ch1 = get_char(tok); 357 tok->decoding_state = STATE_RAW; 358 if (ch1 == EOF) { 359 return 1; 360 } else if (ch1 == 0xEF) { 361 ch2 = get_char(tok); 362 if (ch2 != 0xBB) { 363 unget_char(ch2, tok); 364 unget_char(ch1, tok); 365 return 1; 366 } 367 ch3 = get_char(tok); 368 if (ch3 != 0xBF) { 369 unget_char(ch3, tok); 370 unget_char(ch2, tok); 371 unget_char(ch1, tok); 372 return 1; 373 } 374#if 0 375 /* Disable support for UTF-16 BOMs until a decision 376 is made whether this needs to be supported. 
*/ 377 } else if (ch1 == 0xFE) { 378 ch2 = get_char(tok); 379 if (ch2 != 0xFF) { 380 unget_char(ch2, tok); 381 unget_char(ch1, tok); 382 return 1; 383 } 384 if (!set_readline(tok, "utf-16-be")) 385 return 0; 386 tok->decoding_state = STATE_NORMAL; 387 } else if (ch1 == 0xFF) { 388 ch2 = get_char(tok); 389 if (ch2 != 0xFE) { 390 unget_char(ch2, tok); 391 unget_char(ch1, tok); 392 return 1; 393 } 394 if (!set_readline(tok, "utf-16-le")) 395 return 0; 396 tok->decoding_state = STATE_NORMAL; 397#endif 398 } else { 399 unget_char(ch1, tok); 400 return 1; 401 } 402 if (tok->encoding != NULL) 403 PyMem_FREE(tok->encoding); 404 tok->encoding = new_string("utf-8", 5, tok); 405 if (!tok->encoding) 406 return 0; 407 /* No need to set_readline: input is already utf-8 */ 408 return 1; 409} 410 411/* Read a line of text from TOK into S, using the stream in TOK. 412 Return NULL on failure, else S. 413 414 On entry, tok->decoding_buffer will be one of: 415 1) NULL: need to call tok->decoding_readline to get a new line 416 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 417 stored the result in tok->decoding_buffer 418 3) PyByteArrayObject *: previous call to fp_readl did not have enough room 419 (in the s buffer) to copy entire contents of the line read 420 by tok->decoding_readline. tok->decoding_buffer has the overflow. 421 In this case, fp_readl is called in a loop (with an expanded buffer) 422 until the buffer ends with a '\n' (or until the end of the file is 423 reached): see tok_nextc and its calls to decoding_fgets. 
424*/ 425 426static char * 427fp_readl(char *s, int size, struct tok_state *tok) 428{ 429 PyObject* bufobj; 430 const char *buf; 431 Py_ssize_t buflen; 432 433 /* Ask for one less byte so we can terminate it */ 434 assert(size > 0); 435 size--; 436 437 if (tok->decoding_buffer) { 438 bufobj = tok->decoding_buffer; 439 Py_INCREF(bufobj); 440 } 441 else 442 { 443 bufobj = PyObject_CallObject(tok->decoding_readline, NULL); 444 if (bufobj == NULL) 445 goto error; 446 } 447 if (PyUnicode_CheckExact(bufobj)) 448 { 449 buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen); 450 if (buf == NULL) { 451 goto error; 452 } 453 } 454 else 455 { 456 buf = PyByteArray_AsString(bufobj); 457 if (buf == NULL) { 458 goto error; 459 } 460 buflen = PyByteArray_GET_SIZE(bufobj); 461 } 462 463 Py_XDECREF(tok->decoding_buffer); 464 if (buflen > size) { 465 /* Too many chars, the rest goes into tok->decoding_buffer */ 466 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size, 467 buflen-size); 468 if (tok->decoding_buffer == NULL) 469 goto error; 470 buflen = size; 471 } 472 else 473 tok->decoding_buffer = NULL; 474 475 memcpy(s, buf, buflen); 476 s[buflen] = '\0'; 477 if (buflen == 0) /* EOF */ 478 s = NULL; 479 Py_DECREF(bufobj); 480 return s; 481 482error: 483 Py_XDECREF(bufobj); 484 return error_ret(tok); 485} 486 487/* Set the readline function for TOK to a StreamReader's 488 readline function. The StreamReader is named ENC. 489 490 This function is called from check_bom and check_coding_spec. 491 492 ENC is usually identical to the future value of tok->encoding, 493 except for the (currently unsupported) case of UTF-16. 494 495 Return 1 on success, 0 on failure. 
*/ 496 497static int 498fp_setreadl(struct tok_state *tok, const char* enc) 499{ 500 PyObject *readline, *io, *stream; 501 _Py_IDENTIFIER(open); 502 _Py_IDENTIFIER(readline); 503 int fd; 504 long pos; 505 506 fd = fileno(tok->fp); 507 /* Due to buffering the file offset for fd can be different from the file 508 * position of tok->fp. If tok->fp was opened in text mode on Windows, 509 * its file position counts CRLF as one char and can't be directly mapped 510 * to the file offset for fd. Instead we step back one byte and read to 511 * the end of line.*/ 512 pos = ftell(tok->fp); 513 if (pos == -1 || 514 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { 515 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); 516 return 0; 517 } 518 519 io = PyImport_ImportModuleNoBlock("io"); 520 if (io == NULL) 521 return 0; 522 523 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO", 524 fd, "r", -1, enc, Py_None, Py_None, Py_False); 525 Py_DECREF(io); 526 if (stream == NULL) 527 return 0; 528 529 readline = _PyObject_GetAttrId(stream, &PyId_readline); 530 Py_DECREF(stream); 531 if (readline == NULL) 532 return 0; 533 Py_XSETREF(tok->decoding_readline, readline); 534 535 if (pos > 0) { 536 PyObject *bufobj = PyObject_CallObject(readline, NULL); 537 if (bufobj == NULL) 538 return 0; 539 Py_DECREF(bufobj); 540 } 541 542 return 1; 543} 544 545/* Fetch the next byte from TOK. */ 546 547static int fp_getc(struct tok_state *tok) { 548 return getc(tok->fp); 549} 550 551/* Unfetch the last byte back into TOK. */ 552 553static void fp_ungetc(int c, struct tok_state *tok) { 554 ungetc(c, tok->fp); 555} 556 557/* Check whether the characters at s start a valid 558 UTF-8 sequence. Return the number of characters forming 559 the sequence if yes, 0 if not. 
 */
/* NOTE: this validates UTF-8 lead/continuation byte structure only; it
   does not reject overlong encodings or surrogates. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;   /* number of continuation bytes still required */
    int length;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    length = expected + 1;
    /* Every continuation byte must be in [0x80, 0xC0). */
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
            return 0;
    return length;
}

/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* STATE_INIT: we have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* Only the first two lines of a file may carry a coding spec. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.
 */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see http://python.org/dev/peps/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}

/* Return non-zero when TOK's input is exhausted.  For decoded (codec)
   input this may call the readline function and stash the result in
   tok->decoding_buffer for the next fp_readl. */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state != STATE_NORMAL) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
*/ 691 692static PyObject * 693translate_into_utf8(const char* str, const char* enc) { 694 PyObject *utf8; 695 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 696 if (buf == NULL) 697 return NULL; 698 utf8 = PyUnicode_AsUTF8String(buf); 699 Py_DECREF(buf); 700 return utf8; 701} 702 703 704static char * 705translate_newlines(const char *s, int exec_input, struct tok_state *tok) { 706 int skip_next_lf = 0; 707 size_t needed_length = strlen(s) + 2, final_length; 708 char *buf, *current; 709 char c = '\0'; 710 buf = PyMem_MALLOC(needed_length); 711 if (buf == NULL) { 712 tok->done = E_NOMEM; 713 return NULL; 714 } 715 for (current = buf; *s; s++, current++) { 716 c = *s; 717 if (skip_next_lf) { 718 skip_next_lf = 0; 719 if (c == '\n') { 720 c = *++s; 721 if (!c) 722 break; 723 } 724 } 725 if (c == '\r') { 726 skip_next_lf = 1; 727 c = '\n'; 728 } 729 *current = c; 730 } 731 /* If this is exec input, add a newline to the end of the string if 732 there isn't one already. */ 733 if (exec_input && c != '\n') { 734 *current = '\n'; 735 current++; 736 } 737 *current = '\0'; 738 final_length = current - buf + 1; 739 if (final_length < needed_length && final_length) 740 /* should never fail */ 741 buf = PyMem_REALLOC(buf, final_length); 742 return buf; 743} 744 745/* Decode a byte string STR for use as the buffer of TOK. 746 Look for encoding declarations inside STR, and record them 747 inside TOK. 
*/ 748 749static const char * 750decode_str(const char *input, int single, struct tok_state *tok) 751{ 752 PyObject* utf8 = NULL; 753 const char *str; 754 const char *s; 755 const char *newl[2] = {NULL, NULL}; 756 int lineno = 0; 757 tok->input = str = translate_newlines(input, single, tok); 758 if (str == NULL) 759 return NULL; 760 tok->enc = NULL; 761 tok->str = str; 762 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 763 return error_ret(tok); 764 str = tok->str; /* string after BOM if any */ 765 assert(str); 766 if (tok->enc != NULL) { 767 utf8 = translate_into_utf8(str, tok->enc); 768 if (utf8 == NULL) 769 return error_ret(tok); 770 str = PyBytes_AsString(utf8); 771 } 772 for (s = str;; s++) { 773 if (*s == '\0') break; 774 else if (*s == '\n') { 775 assert(lineno < 2); 776 newl[lineno] = s; 777 lineno++; 778 if (lineno == 2) break; 779 } 780 } 781 tok->enc = NULL; 782 /* need to check line 1 and 2 separately since check_coding_spec 783 assumes a single line as input */ 784 if (newl[0]) { 785 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) 786 return error_ret(tok); 787 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) { 788 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], 789 tok, buf_setreadl)) 790 return error_ret(tok); 791 } 792 } 793 if (tok->enc != NULL) { 794 assert(utf8 == NULL); 795 utf8 = translate_into_utf8(str, tok->enc); 796 if (utf8 == NULL) 797 return error_ret(tok); 798 str = PyBytes_AS_STRING(utf8); 799 } 800 assert(tok->decoding_buffer == NULL); 801 tok->decoding_buffer = utf8; /* CAUTION */ 802 return str; 803} 804 805#endif /* PGEN */ 806 807/* Set up tokenizer for string */ 808 809struct tok_state * 810PyTokenizer_FromString(const char *str, int exec_input) 811{ 812 struct tok_state *tok = tok_new(); 813 if (tok == NULL) 814 return NULL; 815 str = decode_str(str, exec_input, tok); 816 if (str == NULL) { 817 PyTokenizer_Free(tok); 818 return NULL; 819 } 820 821 /* XXX: constify members. 
*/ 822 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 823 return tok; 824} 825 826struct tok_state * 827PyTokenizer_FromUTF8(const char *str, int exec_input) 828{ 829 struct tok_state *tok = tok_new(); 830 if (tok == NULL) 831 return NULL; 832#ifndef PGEN 833 tok->input = str = translate_newlines(str, exec_input, tok); 834#endif 835 if (str == NULL) { 836 PyTokenizer_Free(tok); 837 return NULL; 838 } 839 tok->decoding_state = STATE_RAW; 840 tok->read_coding_spec = 1; 841 tok->enc = NULL; 842 tok->str = str; 843 tok->encoding = (char *)PyMem_MALLOC(6); 844 if (!tok->encoding) { 845 PyTokenizer_Free(tok); 846 return NULL; 847 } 848 strcpy(tok->encoding, "utf-8"); 849 850 /* XXX: constify members. */ 851 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 852 return tok; 853} 854 855/* Set up tokenizer for file */ 856 857struct tok_state * 858PyTokenizer_FromFile(FILE *fp, const char* enc, 859 const char *ps1, const char *ps2) 860{ 861 struct tok_state *tok = tok_new(); 862 if (tok == NULL) 863 return NULL; 864 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { 865 PyTokenizer_Free(tok); 866 return NULL; 867 } 868 tok->cur = tok->inp = tok->buf; 869 tok->end = tok->buf + BUFSIZ; 870 tok->fp = fp; 871 tok->prompt = ps1; 872 tok->nextprompt = ps2; 873 if (enc != NULL) { 874 /* Must copy encoding declaration since it 875 gets copied into the parse tree. 
*/ 876 tok->encoding = PyMem_MALLOC(strlen(enc)+1); 877 if (!tok->encoding) { 878 PyTokenizer_Free(tok); 879 return NULL; 880 } 881 strcpy(tok->encoding, enc); 882 tok->decoding_state = STATE_NORMAL; 883 } 884 return tok; 885} 886 887 888/* Free a tok_state structure */ 889 890void 891PyTokenizer_Free(struct tok_state *tok) 892{ 893 if (tok->encoding != NULL) 894 PyMem_FREE(tok->encoding); 895#ifndef PGEN 896 Py_XDECREF(tok->decoding_readline); 897 Py_XDECREF(tok->decoding_buffer); 898 Py_XDECREF(tok->filename); 899#endif 900 if (tok->fp != NULL && tok->buf != NULL) 901 PyMem_FREE(tok->buf); 902 if (tok->input) 903 PyMem_FREE((char *)tok->input); 904 PyMem_FREE(tok); 905} 906 907/* Get next char, updating state; error code goes into tok->done */ 908 909static int 910tok_nextc(struct tok_state *tok) 911{ 912 for (;;) { 913 if (tok->cur != tok->inp) { 914 return Py_CHARMASK(*tok->cur++); /* Fast path */ 915 } 916 if (tok->done != E_OK) 917 return EOF; 918 if (tok->fp == NULL) { 919 char *end = strchr(tok->inp, '\n'); 920 if (end != NULL) 921 end++; 922 else { 923 end = strchr(tok->inp, '\0'); 924 if (end == tok->inp) { 925 tok->done = E_EOF; 926 return EOF; 927 } 928 } 929 if (tok->start == NULL) 930 tok->buf = tok->cur; 931 tok->line_start = tok->cur; 932 tok->lineno++; 933 tok->inp = end; 934 return Py_CHARMASK(*tok->cur++); 935 } 936 if (tok->prompt != NULL) { 937 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); 938#ifndef PGEN 939 if (newtok != NULL) { 940 char *translated = translate_newlines(newtok, 0, tok); 941 PyMem_FREE(newtok); 942 if (translated == NULL) 943 return EOF; 944 newtok = translated; 945 } 946 if (tok->encoding && newtok && *newtok) { 947 /* Recode to UTF-8 */ 948 Py_ssize_t buflen; 949 const char* buf; 950 PyObject *u = translate_into_utf8(newtok, tok->encoding); 951 PyMem_FREE(newtok); 952 if (!u) { 953 tok->done = E_DECODE; 954 return EOF; 955 } 956 buflen = PyBytes_GET_SIZE(u); 957 buf = PyBytes_AS_STRING(u); 958 newtok = 
PyMem_MALLOC(buflen+1); 959 strcpy(newtok, buf); 960 Py_DECREF(u); 961 } 962#endif 963 if (tok->nextprompt != NULL) 964 tok->prompt = tok->nextprompt; 965 if (newtok == NULL) 966 tok->done = E_INTR; 967 else if (*newtok == '\0') { 968 PyMem_FREE(newtok); 969 tok->done = E_EOF; 970 } 971 else if (tok->start != NULL) { 972 size_t start = tok->start - tok->buf; 973 size_t oldlen = tok->cur - tok->buf; 974 size_t newlen = oldlen + strlen(newtok); 975 char *buf = tok->buf; 976 buf = (char *)PyMem_REALLOC(buf, newlen+1); 977 tok->lineno++; 978 if (buf == NULL) { 979 PyMem_FREE(tok->buf); 980 tok->buf = NULL; 981 PyMem_FREE(newtok); 982 tok->done = E_NOMEM; 983 return EOF; 984 } 985 tok->buf = buf; 986 tok->cur = tok->buf + oldlen; 987 tok->line_start = tok->cur; 988 strcpy(tok->buf + oldlen, newtok); 989 PyMem_FREE(newtok); 990 tok->inp = tok->buf + newlen; 991 tok->end = tok->inp + 1; 992 tok->start = tok->buf + start; 993 } 994 else { 995 tok->lineno++; 996 if (tok->buf != NULL) 997 PyMem_FREE(tok->buf); 998 tok->buf = newtok; 999 tok->cur = tok->buf; 1000 tok->line_start = tok->buf; 1001 tok->inp = strchr(tok->buf, '\0'); 1002 tok->end = tok->inp + 1; 1003 } 1004 } 1005 else { 1006 int done = 0; 1007 Py_ssize_t cur = 0; 1008 char *pt; 1009 if (tok->start == NULL) { 1010 if (tok->buf == NULL) { 1011 tok->buf = (char *) 1012 PyMem_MALLOC(BUFSIZ); 1013 if (tok->buf == NULL) { 1014 tok->done = E_NOMEM; 1015 return EOF; 1016 } 1017 tok->end = tok->buf + BUFSIZ; 1018 } 1019 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 1020 tok) == NULL) { 1021 if (!tok->decoding_erred) 1022 tok->done = E_EOF; 1023 done = 1; 1024 } 1025 else { 1026 tok->done = E_OK; 1027 tok->inp = strchr(tok->buf, '\0'); 1028 done = tok->inp == tok->buf || tok->inp[-1] == '\n'; 1029 } 1030 } 1031 else { 1032 cur = tok->cur - tok->buf; 1033 if (decoding_feof(tok)) { 1034 tok->done = E_EOF; 1035 done = 1; 1036 } 1037 else 1038 tok->done = E_OK; 1039 } 1040 tok->lineno++; 1041 /* Read until '\n' or 
EOF */ 1042 while (!done) { 1043 Py_ssize_t curstart = tok->start == NULL ? -1 : 1044 tok->start - tok->buf; 1045 Py_ssize_t curvalid = tok->inp - tok->buf; 1046 Py_ssize_t newsize = curvalid + BUFSIZ; 1047 char *newbuf = tok->buf; 1048 newbuf = (char *)PyMem_REALLOC(newbuf, 1049 newsize); 1050 if (newbuf == NULL) { 1051 tok->done = E_NOMEM; 1052 tok->cur = tok->inp; 1053 return EOF; 1054 } 1055 tok->buf = newbuf; 1056 tok->cur = tok->buf + cur; 1057 tok->line_start = tok->cur; 1058 tok->inp = tok->buf + curvalid; 1059 tok->end = tok->buf + newsize; 1060 tok->start = curstart < 0 ? NULL : 1061 tok->buf + curstart; 1062 if (decoding_fgets(tok->inp, 1063 (int)(tok->end - tok->inp), 1064 tok) == NULL) { 1065 /* Break out early on decoding 1066 errors, as tok->buf will be NULL 1067 */ 1068 if (tok->decoding_erred) 1069 return EOF; 1070 /* Last line does not end in \n, 1071 fake one */ 1072 strcpy(tok->inp, "\n"); 1073 } 1074 tok->inp = strchr(tok->inp, '\0'); 1075 done = tok->inp[-1] == '\n'; 1076 } 1077 if (tok->buf != NULL) { 1078 tok->cur = tok->buf + cur; 1079 tok->line_start = tok->cur; 1080 /* replace "\r\n" with "\n" */ 1081 /* For Mac leave the \r, giving a syntax error */ 1082 pt = tok->inp - 2; 1083 if (pt >= tok->buf && *pt == '\r') { 1084 *pt++ = '\n'; 1085 *pt = '\0'; 1086 tok->inp = pt; 1087 } 1088 } 1089 } 1090 if (tok->done != E_OK) { 1091 if (tok->prompt != NULL) 1092 PySys_WriteStderr("\n"); 1093 tok->cur = tok->inp; 1094 return EOF; 1095 } 1096 } 1097 /*NOTREACHED*/ 1098} 1099 1100 1101/* Back-up one character */ 1102 1103static void 1104tok_backup(struct tok_state *tok, int c) 1105{ 1106 if (c != EOF) { 1107 if (--tok->cur < tok->buf) 1108 Py_FatalError("tok_backup: beginning of buffer"); 1109 if (*tok->cur != c) 1110 *tok->cur = c; 1111 } 1112} 1113 1114 1115/* Return the token corresponding to a single character */ 1116 1117int 1118PyToken_OneChar(int c) 1119{ 1120 switch (c) { 1121 case '(': return LPAR; 1122 case ')': return RPAR; 1123 case '[': 
return LSQB; 1124 case ']': return RSQB; 1125 case ':': return COLON; 1126 case ',': return COMMA; 1127 case ';': return SEMI; 1128 case '+': return PLUS; 1129 case '-': return MINUS; 1130 case '*': return STAR; 1131 case '/': return SLASH; 1132 case '|': return VBAR; 1133 case '&': return AMPER; 1134 case '<': return LESS; 1135 case '>': return GREATER; 1136 case '=': return EQUAL; 1137 case '.': return DOT; 1138 case '%': return PERCENT; 1139 case '{': return LBRACE; 1140 case '}': return RBRACE; 1141 case '^': return CIRCUMFLEX; 1142 case '~': return TILDE; 1143 case '@': return AT; 1144 default: return OP; 1145 } 1146} 1147 1148 1149int 1150PyToken_TwoChars(int c1, int c2) 1151{ 1152 switch (c1) { 1153 case '=': 1154 switch (c2) { 1155 case '=': return EQEQUAL; 1156 } 1157 break; 1158 case '!': 1159 switch (c2) { 1160 case '=': return NOTEQUAL; 1161 } 1162 break; 1163 case '<': 1164 switch (c2) { 1165 case '>': return NOTEQUAL; 1166 case '=': return LESSEQUAL; 1167 case '<': return LEFTSHIFT; 1168 } 1169 break; 1170 case '>': 1171 switch (c2) { 1172 case '=': return GREATEREQUAL; 1173 case '>': return RIGHTSHIFT; 1174 } 1175 break; 1176 case '+': 1177 switch (c2) { 1178 case '=': return PLUSEQUAL; 1179 } 1180 break; 1181 case '-': 1182 switch (c2) { 1183 case '=': return MINEQUAL; 1184 case '>': return RARROW; 1185 } 1186 break; 1187 case '*': 1188 switch (c2) { 1189 case '*': return DOUBLESTAR; 1190 case '=': return STAREQUAL; 1191 } 1192 break; 1193 case '/': 1194 switch (c2) { 1195 case '/': return DOUBLESLASH; 1196 case '=': return SLASHEQUAL; 1197 } 1198 break; 1199 case '|': 1200 switch (c2) { 1201 case '=': return VBAREQUAL; 1202 } 1203 break; 1204 case '%': 1205 switch (c2) { 1206 case '=': return PERCENTEQUAL; 1207 } 1208 break; 1209 case '&': 1210 switch (c2) { 1211 case '=': return AMPEREQUAL; 1212 } 1213 break; 1214 case '^': 1215 switch (c2) { 1216 case '=': return CIRCUMFLEXEQUAL; 1217 } 1218 break; 1219 case '@': 1220 switch (c2) { 1221 case '=': 
return ATEQUAL; 1222 } 1223 break; 1224 } 1225 return OP; 1226} 1227 1228int 1229PyToken_ThreeChars(int c1, int c2, int c3) 1230{ 1231 switch (c1) { 1232 case '<': 1233 switch (c2) { 1234 case '<': 1235 switch (c3) { 1236 case '=': 1237 return LEFTSHIFTEQUAL; 1238 } 1239 break; 1240 } 1241 break; 1242 case '>': 1243 switch (c2) { 1244 case '>': 1245 switch (c3) { 1246 case '=': 1247 return RIGHTSHIFTEQUAL; 1248 } 1249 break; 1250 } 1251 break; 1252 case '*': 1253 switch (c2) { 1254 case '*': 1255 switch (c3) { 1256 case '=': 1257 return DOUBLESTAREQUAL; 1258 } 1259 break; 1260 } 1261 break; 1262 case '/': 1263 switch (c2) { 1264 case '/': 1265 switch (c3) { 1266 case '=': 1267 return DOUBLESLASHEQUAL; 1268 } 1269 break; 1270 } 1271 break; 1272 case '.': 1273 switch (c2) { 1274 case '.': 1275 switch (c3) { 1276 case '.': 1277 return ELLIPSIS; 1278 } 1279 break; 1280 } 1281 break; 1282 } 1283 return OP; 1284} 1285 1286static int 1287indenterror(struct tok_state *tok) 1288{ 1289 if (tok->alterror) { 1290 tok->done = E_TABSPACE; 1291 tok->cur = tok->inp; 1292 return 1; 1293 } 1294 if (tok->altwarning) { 1295#ifdef PGEN 1296 PySys_WriteStderr("inconsistent use of tabs and spaces " 1297 "in indentation\n"); 1298#else 1299 PySys_FormatStderr("%U: inconsistent use of tabs and spaces " 1300 "in indentation\n", tok->filename); 1301#endif 1302 tok->altwarning = 0; 1303 } 1304 return 0; 1305} 1306 1307#ifdef PGEN 1308#define verify_identifier(tok) 1 1309#else 1310/* Verify that the identifier follows PEP 3131. 1311 All identifier strings are guaranteed to be "ready" unicode objects. 
1312 */ 1313static int 1314verify_identifier(struct tok_state *tok) 1315{ 1316 PyObject *s; 1317 int result; 1318 if (tok->decoding_erred) 1319 return 0; 1320 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); 1321 if (s == NULL || PyUnicode_READY(s) == -1) { 1322 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { 1323 PyErr_Clear(); 1324 tok->done = E_IDENTIFIER; 1325 } else { 1326 tok->done = E_ERROR; 1327 } 1328 return 0; 1329 } 1330 result = PyUnicode_IsIdentifier(s); 1331 Py_DECREF(s); 1332 if (result == 0) 1333 tok->done = E_IDENTIFIER; 1334 return result; 1335} 1336#endif 1337 1338static int 1339tok_decimal_tail(struct tok_state *tok) 1340{ 1341 int c; 1342 1343 while (1) { 1344 do { 1345 c = tok_nextc(tok); 1346 } while (isdigit(c)); 1347 if (c != '_') { 1348 break; 1349 } 1350 c = tok_nextc(tok); 1351 if (!isdigit(c)) { 1352 tok->done = E_TOKEN; 1353 tok_backup(tok, c); 1354 return 0; 1355 } 1356 } 1357 return c; 1358} 1359 1360/* Get next token, after space stripping etc. 
   On success, the bounds of the token text are returned through *p_start
   and *p_end (pointers into the tokenizer's input buffer).  The return
   value is a token code from token.h; ERRORTOKEN is returned with
   tok->done set on failure. */

static int
tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                /* col uses the standard tab size; altcol uses the
                   alternate tab size to detect tab/space mixing. */
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is significant only outside brackets (level == 0). */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok)) {
                        return ERRORTOKEN;
                    }
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok)) {
                        return ERRORTOKEN;
                    }
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok)) {
                        return ERRORTOKEN;
                    }
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    if (tok->async_def
        && !blankline
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is not greater than the level
           where the async function was defined, i.e. we have left
           the function body. */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment */
    if (c == '#') {
        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                /* The prefix letters were the start of a string literal,
                   not an identifier. */
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers must be validated against PEP 3131. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }
        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5) {
            /* Current token length is 5. */
            if (tok->async_def) {
                /* We're inside an 'async def' function. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token (recursively) on a throwaway
                   copy of the tokenizer state. */

                struct tok_state ahead_tok;
                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
                int ahead_tok_kind;

                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning 'async' NAME token, we return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            /* Newlines inside brackets or on blank lines are not tokens. */
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    /* Each underscore must be followed by a hex digit. */
                    if (!isxdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers.  Note that the labels
                   below are jump targets from the '0'-prefixed and
                   '.'-prefixed paths above. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all; push back the 'e'
                           and end the NUMBER before it. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF) {
                if (quote_size == 3) {
                    tok->done = E_EOFS;
                }
                else {
                    tok->done = E_EOLS;
                }
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (quote_size == 1 && c == '\n') {
                /* Unterminated single-quoted string on this line. */
                tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);  /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}

/* Public entry point: fetch the next token via tok_get() and fold any
   pending decoding error into an ERRORTOKEN / E_DECODE result. */
int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.
1909 1910 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the 1911 encoding in the first or second line of the file (in which case the encoding 1912 should be assumed to be UTF-8). 1913 1914 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed 1915 by the caller. */ 1916 1917char * 1918PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) 1919{ 1920 struct tok_state *tok; 1921 FILE *fp; 1922 char *p_start =NULL , *p_end =NULL , *encoding = NULL; 1923 1924#ifndef PGEN 1925 fd = _Py_dup(fd); 1926#else 1927 fd = dup(fd); 1928#endif 1929 if (fd < 0) { 1930 return NULL; 1931 } 1932 1933 fp = fdopen(fd, "r"); 1934 if (fp == NULL) { 1935 return NULL; 1936 } 1937 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL); 1938 if (tok == NULL) { 1939 fclose(fp); 1940 return NULL; 1941 } 1942#ifndef PGEN 1943 if (filename != NULL) { 1944 Py_INCREF(filename); 1945 tok->filename = filename; 1946 } 1947 else { 1948 tok->filename = PyUnicode_FromString("<string>"); 1949 if (tok->filename == NULL) { 1950 fclose(fp); 1951 PyTokenizer_Free(tok); 1952 return encoding; 1953 } 1954 } 1955#endif 1956 while (tok->lineno < 2 && tok->done == E_OK) { 1957 PyTokenizer_Get(tok, &p_start, &p_end); 1958 } 1959 fclose(fp); 1960 if (tok->encoding) { 1961 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1); 1962 if (encoding) 1963 strcpy(encoding, tok->encoding); 1964 } 1965 PyTokenizer_Free(tok); 1966 return encoding; 1967} 1968 1969char * 1970PyTokenizer_FindEncoding(int fd) 1971{ 1972 return PyTokenizer_FindEncodingFilename(fd, NULL); 1973} 1974 1975#ifdef Py_DEBUG 1976 1977void 1978tok_dump(int type, char *start, char *end) 1979{ 1980 printf("%s", _PyParser_TokenNames[type]); 1981 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1982 printf("(%.*s)", (int)(end - start), start); 1983} 1984 1985#endif 1986