unicodeobject.c revision 020340f2841ec2b70b9e09921850d16019d0667e
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "ucnhash.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Limit for the Unicode object free list */ 51 52#define PyUnicode_MAXFREELIST 1024 53 54/* Limit for the Unicode object free list stay alive optimization. 55 56 The implementation will keep allocated Unicode memory intact for 57 all objects on the free list having a size less than this 58 limit. This reduces malloc() overhead for small Unicode objects. 59 60 At worst this will result in PyUnicode_MAXFREELIST * 61 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 62 malloc()-overhead) bytes of unused garbage. 63 64 Setting the limit to 0 effectively turns the feature off. 65 66 Note: This is an experimental feature ! If you get core dumps when 67 using Unicode objects, turn this feature off. 68 69*/ 70 71#define KEEPALIVE_SIZE_LIMIT 9 72 73/* Endianness switches; defaults to little endian */ 74 75#ifdef WORDS_BIGENDIAN 76# define BYTEORDER_IS_BIG_ENDIAN 77#else 78# define BYTEORDER_IS_LITTLE_ENDIAN 79#endif 80 81/* --- Globals ------------------------------------------------------------ 82 83 The globals are initialized by the _PyUnicode_Init() API and should 84 not be used before calling that API. 85 86*/ 87 88 89#ifdef __cplusplus 90extern "C" { 91#endif 92 93/* This dictionary holds all interned unicode strings. Note that references 94 to strings in this dictionary are *not* counted in the string's ob_refcnt. 95 When the interned string reaches a refcnt of 0 the string deallocation 96 function will delete the reference from this dictionary. 97 98 Another way to look at this is that to say that the actual reference 99 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 100*/ 101static PyObject *interned; 102 103/* Free list for Unicode objects */ 104static PyUnicodeObject *free_list; 105static int numfree; 106 107/* The empty Unicode object is shared to improve performance. */ 108static PyUnicodeObject *unicode_empty; 109 110/* Single character Unicode strings in the Latin-1 range are being 111 shared as well. */ 112static PyUnicodeObject *unicode_latin1[256]; 113 114/* Fast detection of the most frequent whitespace characters */ 115const unsigned char _Py_ascii_whitespace[] = { 116 0, 0, 0, 0, 0, 0, 0, 0, 117/* case 0x0009: * CHARACTER TABULATION */ 118/* case 0x000A: * LINE FEED */ 119/* case 0x000B: * LINE TABULATION */ 120/* case 0x000C: * FORM FEED */ 121/* case 0x000D: * CARRIAGE RETURN */ 122 0, 1, 1, 1, 1, 1, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 124/* case 0x001C: * FILE SEPARATOR */ 125/* case 0x001D: * GROUP SEPARATOR */ 126/* case 0x001E: * RECORD SEPARATOR */ 127/* case 0x001F: * UNIT SEPARATOR */ 128 0, 0, 0, 0, 1, 1, 1, 1, 129/* case 0x0020: * SPACE */ 130 1, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 0, 0, 0, 0, 0, 0, 0, 134 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0 143}; 144 145static PyObject * 146unicode_encode_call_errorhandler(const char *errors, 147 PyObject **errorHandler,const char *encoding, const char *reason, 148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 150 151static void 152raise_encode_exception(PyObject **exceptionObject, 153 const char *encoding, 154 const Py_UNICODE *unicode, Py_ssize_t size, 155 Py_ssize_t startpos, Py_ssize_t endpos, 156 const char *reason); 157 158/* Same for linebreaks */ 159static unsigned char ascii_linebreak[] = { 160 0, 0, 0, 0, 0, 0, 0, 0, 161/* 0x000A, * 
LINE FEED */ 162/* 0x000B, * LINE TABULATION */ 163/* 0x000C, * FORM FEED */ 164/* 0x000D, * CARRIAGE RETURN */ 165 0, 0, 1, 1, 1, 1, 0, 0, 166 0, 0, 0, 0, 0, 0, 0, 0, 167/* 0x001C, * FILE SEPARATOR */ 168/* 0x001D, * GROUP SEPARATOR */ 169/* 0x001E, * RECORD SEPARATOR */ 170 0, 0, 0, 0, 1, 1, 1, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0 184}; 185 186 187Py_UNICODE 188PyUnicode_GetMax(void) 189{ 190#ifdef Py_UNICODE_WIDE 191 return 0x10FFFF; 192#else 193 /* This is actually an illegal character, so it should 194 not be passed to unichr. */ 195 return 0xFFFF; 196#endif 197} 198 199/* --- Bloom Filters ----------------------------------------------------- */ 200 201/* stuff to implement simple "bloom filters" for Unicode characters. 202 to keep things simple, we use a single bitmask, using the least 5 203 bits from each unicode characters as the bit index. */ 204 205/* the linebreak mask is set up by Unicode_Init below */ 206 207#if LONG_BIT >= 128 208#define BLOOM_WIDTH 128 209#elif LONG_BIT >= 64 210#define BLOOM_WIDTH 64 211#elif LONG_BIT >= 32 212#define BLOOM_WIDTH 32 213#else 214#error "LONG_BIT is smaller than 32" 215#endif 216 217#define BLOOM_MASK unsigned long 218 219static BLOOM_MASK bloom_linebreak; 220 221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223 224#define BLOOM_LINEBREAK(ch) \ 225 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 227 228Py_LOCAL_INLINE(BLOOM_MASK) 229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230{ 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241} 242 243Py_LOCAL_INLINE(int) 244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 245{ 246 Py_ssize_t i; 247 248 for (i = 0; i < setlen; i++) 249 if (set[i] == chr) 250 return 1; 251 252 return 0; 253} 254 255#define BLOOM_MEMBER(mask, chr, set, setlen) \ 256 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 257 258/* --- Unicode Object ----------------------------------------------------- */ 259 260static int 261unicode_resize(register PyUnicodeObject *unicode, 262 Py_ssize_t length) 263{ 264 void *oldstr; 265 266 /* Shortcut if there's nothing much to do. */ 267 if (unicode->length == length) 268 goto reset; 269 270 /* Resizing shared object (unicode_empty or single character 271 objects) in-place is not allowed. Use PyUnicode_Resize() 272 instead ! */ 273 274 if (unicode == unicode_empty || 275 (unicode->length == 1 && 276 unicode->str[0] < 256U && 277 unicode_latin1[unicode->str[0]] == unicode)) { 278 PyErr_SetString(PyExc_SystemError, 279 "can't resize shared str objects"); 280 return -1; 281 } 282 283 /* We allocate one more byte to make sure the string is Ux0000 terminated. 284 The overallocation is also used by fastsearch, which assumes that it's 285 safe to look at str[length] (without making any assumptions about what 286 it contains). 
*/ 287 288 oldstr = unicode->str; 289 unicode->str = PyObject_REALLOC(unicode->str, 290 sizeof(Py_UNICODE) * (length + 1)); 291 if (!unicode->str) { 292 unicode->str = (Py_UNICODE *)oldstr; 293 PyErr_NoMemory(); 294 return -1; 295 } 296 unicode->str[length] = 0; 297 unicode->length = length; 298 299 reset: 300 /* Reset the object caches */ 301 if (unicode->defenc) { 302 Py_CLEAR(unicode->defenc); 303 } 304 unicode->hash = -1; 305 306 return 0; 307} 308 309/* We allocate one more byte to make sure the string is 310 Ux0000 terminated; some code (e.g. new_identifier) 311 relies on that. 312 313 XXX This allocator could further be enhanced by assuring that the 314 free list never reduces its size below 1. 315 316*/ 317 318static PyUnicodeObject * 319_PyUnicode_New(Py_ssize_t length) 320{ 321 register PyUnicodeObject *unicode; 322 323 /* Optimization for empty strings */ 324 if (length == 0 && unicode_empty != NULL) { 325 Py_INCREF(unicode_empty); 326 return unicode_empty; 327 } 328 329 /* Ensure we won't overflow the size. */ 330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 331 return (PyUnicodeObject *)PyErr_NoMemory(); 332 } 333 334 /* Unicode freelist & memory allocation */ 335 if (free_list) { 336 unicode = free_list; 337 free_list = *(PyUnicodeObject **)unicode; 338 numfree--; 339 if (unicode->str) { 340 /* Keep-Alive optimization: we only upsize the buffer, 341 never downsize it. 
*/ 342 if ((unicode->length < length) && 343 unicode_resize(unicode, length) < 0) { 344 PyObject_DEL(unicode->str); 345 unicode->str = NULL; 346 } 347 } 348 else { 349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 351 } 352 PyObject_INIT(unicode, &PyUnicode_Type); 353 } 354 else { 355 size_t new_size; 356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 357 if (unicode == NULL) 358 return NULL; 359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 361 } 362 363 if (!unicode->str) { 364 PyErr_NoMemory(); 365 goto onError; 366 } 367 /* Initialize the first element to guard against cases where 368 * the caller fails before initializing str -- unicode_resize() 369 * reads str[0], and the Keep-Alive optimization can keep memory 370 * allocated for str alive across a call to unicode_dealloc(unicode). 371 * We don't want unicode_resize to read uninitialized memory in 372 * that case. 
373 */ 374 unicode->str[0] = 0; 375 unicode->str[length] = 0; 376 unicode->length = length; 377 unicode->hash = -1; 378 unicode->state = 0; 379 unicode->defenc = NULL; 380 return unicode; 381 382 onError: 383 /* XXX UNREF/NEWREF interface should be more symmetrical */ 384 _Py_DEC_REFTOTAL; 385 _Py_ForgetReference((PyObject *)unicode); 386 PyObject_Del(unicode); 387 return NULL; 388} 389 390static void 391unicode_dealloc(register PyUnicodeObject *unicode) 392{ 393 switch (PyUnicode_CHECK_INTERNED(unicode)) { 394 case SSTATE_NOT_INTERNED: 395 break; 396 397 case SSTATE_INTERNED_MORTAL: 398 /* revive dead object temporarily for DelItem */ 399 Py_REFCNT(unicode) = 3; 400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 401 Py_FatalError( 402 "deletion of interned string failed"); 403 break; 404 405 case SSTATE_INTERNED_IMMORTAL: 406 Py_FatalError("Immortal interned string died."); 407 408 default: 409 Py_FatalError("Inconsistent interned string state."); 410 } 411 412 if (PyUnicode_CheckExact(unicode) && 413 numfree < PyUnicode_MAXFREELIST) { 414 /* Keep-Alive optimization */ 415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 416 PyObject_DEL(unicode->str); 417 unicode->str = NULL; 418 unicode->length = 0; 419 } 420 if (unicode->defenc) { 421 Py_CLEAR(unicode->defenc); 422 } 423 /* Add to free list */ 424 *(PyUnicodeObject **)unicode = free_list; 425 free_list = unicode; 426 numfree++; 427 } 428 else { 429 PyObject_DEL(unicode->str); 430 Py_XDECREF(unicode->defenc); 431 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 432 } 433} 434 435static int 436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 437{ 438 register PyUnicodeObject *v; 439 440 /* Argument checks */ 441 if (unicode == NULL) { 442 PyErr_BadInternalCall(); 443 return -1; 444 } 445 v = *unicode; 446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 447 PyErr_BadInternalCall(); 448 return -1; 449 } 450 451 /* Resizing unicode_empty and single character objects is 
not 452 possible since these are being shared. We simply return a fresh 453 copy with the same Unicode content. */ 454 if (v->length != length && 455 (v == unicode_empty || v->length == 1)) { 456 PyUnicodeObject *w = _PyUnicode_New(length); 457 if (w == NULL) 458 return -1; 459 Py_UNICODE_COPY(w->str, v->str, 460 length < v->length ? length : v->length); 461 Py_DECREF(*unicode); 462 *unicode = w; 463 return 0; 464 } 465 466 /* Note that we don't have to modify *unicode for unshared Unicode 467 objects, since we can modify them in-place. */ 468 return unicode_resize(v, length); 469} 470 471int 472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 473{ 474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 475} 476 477PyObject * 478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 479{ 480 PyUnicodeObject *unicode; 481 482 /* If the Unicode data is known at construction time, we can apply 483 some optimizations which share commonly used objects. */ 484 if (u != NULL) { 485 486 /* Optimization for empty strings */ 487 if (size == 0 && unicode_empty != NULL) { 488 Py_INCREF(unicode_empty); 489 return (PyObject *)unicode_empty; 490 } 491 492 /* Single character Unicode objects in the Latin-1 range are 493 shared when using this constructor */ 494 if (size == 1 && *u < 256) { 495 unicode = unicode_latin1[*u]; 496 if (!unicode) { 497 unicode = _PyUnicode_New(1); 498 if (!unicode) 499 return NULL; 500 unicode->str[0] = *u; 501 unicode_latin1[*u] = unicode; 502 } 503 Py_INCREF(unicode); 504 return (PyObject *)unicode; 505 } 506 } 507 508 unicode = _PyUnicode_New(size); 509 if (!unicode) 510 return NULL; 511 512 /* Copy the Unicode data into the new object */ 513 if (u != NULL) 514 Py_UNICODE_COPY(unicode->str, u, size); 515 516 return (PyObject *)unicode; 517} 518 519PyObject * 520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 521{ 522 PyUnicodeObject *unicode; 523 524 if (size < 0) { 525 PyErr_SetString(PyExc_SystemError, 526 
"Negative size passed to PyUnicode_FromStringAndSize"); 527 return NULL; 528 } 529 530 /* If the Unicode data is known at construction time, we can apply 531 some optimizations which share commonly used objects. 532 Also, this means the input must be UTF-8, so fall back to the 533 UTF-8 decoder at the end. */ 534 if (u != NULL) { 535 536 /* Optimization for empty strings */ 537 if (size == 0 && unicode_empty != NULL) { 538 Py_INCREF(unicode_empty); 539 return (PyObject *)unicode_empty; 540 } 541 542 /* Single characters are shared when using this constructor. 543 Restrict to ASCII, since the input must be UTF-8. */ 544 if (size == 1 && Py_CHARMASK(*u) < 128) { 545 unicode = unicode_latin1[Py_CHARMASK(*u)]; 546 if (!unicode) { 547 unicode = _PyUnicode_New(1); 548 if (!unicode) 549 return NULL; 550 unicode->str[0] = Py_CHARMASK(*u); 551 unicode_latin1[Py_CHARMASK(*u)] = unicode; 552 } 553 Py_INCREF(unicode); 554 return (PyObject *)unicode; 555 } 556 557 return PyUnicode_DecodeUTF8(u, size, NULL); 558 } 559 560 unicode = _PyUnicode_New(size); 561 if (!unicode) 562 return NULL; 563 564 return (PyObject *)unicode; 565} 566 567PyObject * 568PyUnicode_FromString(const char *u) 569{ 570 size_t size = strlen(u); 571 if (size > PY_SSIZE_T_MAX) { 572 PyErr_SetString(PyExc_OverflowError, "input too long"); 573 return NULL; 574 } 575 576 return PyUnicode_FromStringAndSize(u, size); 577} 578 579#ifdef HAVE_WCHAR_H 580 581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 582# define CONVERT_WCHAR_TO_SURROGATES 583#endif 584 585#ifdef CONVERT_WCHAR_TO_SURROGATES 586 587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 588 to convert from UTF32 to UTF16. 
*/ 589 590PyObject * 591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 592{ 593 PyUnicodeObject *unicode; 594 register Py_ssize_t i; 595 Py_ssize_t alloc; 596 const wchar_t *orig_w; 597 598 if (w == NULL) { 599 if (size == 0) 600 return PyUnicode_FromStringAndSize(NULL, 0); 601 PyErr_BadInternalCall(); 602 return NULL; 603 } 604 605 if (size == -1) { 606 size = wcslen(w); 607 } 608 609 alloc = size; 610 orig_w = w; 611 for (i = size; i > 0; i--) { 612 if (*w > 0xFFFF) 613 alloc++; 614 w++; 615 } 616 w = orig_w; 617 unicode = _PyUnicode_New(alloc); 618 if (!unicode) 619 return NULL; 620 621 /* Copy the wchar_t data into the new object */ 622 { 623 register Py_UNICODE *u; 624 u = PyUnicode_AS_UNICODE(unicode); 625 for (i = size; i > 0; i--) { 626 if (*w > 0xFFFF) { 627 wchar_t ordinal = *w++; 628 ordinal -= 0x10000; 629 *u++ = 0xD800 | (ordinal >> 10); 630 *u++ = 0xDC00 | (ordinal & 0x3FF); 631 } 632 else 633 *u++ = *w++; 634 } 635 } 636 return (PyObject *)unicode; 637} 638 639#else 640 641PyObject * 642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 643{ 644 PyUnicodeObject *unicode; 645 646 if (w == NULL) { 647 if (size == 0) 648 return PyUnicode_FromStringAndSize(NULL, 0); 649 PyErr_BadInternalCall(); 650 return NULL; 651 } 652 653 if (size == -1) { 654 size = wcslen(w); 655 } 656 657 unicode = _PyUnicode_New(size); 658 if (!unicode) 659 return NULL; 660 661 /* Copy the wchar_t data into the new object */ 662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 663 memcpy(unicode->str, w, size * sizeof(wchar_t)); 664#else 665 { 666 register Py_UNICODE *u; 667 register Py_ssize_t i; 668 u = PyUnicode_AS_UNICODE(unicode); 669 for (i = size; i > 0; i--) 670 *u++ = *w++; 671 } 672#endif 673 674 return (PyObject *)unicode; 675} 676 677#endif /* CONVERT_WCHAR_TO_SURROGATES */ 678 679#undef CONVERT_WCHAR_TO_SURROGATES 680 681static void 682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 683 int zeropad, int width, int precision, 
char c) 684{ 685 *fmt++ = '%'; 686 if (width) { 687 if (zeropad) 688 *fmt++ = '0'; 689 fmt += sprintf(fmt, "%d", width); 690 } 691 if (precision) 692 fmt += sprintf(fmt, ".%d", precision); 693 if (longflag) 694 *fmt++ = 'l'; 695 else if (longlongflag) { 696 /* longlongflag should only ever be nonzero on machines with 697 HAVE_LONG_LONG defined */ 698#ifdef HAVE_LONG_LONG 699 char *f = PY_FORMAT_LONG_LONG; 700 while (*f) 701 *fmt++ = *f++; 702#else 703 /* we shouldn't ever get here */ 704 assert(0); 705 *fmt++ = 'l'; 706#endif 707 } 708 else if (size_tflag) { 709 char *f = PY_FORMAT_SIZE_T; 710 while (*f) 711 *fmt++ = *f++; 712 } 713 *fmt++ = c; 714 *fmt = '\0'; 715} 716 717/* helper for PyUnicode_FromFormatV() */ 718 719static const char* 720parse_format_flags(const char *f, 721 int *p_width, int *p_precision, 722 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 723{ 724 int width, precision, longflag, longlongflag, size_tflag; 725 726 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 727 f++; 728 width = 0; 729 while (Py_ISDIGIT((unsigned)*f)) 730 width = (width*10) + *f++ - '0'; 731 precision = 0; 732 if (*f == '.') { 733 f++; 734 while (Py_ISDIGIT((unsigned)*f)) 735 precision = (precision*10) + *f++ - '0'; 736 if (*f == '%') { 737 /* "%.3%s" => f points to "3" */ 738 f--; 739 } 740 } 741 if (*f == '\0') { 742 /* bogus format "%.1" => go backward, f points to "1" */ 743 f--; 744 } 745 if (p_width != NULL) 746 *p_width = width; 747 if (p_precision != NULL) 748 *p_precision = precision; 749 750 /* Handle %ld, %lu, %lld and %llu. */ 751 longflag = 0; 752 longlongflag = 0; 753 size_tflag = 0; 754 755 if (*f == 'l') { 756 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 757 longflag = 1; 758 ++f; 759 } 760#ifdef HAVE_LONG_LONG 761 else if (f[1] == 'l' && 762 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 763 longlongflag = 1; 764 f += 2; 765 } 766#endif 767 } 768 /* handle the size_t flag. 
*/ 769 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 770 size_tflag = 1; 771 ++f; 772 } 773 if (p_longflag != NULL) 774 *p_longflag = longflag; 775 if (p_longlongflag != NULL) 776 *p_longlongflag = longlongflag; 777 if (p_size_tflag != NULL) 778 *p_size_tflag = size_tflag; 779 return f; 780} 781 782#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 783 784/* size of fixed-size buffer for formatting single arguments */ 785#define ITEM_BUFFER_LEN 21 786/* maximum number of characters required for output of %ld. 21 characters 787 allows for 64-bit integers (in decimal) and an optional sign. */ 788#define MAX_LONG_CHARS 21 789/* maximum number of characters required for output of %lld. 790 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 791 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 792#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 793 794PyObject * 795PyUnicode_FromFormatV(const char *format, va_list vargs) 796{ 797 va_list count; 798 Py_ssize_t callcount = 0; 799 PyObject **callresults = NULL; 800 PyObject **callresult = NULL; 801 Py_ssize_t n = 0; 802 int width = 0; 803 int precision = 0; 804 int zeropad; 805 const char* f; 806 Py_UNICODE *s; 807 PyObject *string; 808 /* used by sprintf */ 809 char buffer[ITEM_BUFFER_LEN+1]; 810 /* use abuffer instead of buffer, if we need more space 811 * (which can happen if there's a format specifier with width). 
*/ 812 char *abuffer = NULL; 813 char *realbuffer; 814 Py_ssize_t abuffersize = 0; 815 char fmt[61]; /* should be enough for %0width.precisionlld */ 816 const char *copy; 817 818 Py_VA_COPY(count, vargs); 819 /* step 1: count the number of %S/%R/%A/%s format specifications 820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 822 * result in an array) */ 823 for (f = format; *f; f++) { 824 if (*f == '%') { 825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 828 ++callcount; 829 } 830 else if (128 <= (unsigned char)*f) { 831 PyErr_Format(PyExc_ValueError, 832 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 833 "string, got a non-ASCII byte: 0x%02x", 834 (unsigned char)*f); 835 return NULL; 836 } 837 } 838 /* step 2: allocate memory for the results of 839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 840 if (callcount) { 841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 842 if (!callresults) { 843 PyErr_NoMemory(); 844 return NULL; 845 } 846 callresult = callresults; 847 } 848 /* step 3: figure out how large a buffer we need */ 849 for (f = format; *f; f++) { 850 if (*f == '%') { 851#ifdef HAVE_LONG_LONG 852 int longlongflag; 853#endif 854 const char* p; 855 856 p = f; 857 f = parse_format_flags(f, &width, NULL, 858 NULL, &longlongflag, NULL); 859 860 switch (*f) { 861 case 'c': 862 { 863#ifndef Py_UNICODE_WIDE 864 int ordinal = va_arg(count, int); 865 if (ordinal > 0xffff) 866 n += 2; 867 else 868 n++; 869#else 870 (void)va_arg(count, int); 871 n++; 872#endif 873 break; 874 } 875 case '%': 876 n++; 877 break; 878 case 'd': case 'u': case 'i': case 'x': 879 (void) va_arg(count, int); 880#ifdef HAVE_LONG_LONG 881 if (longlongflag) { 882 if (width < MAX_LONG_LONG_CHARS) 883 width = MAX_LONG_LONG_CHARS; 884 
} 885 else 886#endif 887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 888 including sign. Decimal takes the most space. This 889 isn't enough for octal. If a width is specified we 890 need more (which we allocate later). */ 891 if (width < MAX_LONG_CHARS) 892 width = MAX_LONG_CHARS; 893 n += width; 894 /* XXX should allow for large precision here too. */ 895 if (abuffersize < width) 896 abuffersize = width; 897 break; 898 case 's': 899 { 900 /* UTF-8 */ 901 const char *s = va_arg(count, const char*); 902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 903 if (!str) 904 goto fail; 905 n += PyUnicode_GET_SIZE(str); 906 /* Remember the str and switch to the next slot */ 907 *callresult++ = str; 908 break; 909 } 910 case 'U': 911 { 912 PyObject *obj = va_arg(count, PyObject *); 913 assert(obj && PyUnicode_Check(obj)); 914 n += PyUnicode_GET_SIZE(obj); 915 break; 916 } 917 case 'V': 918 { 919 PyObject *obj = va_arg(count, PyObject *); 920 const char *str = va_arg(count, const char *); 921 PyObject *str_obj; 922 assert(obj || str); 923 assert(!obj || PyUnicode_Check(obj)); 924 if (obj) { 925 n += PyUnicode_GET_SIZE(obj); 926 *callresult++ = NULL; 927 } 928 else { 929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 930 if (!str_obj) 931 goto fail; 932 n += PyUnicode_GET_SIZE(str_obj); 933 *callresult++ = str_obj; 934 } 935 break; 936 } 937 case 'S': 938 { 939 PyObject *obj = va_arg(count, PyObject *); 940 PyObject *str; 941 assert(obj); 942 str = PyObject_Str(obj); 943 if (!str) 944 goto fail; 945 n += PyUnicode_GET_SIZE(str); 946 /* Remember the str and switch to the next slot */ 947 *callresult++ = str; 948 break; 949 } 950 case 'R': 951 { 952 PyObject *obj = va_arg(count, PyObject *); 953 PyObject *repr; 954 assert(obj); 955 repr = PyObject_Repr(obj); 956 if (!repr) 957 goto fail; 958 n += PyUnicode_GET_SIZE(repr); 959 /* Remember the repr and switch to the next slot */ 960 *callresult++ = repr; 961 break; 962 } 963 case 'A': 964 { 
965 PyObject *obj = va_arg(count, PyObject *); 966 PyObject *ascii; 967 assert(obj); 968 ascii = PyObject_ASCII(obj); 969 if (!ascii) 970 goto fail; 971 n += PyUnicode_GET_SIZE(ascii); 972 /* Remember the repr and switch to the next slot */ 973 *callresult++ = ascii; 974 break; 975 } 976 case 'p': 977 (void) va_arg(count, int); 978 /* maximum 64-bit pointer representation: 979 * 0xffffffffffffffff 980 * so 19 characters is enough. 981 * XXX I count 18 -- what's the extra for? 982 */ 983 n += 19; 984 break; 985 default: 986 /* if we stumble upon an unknown 987 formatting code, copy the rest of 988 the format string to the output 989 string. (we cannot just skip the 990 code, since there's no way to know 991 what's in the argument list) */ 992 n += strlen(p); 993 goto expand; 994 } 995 } else 996 n++; 997 } 998 expand: 999 if (abuffersize > ITEM_BUFFER_LEN) { 1000 /* add 1 for sprintf's trailing null byte */ 1001 abuffer = PyObject_Malloc(abuffersize + 1); 1002 if (!abuffer) { 1003 PyErr_NoMemory(); 1004 goto fail; 1005 } 1006 realbuffer = abuffer; 1007 } 1008 else 1009 realbuffer = buffer; 1010 /* step 4: fill the buffer */ 1011 /* Since we've analyzed how much space we need for the worst case, 1012 we don't have to resize the string. 1013 There can be no errors beyond this point. 
*/ 1014 string = PyUnicode_FromUnicode(NULL, n); 1015 if (!string) 1016 goto fail; 1017 1018 s = PyUnicode_AS_UNICODE(string); 1019 callresult = callresults; 1020 1021 for (f = format; *f; f++) { 1022 if (*f == '%') { 1023 const char* p; 1024 int longflag; 1025 int longlongflag; 1026 int size_tflag; 1027 1028 p = f; 1029 zeropad = (f[1] == '0'); 1030 f = parse_format_flags(f, &width, &precision, 1031 &longflag, &longlongflag, &size_tflag); 1032 1033 switch (*f) { 1034 case 'c': 1035 { 1036 int ordinal = va_arg(vargs, int); 1037#ifndef Py_UNICODE_WIDE 1038 if (ordinal > 0xffff) { 1039 ordinal -= 0x10000; 1040 *s++ = 0xD800 | (ordinal >> 10); 1041 *s++ = 0xDC00 | (ordinal & 0x3FF); 1042 } else 1043#endif 1044 *s++ = ordinal; 1045 break; 1046 } 1047 case 'i': 1048 case 'd': 1049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1050 width, precision, *f); 1051 if (longflag) 1052 sprintf(realbuffer, fmt, va_arg(vargs, long)); 1053#ifdef HAVE_LONG_LONG 1054 else if (longlongflag) 1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1056#endif 1057 else if (size_tflag) 1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1059 else 1060 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1061 appendstring(realbuffer); 1062 break; 1063 case 'u': 1064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1065 width, precision, 'u'); 1066 if (longflag) 1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1068#ifdef HAVE_LONG_LONG 1069 else if (longlongflag) 1070 sprintf(realbuffer, fmt, va_arg(vargs, 1071 unsigned PY_LONG_LONG)); 1072#endif 1073 else if (size_tflag) 1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1075 else 1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1077 appendstring(realbuffer); 1078 break; 1079 case 'x': 1080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1081 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1082 appendstring(realbuffer); 1083 break; 1084 case 's': 1085 { 1086 /* unused, since we already 
have the result */ 1087 (void) va_arg(vargs, char *); 1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1089 PyUnicode_GET_SIZE(*callresult)); 1090 s += PyUnicode_GET_SIZE(*callresult); 1091 /* We're done with the unicode()/repr() => forget it */ 1092 Py_DECREF(*callresult); 1093 /* switch to next unicode()/repr() result */ 1094 ++callresult; 1095 break; 1096 } 1097 case 'U': 1098 { 1099 PyObject *obj = va_arg(vargs, PyObject *); 1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1102 s += size; 1103 break; 1104 } 1105 case 'V': 1106 { 1107 PyObject *obj = va_arg(vargs, PyObject *); 1108 va_arg(vargs, const char *); 1109 if (obj) { 1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1112 s += size; 1113 } else { 1114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1115 PyUnicode_GET_SIZE(*callresult)); 1116 s += PyUnicode_GET_SIZE(*callresult); 1117 Py_DECREF(*callresult); 1118 } 1119 ++callresult; 1120 break; 1121 } 1122 case 'S': 1123 case 'R': 1124 case 'A': 1125 { 1126 Py_UNICODE *ucopy; 1127 Py_ssize_t usize; 1128 Py_ssize_t upos; 1129 /* unused, since we already have the result */ 1130 (void) va_arg(vargs, PyObject *); 1131 ucopy = PyUnicode_AS_UNICODE(*callresult); 1132 usize = PyUnicode_GET_SIZE(*callresult); 1133 for (upos = 0; upos<usize;) 1134 *s++ = ucopy[upos++]; 1135 /* We're done with the unicode()/repr() => forget it */ 1136 Py_DECREF(*callresult); 1137 /* switch to next unicode()/repr() result */ 1138 ++callresult; 1139 break; 1140 } 1141 case 'p': 1142 sprintf(buffer, "%p", va_arg(vargs, void*)); 1143 /* %p is ill-defined: ensure leading 0x. 
*/ 1144 if (buffer[1] == 'X') 1145 buffer[1] = 'x'; 1146 else if (buffer[1] != 'x') { 1147 memmove(buffer+2, buffer, strlen(buffer)+1); 1148 buffer[0] = '0'; 1149 buffer[1] = 'x'; 1150 } 1151 appendstring(buffer); 1152 break; 1153 case '%': 1154 *s++ = '%'; 1155 break; 1156 default: 1157 appendstring(p); 1158 goto end; 1159 } 1160 } 1161 else 1162 *s++ = *f; 1163 } 1164 1165 end: 1166 if (callresults) 1167 PyObject_Free(callresults); 1168 if (abuffer) 1169 PyObject_Free(abuffer); 1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1171 return string; 1172 fail: 1173 if (callresults) { 1174 PyObject **callresult2 = callresults; 1175 while (callresult2 < callresult) { 1176 Py_XDECREF(*callresult2); 1177 ++callresult2; 1178 } 1179 PyObject_Free(callresults); 1180 } 1181 if (abuffer) 1182 PyObject_Free(abuffer); 1183 return NULL; 1184} 1185 1186#undef appendstring 1187 1188PyObject * 1189PyUnicode_FromFormat(const char *format, ...) 1190{ 1191 PyObject* ret; 1192 va_list vargs; 1193 1194#ifdef HAVE_STDARG_PROTOTYPES 1195 va_start(vargs, format); 1196#else 1197 va_start(vargs); 1198#endif 1199 ret = PyUnicode_FromFormatV(format, vargs); 1200 va_end(vargs); 1201 return ret; 1202} 1203 1204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1205 convert a Unicode object to a wide character string. 1206 1207 - If w is NULL: return the number of wide characters (including the nul 1208 character) required to convert the unicode object. Ignore size argument. 1209 1210 - Otherwise: return the number of wide characters (excluding the nul 1211 character) written into w. Write at most size wide characters (including 1212 the nul character). 
*/ 1213static Py_ssize_t 1214unicode_aswidechar(PyUnicodeObject *unicode, 1215 wchar_t *w, 1216 Py_ssize_t size) 1217{ 1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1219 Py_ssize_t res; 1220 if (w != NULL) { 1221 res = PyUnicode_GET_SIZE(unicode); 1222 if (size > res) 1223 size = res + 1; 1224 else 1225 res = size; 1226 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1227 return res; 1228 } 1229 else 1230 return PyUnicode_GET_SIZE(unicode) + 1; 1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 1232 register const Py_UNICODE *u; 1233 const Py_UNICODE *uend; 1234 const wchar_t *worig, *wend; 1235 Py_ssize_t nchar; 1236 1237 u = PyUnicode_AS_UNICODE(unicode); 1238 uend = u + PyUnicode_GET_SIZE(unicode); 1239 if (w != NULL) { 1240 worig = w; 1241 wend = w + size; 1242 while (u != uend && w != wend) { 1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1245 { 1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; 1247 u += 2; 1248 } 1249 else { 1250 *w = *u; 1251 u++; 1252 } 1253 w++; 1254 } 1255 if (w != wend) 1256 *w = L'\0'; 1257 return w - worig; 1258 } 1259 else { 1260 nchar = 1; /* nul character at the end */ 1261 while (u != uend) { 1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1264 u += 2; 1265 else 1266 u++; 1267 nchar++; 1268 } 1269 } 1270 return nchar; 1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 1272 register Py_UNICODE *u, *uend, ordinal; 1273 register Py_ssize_t i; 1274 wchar_t *worig, *wend; 1275 Py_ssize_t nchar; 1276 1277 u = PyUnicode_AS_UNICODE(unicode); 1278 uend = u + PyUnicode_GET_SIZE(u); 1279 if (w != NULL) { 1280 worig = w; 1281 wend = w + size; 1282 while (u != uend && w != wend) { 1283 ordinal = *u; 1284 if (ordinal > 0xffff) { 1285 ordinal -= 0x10000; 1286 *w++ = 0xD800 | (ordinal >> 10); 1287 *w++ = 0xDC00 | (ordinal & 0x3FF); 1288 } 1289 else 1290 *w++ = ordinal; 1291 u++; 1292 } 1293 if (w != wend) 1294 *w = 0; 1295 return w - worig; 1296 } 1297 else { 
1298 nchar = 1; /* nul character */ 1299 while (u != uend) { 1300 if (*u > 0xffff) 1301 nchar += 2; 1302 else 1303 nchar++; 1304 u++; 1305 } 1306 return nchar; 1307 } 1308#else 1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" 1310#endif 1311} 1312 1313Py_ssize_t 1314PyUnicode_AsWideChar(PyObject *unicode, 1315 wchar_t *w, 1316 Py_ssize_t size) 1317{ 1318 if (unicode == NULL) { 1319 PyErr_BadInternalCall(); 1320 return -1; 1321 } 1322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 1323} 1324 1325wchar_t* 1326PyUnicode_AsWideCharString(PyObject *unicode, 1327 Py_ssize_t *size) 1328{ 1329 wchar_t* buffer; 1330 Py_ssize_t buflen; 1331 1332 if (unicode == NULL) { 1333 PyErr_BadInternalCall(); 1334 return NULL; 1335 } 1336 1337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 1338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 1339 PyErr_NoMemory(); 1340 return NULL; 1341 } 1342 1343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 1344 if (buffer == NULL) { 1345 PyErr_NoMemory(); 1346 return NULL; 1347 } 1348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 1349 if (size != NULL) 1350 *size = buflen; 1351 return buffer; 1352} 1353 1354#endif 1355 1356PyObject * 1357PyUnicode_FromOrdinal(int ordinal) 1358{ 1359 Py_UNICODE s[2]; 1360 1361 if (ordinal < 0 || ordinal > 0x10ffff) { 1362 PyErr_SetString(PyExc_ValueError, 1363 "chr() arg not in range(0x110000)"); 1364 return NULL; 1365 } 1366 1367#ifndef Py_UNICODE_WIDE 1368 if (ordinal > 0xffff) { 1369 ordinal -= 0x10000; 1370 s[0] = 0xD800 | (ordinal >> 10); 1371 s[1] = 0xDC00 | (ordinal & 0x3FF); 1372 return PyUnicode_FromUnicode(s, 2); 1373 } 1374#endif 1375 1376 s[0] = (Py_UNICODE)ordinal; 1377 return PyUnicode_FromUnicode(s, 1); 1378} 1379 1380PyObject * 1381PyUnicode_FromObject(register PyObject *obj) 1382{ 1383 /* XXX Perhaps we should make this API an alias of 1384 PyObject_Str() instead ?! 
*/ 1385 if (PyUnicode_CheckExact(obj)) { 1386 Py_INCREF(obj); 1387 return obj; 1388 } 1389 if (PyUnicode_Check(obj)) { 1390 /* For a Unicode subtype that's not a Unicode object, 1391 return a true Unicode object with the same data. */ 1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1393 PyUnicode_GET_SIZE(obj)); 1394 } 1395 PyErr_Format(PyExc_TypeError, 1396 "Can't convert '%.100s' object to str implicitly", 1397 Py_TYPE(obj)->tp_name); 1398 return NULL; 1399} 1400 1401PyObject * 1402PyUnicode_FromEncodedObject(register PyObject *obj, 1403 const char *encoding, 1404 const char *errors) 1405{ 1406 Py_buffer buffer; 1407 PyObject *v; 1408 1409 if (obj == NULL) { 1410 PyErr_BadInternalCall(); 1411 return NULL; 1412 } 1413 1414 /* Decoding bytes objects is the most common case and should be fast */ 1415 if (PyBytes_Check(obj)) { 1416 if (PyBytes_GET_SIZE(obj) == 0) { 1417 Py_INCREF(unicode_empty); 1418 v = (PyObject *) unicode_empty; 1419 } 1420 else { 1421 v = PyUnicode_Decode( 1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1423 encoding, errors); 1424 } 1425 return v; 1426 } 1427 1428 if (PyUnicode_Check(obj)) { 1429 PyErr_SetString(PyExc_TypeError, 1430 "decoding str is not supported"); 1431 return NULL; 1432 } 1433 1434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1436 PyErr_Format(PyExc_TypeError, 1437 "coercing to str: need bytes, bytearray " 1438 "or buffer-like object, %.80s found", 1439 Py_TYPE(obj)->tp_name); 1440 return NULL; 1441 } 1442 1443 if (buffer.len == 0) { 1444 Py_INCREF(unicode_empty); 1445 v = (PyObject *) unicode_empty; 1446 } 1447 else 1448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1449 1450 PyBuffer_Release(&buffer); 1451 return v; 1452} 1453 1454/* Convert encoding to lower case and replace '_' with '-' in order to 1455 catch e.g. UTF_8. 
Return 0 on error (encoding is longer than lower_len-1), 1456 1 on success. */ 1457static int 1458normalize_encoding(const char *encoding, 1459 char *lower, 1460 size_t lower_len) 1461{ 1462 const char *e; 1463 char *l; 1464 char *l_end; 1465 1466 e = encoding; 1467 l = lower; 1468 l_end = &lower[lower_len - 1]; 1469 while (*e) { 1470 if (l == l_end) 1471 return 0; 1472 if (Py_ISUPPER(*e)) { 1473 *l++ = Py_TOLOWER(*e++); 1474 } 1475 else if (*e == '_') { 1476 *l++ = '-'; 1477 e++; 1478 } 1479 else { 1480 *l++ = *e++; 1481 } 1482 } 1483 *l = '\0'; 1484 return 1; 1485} 1486 1487PyObject * 1488PyUnicode_Decode(const char *s, 1489 Py_ssize_t size, 1490 const char *encoding, 1491 const char *errors) 1492{ 1493 PyObject *buffer = NULL, *unicode; 1494 Py_buffer info; 1495 char lower[11]; /* Enough for any encoding shortcut */ 1496 1497 if (encoding == NULL) 1498 return PyUnicode_DecodeUTF8(s, size, errors); 1499 1500 /* Shortcuts for common default encodings */ 1501 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1502 if ((strcmp(lower, "utf-8") == 0) || 1503 (strcmp(lower, "utf8") == 0)) 1504 return PyUnicode_DecodeUTF8(s, size, errors); 1505 else if ((strcmp(lower, "latin-1") == 0) || 1506 (strcmp(lower, "latin1") == 0) || 1507 (strcmp(lower, "iso-8859-1") == 0)) 1508 return PyUnicode_DecodeLatin1(s, size, errors); 1509#ifdef HAVE_MBCS 1510 else if (strcmp(lower, "mbcs") == 0) 1511 return PyUnicode_DecodeMBCS(s, size, errors); 1512#endif 1513 else if (strcmp(lower, "ascii") == 0) 1514 return PyUnicode_DecodeASCII(s, size, errors); 1515 else if (strcmp(lower, "utf-16") == 0) 1516 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1517 else if (strcmp(lower, "utf-32") == 0) 1518 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1519 } 1520 1521 /* Decode via the codec registry */ 1522 buffer = NULL; 1523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1524 goto onError; 1525 buffer = PyMemoryView_FromBuffer(&info); 1526 if (buffer == 
NULL) 1527 goto onError; 1528 unicode = PyCodec_Decode(buffer, encoding, errors); 1529 if (unicode == NULL) 1530 goto onError; 1531 if (!PyUnicode_Check(unicode)) { 1532 PyErr_Format(PyExc_TypeError, 1533 "decoder did not return a str object (type=%.400s)", 1534 Py_TYPE(unicode)->tp_name); 1535 Py_DECREF(unicode); 1536 goto onError; 1537 } 1538 Py_DECREF(buffer); 1539 return unicode; 1540 1541 onError: 1542 Py_XDECREF(buffer); 1543 return NULL; 1544} 1545 1546PyObject * 1547PyUnicode_AsDecodedObject(PyObject *unicode, 1548 const char *encoding, 1549 const char *errors) 1550{ 1551 PyObject *v; 1552 1553 if (!PyUnicode_Check(unicode)) { 1554 PyErr_BadArgument(); 1555 goto onError; 1556 } 1557 1558 if (encoding == NULL) 1559 encoding = PyUnicode_GetDefaultEncoding(); 1560 1561 /* Decode via the codec registry */ 1562 v = PyCodec_Decode(unicode, encoding, errors); 1563 if (v == NULL) 1564 goto onError; 1565 return v; 1566 1567 onError: 1568 return NULL; 1569} 1570 1571PyObject * 1572PyUnicode_AsDecodedUnicode(PyObject *unicode, 1573 const char *encoding, 1574 const char *errors) 1575{ 1576 PyObject *v; 1577 1578 if (!PyUnicode_Check(unicode)) { 1579 PyErr_BadArgument(); 1580 goto onError; 1581 } 1582 1583 if (encoding == NULL) 1584 encoding = PyUnicode_GetDefaultEncoding(); 1585 1586 /* Decode via the codec registry */ 1587 v = PyCodec_Decode(unicode, encoding, errors); 1588 if (v == NULL) 1589 goto onError; 1590 if (!PyUnicode_Check(v)) { 1591 PyErr_Format(PyExc_TypeError, 1592 "decoder did not return a str object (type=%.400s)", 1593 Py_TYPE(v)->tp_name); 1594 Py_DECREF(v); 1595 goto onError; 1596 } 1597 return v; 1598 1599 onError: 1600 return NULL; 1601} 1602 1603PyObject * 1604PyUnicode_Encode(const Py_UNICODE *s, 1605 Py_ssize_t size, 1606 const char *encoding, 1607 const char *errors) 1608{ 1609 PyObject *v, *unicode; 1610 1611 unicode = PyUnicode_FromUnicode(s, size); 1612 if (unicode == NULL) 1613 return NULL; 1614 v = PyUnicode_AsEncodedString(unicode, 
encoding, errors); 1615 Py_DECREF(unicode); 1616 return v; 1617} 1618 1619PyObject * 1620PyUnicode_AsEncodedObject(PyObject *unicode, 1621 const char *encoding, 1622 const char *errors) 1623{ 1624 PyObject *v; 1625 1626 if (!PyUnicode_Check(unicode)) { 1627 PyErr_BadArgument(); 1628 goto onError; 1629 } 1630 1631 if (encoding == NULL) 1632 encoding = PyUnicode_GetDefaultEncoding(); 1633 1634 /* Encode via the codec registry */ 1635 v = PyCodec_Encode(unicode, encoding, errors); 1636 if (v == NULL) 1637 goto onError; 1638 return v; 1639 1640 onError: 1641 return NULL; 1642} 1643 1644PyObject * 1645PyUnicode_EncodeFSDefault(PyObject *unicode) 1646{ 1647#ifdef HAVE_MBCS 1648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1649 PyUnicode_GET_SIZE(unicode), 1650 NULL); 1651#elif defined(__APPLE__) 1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1653 PyUnicode_GET_SIZE(unicode), 1654 "surrogateescape"); 1655#else 1656 PyInterpreterState *interp = PyThreadState_GET()->interp; 1657 /* Bootstrap check: if the filesystem codec is implemented in Python, we 1658 cannot use it to encode and decode filenames before it is loaded. Load 1659 the Python codec requires to encode at least its own filename. Use the C 1660 version of the locale codec until the codec registry is initialized and 1661 the Python codec is loaded. 1662 1663 Py_FileSystemDefaultEncoding is shared between all interpreters, we 1664 cannot only rely on it: check also interp->fscodec_initialized for 1665 subinterpreters. 
*/ 1666 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 1667 return PyUnicode_AsEncodedString(unicode, 1668 Py_FileSystemDefaultEncoding, 1669 "surrogateescape"); 1670 } 1671 else { 1672 /* locale encoding with surrogateescape */ 1673 wchar_t *wchar; 1674 char *bytes; 1675 PyObject *bytes_obj; 1676 size_t error_pos; 1677 1678 wchar = PyUnicode_AsWideCharString(unicode, NULL); 1679 if (wchar == NULL) 1680 return NULL; 1681 bytes = _Py_wchar2char(wchar, &error_pos); 1682 if (bytes == NULL) { 1683 if (error_pos != (size_t)-1) { 1684 char *errmsg = strerror(errno); 1685 PyObject *exc = NULL; 1686 if (errmsg == NULL) 1687 errmsg = "Py_wchar2char() failed"; 1688 raise_encode_exception(&exc, 1689 "filesystemencoding", 1690 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 1691 error_pos, error_pos+1, 1692 errmsg); 1693 Py_XDECREF(exc); 1694 } 1695 else 1696 PyErr_NoMemory(); 1697 PyMem_Free(wchar); 1698 return NULL; 1699 } 1700 PyMem_Free(wchar); 1701 1702 bytes_obj = PyBytes_FromString(bytes); 1703 PyMem_Free(bytes); 1704 return bytes_obj; 1705 } 1706#endif 1707} 1708 1709PyObject * 1710PyUnicode_AsEncodedString(PyObject *unicode, 1711 const char *encoding, 1712 const char *errors) 1713{ 1714 PyObject *v; 1715 char lower[11]; /* Enough for any encoding shortcut */ 1716 1717 if (!PyUnicode_Check(unicode)) { 1718 PyErr_BadArgument(); 1719 return NULL; 1720 } 1721 1722 if (encoding == NULL) { 1723 if (errors == NULL || strcmp(errors, "strict") == 0) 1724 return PyUnicode_AsUTF8String(unicode); 1725 else 1726 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1727 PyUnicode_GET_SIZE(unicode), 1728 errors); 1729 } 1730 1731 /* Shortcuts for common default encodings */ 1732 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1733 if ((strcmp(lower, "utf-8") == 0) || 1734 (strcmp(lower, "utf8") == 0)) 1735 { 1736 if (errors == NULL || strcmp(errors, "strict") == 0) 1737 return PyUnicode_AsUTF8String(unicode); 1738 else 1739 return 
PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1740 PyUnicode_GET_SIZE(unicode), 1741 errors); 1742 } 1743 else if ((strcmp(lower, "latin-1") == 0) || 1744 (strcmp(lower, "latin1") == 0) || 1745 (strcmp(lower, "iso-8859-1") == 0)) 1746 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1747 PyUnicode_GET_SIZE(unicode), 1748 errors); 1749#ifdef HAVE_MBCS 1750 else if (strcmp(lower, "mbcs") == 0) 1751 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1752 PyUnicode_GET_SIZE(unicode), 1753 errors); 1754#endif 1755 else if (strcmp(lower, "ascii") == 0) 1756 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1757 PyUnicode_GET_SIZE(unicode), 1758 errors); 1759 } 1760 1761 /* Encode via the codec registry */ 1762 v = PyCodec_Encode(unicode, encoding, errors); 1763 if (v == NULL) 1764 return NULL; 1765 1766 /* The normal path */ 1767 if (PyBytes_Check(v)) 1768 return v; 1769 1770 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1771 if (PyByteArray_Check(v)) { 1772 int error; 1773 PyObject *b; 1774 1775 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1776 "encoder %s returned bytearray instead of bytes", 1777 encoding); 1778 if (error) { 1779 Py_DECREF(v); 1780 return NULL; 1781 } 1782 1783 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1784 Py_DECREF(v); 1785 return b; 1786 } 1787 1788 PyErr_Format(PyExc_TypeError, 1789 "encoder did not return a bytes object (type=%.400s)", 1790 Py_TYPE(v)->tp_name); 1791 Py_DECREF(v); 1792 return NULL; 1793} 1794 1795PyObject * 1796PyUnicode_AsEncodedUnicode(PyObject *unicode, 1797 const char *encoding, 1798 const char *errors) 1799{ 1800 PyObject *v; 1801 1802 if (!PyUnicode_Check(unicode)) { 1803 PyErr_BadArgument(); 1804 goto onError; 1805 } 1806 1807 if (encoding == NULL) 1808 encoding = PyUnicode_GetDefaultEncoding(); 1809 1810 /* Encode via the codec registry */ 1811 v = PyCodec_Encode(unicode, encoding, errors); 1812 if (v == NULL) 1813 goto 
onError; 1814 if (!PyUnicode_Check(v)) { 1815 PyErr_Format(PyExc_TypeError, 1816 "encoder did not return an str object (type=%.400s)", 1817 Py_TYPE(v)->tp_name); 1818 Py_DECREF(v); 1819 goto onError; 1820 } 1821 return v; 1822 1823 onError: 1824 return NULL; 1825} 1826 1827PyObject * 1828_PyUnicode_AsDefaultEncodedString(PyObject *unicode) 1829{ 1830 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1831 if (v) 1832 return v; 1833 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1834 PyUnicode_GET_SIZE(unicode), 1835 NULL); 1836 if (!v) 1837 return NULL; 1838 ((PyUnicodeObject *)unicode)->defenc = v; 1839 return v; 1840} 1841 1842PyObject* 1843PyUnicode_DecodeFSDefault(const char *s) { 1844 Py_ssize_t size = (Py_ssize_t)strlen(s); 1845 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1846} 1847 1848PyObject* 1849PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1850{ 1851#ifdef HAVE_MBCS 1852 return PyUnicode_DecodeMBCS(s, size, NULL); 1853#elif defined(__APPLE__) 1854 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1855#else 1856 PyInterpreterState *interp = PyThreadState_GET()->interp; 1857 /* Bootstrap check: if the filesystem codec is implemented in Python, we 1858 cannot use it to encode and decode filenames before it is loaded. Load 1859 the Python codec requires to encode at least its own filename. Use the C 1860 version of the locale codec until the codec registry is initialized and 1861 the Python codec is loaded. 1862 1863 Py_FileSystemDefaultEncoding is shared between all interpreters, we 1864 cannot only rely on it: check also interp->fscodec_initialized for 1865 subinterpreters. 
*/ 1866 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 1867 return PyUnicode_Decode(s, size, 1868 Py_FileSystemDefaultEncoding, 1869 "surrogateescape"); 1870 } 1871 else { 1872 /* locale encoding with surrogateescape */ 1873 wchar_t *wchar; 1874 PyObject *unicode; 1875 size_t len; 1876 1877 if (s[size] != '\0' || size != strlen(s)) { 1878 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1879 return NULL; 1880 } 1881 1882 wchar = _Py_char2wchar(s, &len); 1883 if (wchar == NULL) 1884 return PyErr_NoMemory(); 1885 1886 unicode = PyUnicode_FromWideChar(wchar, len); 1887 PyMem_Free(wchar); 1888 return unicode; 1889 } 1890#endif 1891} 1892 1893 1894int 1895PyUnicode_FSConverter(PyObject* arg, void* addr) 1896{ 1897 PyObject *output = NULL; 1898 Py_ssize_t size; 1899 void *data; 1900 if (arg == NULL) { 1901 Py_DECREF(*(PyObject**)addr); 1902 return 1; 1903 } 1904 if (PyBytes_Check(arg)) { 1905 output = arg; 1906 Py_INCREF(output); 1907 } 1908 else { 1909 arg = PyUnicode_FromObject(arg); 1910 if (!arg) 1911 return 0; 1912 output = PyUnicode_EncodeFSDefault(arg); 1913 Py_DECREF(arg); 1914 if (!output) 1915 return 0; 1916 if (!PyBytes_Check(output)) { 1917 Py_DECREF(output); 1918 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1919 return 0; 1920 } 1921 } 1922 size = PyBytes_GET_SIZE(output); 1923 data = PyBytes_AS_STRING(output); 1924 if (size != strlen(data)) { 1925 PyErr_SetString(PyExc_TypeError, "embedded NULL character"); 1926 Py_DECREF(output); 1927 return 0; 1928 } 1929 *(PyObject**)addr = output; 1930 return Py_CLEANUP_SUPPORTED; 1931} 1932 1933 1934int 1935PyUnicode_FSDecoder(PyObject* arg, void* addr) 1936{ 1937 PyObject *output = NULL; 1938 Py_ssize_t size; 1939 void *data; 1940 if (arg == NULL) { 1941 Py_DECREF(*(PyObject**)addr); 1942 return 1; 1943 } 1944 if (PyUnicode_Check(arg)) { 1945 output = arg; 1946 Py_INCREF(output); 1947 } 1948 else { 1949 arg = PyBytes_FromObject(arg); 1950 if (!arg) 1951 return 0; 
1952 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1953 PyBytes_GET_SIZE(arg)); 1954 Py_DECREF(arg); 1955 if (!output) 1956 return 0; 1957 if (!PyUnicode_Check(output)) { 1958 Py_DECREF(output); 1959 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1960 return 0; 1961 } 1962 } 1963 size = PyUnicode_GET_SIZE(output); 1964 data = PyUnicode_AS_UNICODE(output); 1965 if (size != Py_UNICODE_strlen(data)) { 1966 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1967 Py_DECREF(output); 1968 return 0; 1969 } 1970 *(PyObject**)addr = output; 1971 return Py_CLEANUP_SUPPORTED; 1972} 1973 1974 1975char* 1976_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1977{ 1978 PyObject *bytes; 1979 if (!PyUnicode_Check(unicode)) { 1980 PyErr_BadArgument(); 1981 return NULL; 1982 } 1983 bytes = _PyUnicode_AsDefaultEncodedString(unicode); 1984 if (bytes == NULL) 1985 return NULL; 1986 if (psize != NULL) 1987 *psize = PyBytes_GET_SIZE(bytes); 1988 return PyBytes_AS_STRING(bytes); 1989} 1990 1991char* 1992_PyUnicode_AsString(PyObject *unicode) 1993{ 1994 return _PyUnicode_AsStringAndSize(unicode, NULL); 1995} 1996 1997Py_UNICODE * 1998PyUnicode_AsUnicode(PyObject *unicode) 1999{ 2000 if (!PyUnicode_Check(unicode)) { 2001 PyErr_BadArgument(); 2002 goto onError; 2003 } 2004 return PyUnicode_AS_UNICODE(unicode); 2005 2006 onError: 2007 return NULL; 2008} 2009 2010Py_ssize_t 2011PyUnicode_GetSize(PyObject *unicode) 2012{ 2013 if (!PyUnicode_Check(unicode)) { 2014 PyErr_BadArgument(); 2015 goto onError; 2016 } 2017 return PyUnicode_GET_SIZE(unicode); 2018 2019 onError: 2020 return -1; 2021} 2022 2023const char * 2024PyUnicode_GetDefaultEncoding(void) 2025{ 2026 return "utf-8"; 2027} 2028 2029/* create or adjust a UnicodeDecodeError */ 2030static void 2031make_decode_exception(PyObject **exceptionObject, 2032 const char *encoding, 2033 const char *input, Py_ssize_t length, 2034 Py_ssize_t startpos, Py_ssize_t endpos, 2035 const char 
*reason) 2036{ 2037 if (*exceptionObject == NULL) { 2038 *exceptionObject = PyUnicodeDecodeError_Create( 2039 encoding, input, length, startpos, endpos, reason); 2040 } 2041 else { 2042 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 2043 goto onError; 2044 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 2045 goto onError; 2046 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 2047 goto onError; 2048 } 2049 return; 2050 2051onError: 2052 Py_DECREF(*exceptionObject); 2053 *exceptionObject = NULL; 2054} 2055 2056/* error handling callback helper: 2057 build arguments, call the callback and check the arguments, 2058 if no exception occurred, copy the replacement to the output 2059 and adjust various state variables. 2060 return 0 on success, -1 on error 2061*/ 2062 2063static int 2064unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 2065 const char *encoding, const char *reason, 2066 const char **input, const char **inend, Py_ssize_t *startinpos, 2067 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 2068 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 2069{ 2070 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 2071 2072 PyObject *restuple = NULL; 2073 PyObject *repunicode = NULL; 2074 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 2075 Py_ssize_t insize; 2076 Py_ssize_t requiredsize; 2077 Py_ssize_t newpos; 2078 Py_UNICODE *repptr; 2079 PyObject *inputobj = NULL; 2080 Py_ssize_t repsize; 2081 int res = -1; 2082 2083 if (*errorHandler == NULL) { 2084 *errorHandler = PyCodec_LookupError(errors); 2085 if (*errorHandler == NULL) 2086 goto onError; 2087 } 2088 2089 make_decode_exception(exceptionObject, 2090 encoding, 2091 *input, *inend - *input, 2092 *startinpos, *endinpos, 2093 reason); 2094 if (*exceptionObject == NULL) 2095 goto onError; 2096 2097 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 
2098 if (restuple == NULL) 2099 goto onError; 2100 if (!PyTuple_Check(restuple)) { 2101 PyErr_SetString(PyExc_TypeError, &argparse[4]); 2102 goto onError; 2103 } 2104 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 2105 goto onError; 2106 2107 /* Copy back the bytes variables, which might have been modified by the 2108 callback */ 2109 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 2110 if (!inputobj) 2111 goto onError; 2112 if (!PyBytes_Check(inputobj)) { 2113 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 2114 } 2115 *input = PyBytes_AS_STRING(inputobj); 2116 insize = PyBytes_GET_SIZE(inputobj); 2117 *inend = *input + insize; 2118 /* we can DECREF safely, as the exception has another reference, 2119 so the object won't go away. */ 2120 Py_DECREF(inputobj); 2121 2122 if (newpos<0) 2123 newpos = insize+newpos; 2124 if (newpos<0 || newpos>insize) { 2125 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 2126 goto onError; 2127 } 2128 2129 /* need more space? (at least enough for what we 2130 have+the replacement+the rest of the string (starting 2131 at the new input position), so we won't have to check space 2132 when there are no errors in the rest of the string) */ 2133 repptr = PyUnicode_AS_UNICODE(repunicode); 2134 repsize = PyUnicode_GET_SIZE(repunicode); 2135 requiredsize = *outpos + repsize + insize-newpos; 2136 if (requiredsize > outsize) { 2137 if (requiredsize<2*outsize) 2138 requiredsize = 2*outsize; 2139 if (_PyUnicode_Resize(output, requiredsize) < 0) 2140 goto onError; 2141 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 2142 } 2143 *endinpos = newpos; 2144 *inptr = *input + newpos; 2145 Py_UNICODE_COPY(*outptr, repptr, repsize); 2146 *outptr += repsize; 2147 *outpos += repsize; 2148 2149 /* we made it! 
*/ 2150 res = 0; 2151 2152 onError: 2153 Py_XDECREF(restuple); 2154 return res; 2155} 2156 2157/* --- UTF-7 Codec -------------------------------------------------------- */ 2158 2159/* See RFC2152 for details. We encode conservatively and decode liberally. */ 2160 2161/* Three simple macros defining base-64. */ 2162 2163/* Is c a base-64 character? */ 2164 2165#define IS_BASE64(c) \ 2166 (((c) >= 'A' && (c) <= 'Z') || \ 2167 ((c) >= 'a' && (c) <= 'z') || \ 2168 ((c) >= '0' && (c) <= '9') || \ 2169 (c) == '+' || (c) == '/') 2170 2171/* given that c is a base-64 character, what is its base-64 value? */ 2172 2173#define FROM_BASE64(c) \ 2174 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 2175 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 2176 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 2177 (c) == '+' ? 62 : 63) 2178 2179/* What is the base-64 character of the bottom 6 bits of n? */ 2180 2181#define TO_BASE64(n) \ 2182 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 2183 2184/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 2185 * decoded as itself. We are permissive on decoding; the only ASCII 2186 * byte not decoding to itself is the + which begins a base64 2187 * string. */ 2188 2189#define DECODE_DIRECT(c) \ 2190 ((c) <= 127 && (c) != '+') 2191 2192/* The UTF-7 encoder treats ASCII characters differently according to 2193 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 2194 * the above). See RFC2152. This array identifies these different 2195 * sets: 2196 * 0 : "Set D" 2197 * alphanumeric and '(),-./:? 2198 * 1 : "Set O" 2199 * !"#$%&*;<=>@[]^_`{|} 2200 * 2 : "whitespace" 2201 * ht nl cr sp 2202 * 3 : special (must be base64 encoded) 2203 * everything else (i.e. 
+\~ and non-printing codes 0-8 11-12 14-31 127) 2204 */ 2205 2206static 2207char utf7_category[128] = { 2208/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 2209 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2210/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 2211 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2212/* sp ! " # $ % & ' ( ) * + , - . / */ 2213 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 2214/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 2215 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2216/* @ A B C D E F G H I J K L M N O */ 2217 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2218/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 2219 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 2220/* ` a b c d e f g h i j k l m n o */ 2221 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2222/* p q r s t u v w x y z { | } ~ del */ 2223 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 2224}; 2225 2226/* ENCODE_DIRECT: this character should be encoded as itself. The 2227 * answer depends on whether we are encoding set O as itself, and also 2228 * on whether we are encoding whitespace as itself. RFC2152 makes it 2229 * clear that the answers to these questions vary between 2230 * applications, so this code needs to be flexible. */ 2231 2232#define ENCODE_DIRECT(c, directO, directWS) \ 2233 ((c) < 128 && (c) > 0 && \ 2234 ((utf7_category[(c)] == 0) || \ 2235 (directWS && (utf7_category[(c)] == 2)) || \ 2236 (directO && (utf7_category[(c)] == 1)))) 2237 2238PyObject * 2239PyUnicode_DecodeUTF7(const char *s, 2240 Py_ssize_t size, 2241 const char *errors) 2242{ 2243 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 2244} 2245 2246/* The decoder. The only state we preserve is our read position, 2247 * i.e. how many characters we have consumed. 
So if we end in the 2248 * middle of a shift sequence we have to back off the read position 2249 * and the output to the beginning of the sequence, otherwise we lose 2250 * all the shift state (seen bits, number of bits seen, high 2251 * surrogate). */ 2252 2253PyObject * 2254PyUnicode_DecodeUTF7Stateful(const char *s, 2255 Py_ssize_t size, 2256 const char *errors, 2257 Py_ssize_t *consumed) 2258{ 2259 const char *starts = s; 2260 Py_ssize_t startinpos; 2261 Py_ssize_t endinpos; 2262 Py_ssize_t outpos; 2263 const char *e; 2264 PyUnicodeObject *unicode; 2265 Py_UNICODE *p; 2266 const char *errmsg = ""; 2267 int inShift = 0; 2268 Py_UNICODE *shiftOutStart; 2269 unsigned int base64bits = 0; 2270 unsigned long base64buffer = 0; 2271 Py_UNICODE surrogate = 0; 2272 PyObject *errorHandler = NULL; 2273 PyObject *exc = NULL; 2274 2275 unicode = _PyUnicode_New(size); 2276 if (!unicode) 2277 return NULL; 2278 if (size == 0) { 2279 if (consumed) 2280 *consumed = 0; 2281 return (PyObject *)unicode; 2282 } 2283 2284 p = unicode->str; 2285 shiftOutStart = p; 2286 e = s + size; 2287 2288 while (s < e) { 2289 Py_UNICODE ch; 2290 restart: 2291 ch = (unsigned char) *s; 2292 2293 if (inShift) { /* in a base-64 section */ 2294 if (IS_BASE64(ch)) { /* consume a base-64 character */ 2295 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 2296 base64bits += 6; 2297 s++; 2298 if (base64bits >= 16) { 2299 /* we have enough bits for a UTF-16 value */ 2300 Py_UNICODE outCh = (Py_UNICODE) 2301 (base64buffer >> (base64bits-16)); 2302 base64bits -= 16; 2303 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 2304 if (surrogate) { 2305 /* expecting a second surrogate */ 2306 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2307#ifdef Py_UNICODE_WIDE 2308 *p++ = (((surrogate & 0x3FF)<<10) 2309 | (outCh & 0x3FF)) + 0x10000; 2310#else 2311 *p++ = surrogate; 2312 *p++ = outCh; 2313#endif 2314 surrogate = 0; 2315 } 2316 else { 2317 surrogate = 0; 2318 errmsg = "second surrogate missing"; 2319 
goto utf7Error; 2320 } 2321 } 2322 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 2323 /* first surrogate */ 2324 surrogate = outCh; 2325 } 2326 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2327 errmsg = "unexpected second surrogate"; 2328 goto utf7Error; 2329 } 2330 else { 2331 *p++ = outCh; 2332 } 2333 } 2334 } 2335 else { /* now leaving a base-64 section */ 2336 inShift = 0; 2337 s++; 2338 if (surrogate) { 2339 errmsg = "second surrogate missing at end of shift sequence"; 2340 goto utf7Error; 2341 } 2342 if (base64bits > 0) { /* left-over bits */ 2343 if (base64bits >= 6) { 2344 /* We've seen at least one base-64 character */ 2345 errmsg = "partial character in shift sequence"; 2346 goto utf7Error; 2347 } 2348 else { 2349 /* Some bits remain; they should be zero */ 2350 if (base64buffer != 0) { 2351 errmsg = "non-zero padding bits in shift sequence"; 2352 goto utf7Error; 2353 } 2354 } 2355 } 2356 if (ch != '-') { 2357 /* '-' is absorbed; other terminating 2358 characters are preserved */ 2359 *p++ = ch; 2360 } 2361 } 2362 } 2363 else if ( ch == '+' ) { 2364 startinpos = s-starts; 2365 s++; /* consume '+' */ 2366 if (s < e && *s == '-') { /* '+-' encodes '+' */ 2367 s++; 2368 *p++ = '+'; 2369 } 2370 else { /* begin base64-encoded section */ 2371 inShift = 1; 2372 shiftOutStart = p; 2373 base64bits = 0; 2374 } 2375 } 2376 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 2377 *p++ = ch; 2378 s++; 2379 } 2380 else { 2381 startinpos = s-starts; 2382 s++; 2383 errmsg = "unexpected special character"; 2384 goto utf7Error; 2385 } 2386 continue; 2387utf7Error: 2388 outpos = p-PyUnicode_AS_UNICODE(unicode); 2389 endinpos = s-starts; 2390 if (unicode_decode_call_errorhandler( 2391 errors, &errorHandler, 2392 "utf7", errmsg, 2393 &starts, &e, &startinpos, &endinpos, &exc, &s, 2394 &unicode, &outpos, &p)) 2395 goto onError; 2396 } 2397 2398 /* end of string */ 2399 2400 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 2401 /* if we're 
in an inconsistent state, that's an error */ 2402 if (surrogate || 2403 (base64bits >= 6) || 2404 (base64bits > 0 && base64buffer != 0)) { 2405 outpos = p-PyUnicode_AS_UNICODE(unicode); 2406 endinpos = size; 2407 if (unicode_decode_call_errorhandler( 2408 errors, &errorHandler, 2409 "utf7", "unterminated shift sequence", 2410 &starts, &e, &startinpos, &endinpos, &exc, &s, 2411 &unicode, &outpos, &p)) 2412 goto onError; 2413 if (s < e) 2414 goto restart; 2415 } 2416 } 2417 2418 /* return state */ 2419 if (consumed) { 2420 if (inShift) { 2421 p = shiftOutStart; /* back off output */ 2422 *consumed = startinpos; 2423 } 2424 else { 2425 *consumed = s-starts; 2426 } 2427 } 2428 2429 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 2430 goto onError; 2431 2432 Py_XDECREF(errorHandler); 2433 Py_XDECREF(exc); 2434 return (PyObject *)unicode; 2435 2436 onError: 2437 Py_XDECREF(errorHandler); 2438 Py_XDECREF(exc); 2439 Py_DECREF(unicode); 2440 return NULL; 2441} 2442 2443 2444PyObject * 2445PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2446 Py_ssize_t size, 2447 int base64SetO, 2448 int base64WhiteSpace, 2449 const char *errors) 2450{ 2451 PyObject *v; 2452 /* It might be possible to tighten this worst case */ 2453 Py_ssize_t allocated = 8 * size; 2454 int inShift = 0; 2455 Py_ssize_t i = 0; 2456 unsigned int base64bits = 0; 2457 unsigned long base64buffer = 0; 2458 char * out; 2459 char * start; 2460 2461 if (size == 0) 2462 return PyBytes_FromStringAndSize(NULL, 0); 2463 2464 if (allocated / 8 != size) 2465 return PyErr_NoMemory(); 2466 2467 v = PyBytes_FromStringAndSize(NULL, allocated); 2468 if (v == NULL) 2469 return NULL; 2470 2471 start = out = PyBytes_AS_STRING(v); 2472 for (;i < size; ++i) { 2473 Py_UNICODE ch = s[i]; 2474 2475 if (inShift) { 2476 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2477 /* shifting out */ 2478 if (base64bits) { /* output remaining bits */ 2479 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2480 
base64buffer = 0; 2481 base64bits = 0; 2482 } 2483 inShift = 0; 2484 /* Characters not in the BASE64 set implicitly unshift the sequence 2485 so no '-' is required, except if the character is itself a '-' */ 2486 if (IS_BASE64(ch) || ch == '-') { 2487 *out++ = '-'; 2488 } 2489 *out++ = (char) ch; 2490 } 2491 else { 2492 goto encode_char; 2493 } 2494 } 2495 else { /* not in a shift sequence */ 2496 if (ch == '+') { 2497 *out++ = '+'; 2498 *out++ = '-'; 2499 } 2500 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2501 *out++ = (char) ch; 2502 } 2503 else { 2504 *out++ = '+'; 2505 inShift = 1; 2506 goto encode_char; 2507 } 2508 } 2509 continue; 2510encode_char: 2511#ifdef Py_UNICODE_WIDE 2512 if (ch >= 0x10000) { 2513 /* code first surrogate */ 2514 base64bits += 16; 2515 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2516 while (base64bits >= 6) { 2517 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2518 base64bits -= 6; 2519 } 2520 /* prepare second surrogate */ 2521 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2522 } 2523#endif 2524 base64bits += 16; 2525 base64buffer = (base64buffer << 16) | ch; 2526 while (base64bits >= 6) { 2527 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2528 base64bits -= 6; 2529 } 2530 } 2531 if (base64bits) 2532 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2533 if (inShift) 2534 *out++ = '-'; 2535 if (_PyBytes_Resize(&v, out - start) < 0) 2536 return NULL; 2537 return v; 2538} 2539 2540#undef IS_BASE64 2541#undef FROM_BASE64 2542#undef TO_BASE64 2543#undef DECODE_DIRECT 2544#undef ENCODE_DIRECT 2545 2546/* --- UTF-8 Codec -------------------------------------------------------- */ 2547 2548static 2549char utf8_code_length[256] = { 2550 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2551 illegal prefix. 
See RFC 3629 for details */ 2552 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2553 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2554 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2555 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2556 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2557 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2558 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2559 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2560 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2561 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2562 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2563 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2564 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2565 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2566 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2567 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2568}; 2569 2570PyObject * 2571PyUnicode_DecodeUTF8(const char *s, 2572 Py_ssize_t size, 2573 const char *errors) 2574{ 2575 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2576} 2577 2578/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2579#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2580 2581/* Mask to quickly check whether a C 'long' contains a 2582 non-ASCII, UTF8-encoded char. */ 2583#if (SIZEOF_LONG == 8) 2584# define ASCII_CHAR_MASK 0x8080808080808080L 2585#elif (SIZEOF_LONG == 4) 2586# define ASCII_CHAR_MASK 0x80808080L 2587#else 2588# error C 'long' size should be either 4 or 8! 
2589#endif 2590 2591PyObject * 2592PyUnicode_DecodeUTF8Stateful(const char *s, 2593 Py_ssize_t size, 2594 const char *errors, 2595 Py_ssize_t *consumed) 2596{ 2597 const char *starts = s; 2598 int n; 2599 int k; 2600 Py_ssize_t startinpos; 2601 Py_ssize_t endinpos; 2602 Py_ssize_t outpos; 2603 const char *e, *aligned_end; 2604 PyUnicodeObject *unicode; 2605 Py_UNICODE *p; 2606 const char *errmsg = ""; 2607 PyObject *errorHandler = NULL; 2608 PyObject *exc = NULL; 2609 2610 /* Note: size will always be longer than the resulting Unicode 2611 character count */ 2612 unicode = _PyUnicode_New(size); 2613 if (!unicode) 2614 return NULL; 2615 if (size == 0) { 2616 if (consumed) 2617 *consumed = 0; 2618 return (PyObject *)unicode; 2619 } 2620 2621 /* Unpack UTF-8 encoded data */ 2622 p = unicode->str; 2623 e = s + size; 2624 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2625 2626 while (s < e) { 2627 Py_UCS4 ch = (unsigned char)*s; 2628 2629 if (ch < 0x80) { 2630 /* Fast path for runs of ASCII characters. Given that common UTF-8 2631 input will consist of an overwhelming majority of ASCII 2632 characters, we try to optimize for this case by checking 2633 as many characters as a C 'long' can contain. 2634 First, check if we can do an aligned read, as most CPUs have 2635 a penalty for unaligned reads. 2636 */ 2637 if (!((size_t) s & LONG_PTR_MASK)) { 2638 /* Help register allocation */ 2639 register const char *_s = s; 2640 register Py_UNICODE *_p = p; 2641 while (_s < aligned_end) { 2642 /* Read a whole long at a time (either 4 or 8 bytes), 2643 and do a fast unrolled copy if it only contains ASCII 2644 characters. 
*/ 2645 unsigned long data = *(unsigned long *) _s; 2646 if (data & ASCII_CHAR_MASK) 2647 break; 2648 _p[0] = (unsigned char) _s[0]; 2649 _p[1] = (unsigned char) _s[1]; 2650 _p[2] = (unsigned char) _s[2]; 2651 _p[3] = (unsigned char) _s[3]; 2652#if (SIZEOF_LONG == 8) 2653 _p[4] = (unsigned char) _s[4]; 2654 _p[5] = (unsigned char) _s[5]; 2655 _p[6] = (unsigned char) _s[6]; 2656 _p[7] = (unsigned char) _s[7]; 2657#endif 2658 _s += SIZEOF_LONG; 2659 _p += SIZEOF_LONG; 2660 } 2661 s = _s; 2662 p = _p; 2663 if (s == e) 2664 break; 2665 ch = (unsigned char)*s; 2666 } 2667 } 2668 2669 if (ch < 0x80) { 2670 *p++ = (Py_UNICODE)ch; 2671 s++; 2672 continue; 2673 } 2674 2675 n = utf8_code_length[ch]; 2676 2677 if (s + n > e) { 2678 if (consumed) 2679 break; 2680 else { 2681 errmsg = "unexpected end of data"; 2682 startinpos = s-starts; 2683 endinpos = startinpos+1; 2684 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2685 endinpos++; 2686 goto utf8Error; 2687 } 2688 } 2689 2690 switch (n) { 2691 2692 case 0: 2693 errmsg = "invalid start byte"; 2694 startinpos = s-starts; 2695 endinpos = startinpos+1; 2696 goto utf8Error; 2697 2698 case 1: 2699 errmsg = "internal error"; 2700 startinpos = s-starts; 2701 endinpos = startinpos+1; 2702 goto utf8Error; 2703 2704 case 2: 2705 if ((s[1] & 0xc0) != 0x80) { 2706 errmsg = "invalid continuation byte"; 2707 startinpos = s-starts; 2708 endinpos = startinpos + 1; 2709 goto utf8Error; 2710 } 2711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2712 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2713 *p++ = (Py_UNICODE)ch; 2714 break; 2715 2716 case 3: 2717 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2718 will result in surrogates in range d800-dfff. Surrogates are 2719 not valid UTF-8 so they are rejected. 
2720 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2721 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2722 if ((s[1] & 0xc0) != 0x80 || 2723 (s[2] & 0xc0) != 0x80 || 2724 ((unsigned char)s[0] == 0xE0 && 2725 (unsigned char)s[1] < 0xA0) || 2726 ((unsigned char)s[0] == 0xED && 2727 (unsigned char)s[1] > 0x9F)) { 2728 errmsg = "invalid continuation byte"; 2729 startinpos = s-starts; 2730 endinpos = startinpos + 1; 2731 2732 /* if s[1] first two bits are 1 and 0, then the invalid 2733 continuation byte is s[2], so increment endinpos by 1, 2734 if not, s[1] is invalid and endinpos doesn't need to 2735 be incremented. */ 2736 if ((s[1] & 0xC0) == 0x80) 2737 endinpos++; 2738 goto utf8Error; 2739 } 2740 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2741 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2742 *p++ = (Py_UNICODE)ch; 2743 break; 2744 2745 case 4: 2746 if ((s[1] & 0xc0) != 0x80 || 2747 (s[2] & 0xc0) != 0x80 || 2748 (s[3] & 0xc0) != 0x80 || 2749 ((unsigned char)s[0] == 0xF0 && 2750 (unsigned char)s[1] < 0x90) || 2751 ((unsigned char)s[0] == 0xF4 && 2752 (unsigned char)s[1] > 0x8F)) { 2753 errmsg = "invalid continuation byte"; 2754 startinpos = s-starts; 2755 endinpos = startinpos + 1; 2756 if ((s[1] & 0xC0) == 0x80) { 2757 endinpos++; 2758 if ((s[2] & 0xC0) == 0x80) 2759 endinpos++; 2760 } 2761 goto utf8Error; 2762 } 2763 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2764 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2765 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2766 2767#ifdef Py_UNICODE_WIDE 2768 *p++ = (Py_UNICODE)ch; 2769#else 2770 /* compute and append the two surrogates: */ 2771 2772 /* translate from 10000..10FFFF to 0..FFFF */ 2773 ch -= 0x10000; 2774 2775 /* high surrogate = top 10 bits added to D800 */ 2776 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2777 2778 /* low surrogate = bottom 10 bits added to DC00 */ 2779 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2780#endif 2781 break; 2782 } 2783 s += n; 2784 
continue; 2785 2786 utf8Error: 2787 outpos = p-PyUnicode_AS_UNICODE(unicode); 2788 if (unicode_decode_call_errorhandler( 2789 errors, &errorHandler, 2790 "utf8", errmsg, 2791 &starts, &e, &startinpos, &endinpos, &exc, &s, 2792 &unicode, &outpos, &p)) 2793 goto onError; 2794 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2795 } 2796 if (consumed) 2797 *consumed = s-starts; 2798 2799 /* Adjust length */ 2800 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2801 goto onError; 2802 2803 Py_XDECREF(errorHandler); 2804 Py_XDECREF(exc); 2805 return (PyObject *)unicode; 2806 2807 onError: 2808 Py_XDECREF(errorHandler); 2809 Py_XDECREF(exc); 2810 Py_DECREF(unicode); 2811 return NULL; 2812} 2813 2814#undef ASCII_CHAR_MASK 2815 2816#ifdef __APPLE__ 2817 2818/* Simplified UTF-8 decoder using surrogateescape error handler, 2819 used to decode the command line arguments on Mac OS X. */ 2820 2821wchar_t* 2822_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 2823{ 2824 int n; 2825 const char *e; 2826 wchar_t *unicode, *p; 2827 2828 /* Note: size will always be longer than the resulting Unicode 2829 character count */ 2830 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 2831 PyErr_NoMemory(); 2832 return NULL; 2833 } 2834 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 2835 if (!unicode) 2836 return NULL; 2837 2838 /* Unpack UTF-8 encoded data */ 2839 p = unicode; 2840 e = s + size; 2841 while (s < e) { 2842 Py_UCS4 ch = (unsigned char)*s; 2843 2844 if (ch < 0x80) { 2845 *p++ = (wchar_t)ch; 2846 s++; 2847 continue; 2848 } 2849 2850 n = utf8_code_length[ch]; 2851 if (s + n > e) { 2852 goto surrogateescape; 2853 } 2854 2855 switch (n) { 2856 case 0: 2857 case 1: 2858 goto surrogateescape; 2859 2860 case 2: 2861 if ((s[1] & 0xc0) != 0x80) 2862 goto surrogateescape; 2863 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2864 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2865 *p++ = (wchar_t)ch; 2866 break; 2867 2868 case 3: 2869 /* Decoding UTF-8 
sequences in range \xed\xa0\x80-\xed\xbf\xbf 2870 will result in surrogates in range d800-dfff. Surrogates are 2871 not valid UTF-8 so they are rejected. 2872 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2873 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2874 if ((s[1] & 0xc0) != 0x80 || 2875 (s[2] & 0xc0) != 0x80 || 2876 ((unsigned char)s[0] == 0xE0 && 2877 (unsigned char)s[1] < 0xA0) || 2878 ((unsigned char)s[0] == 0xED && 2879 (unsigned char)s[1] > 0x9F)) { 2880 2881 goto surrogateescape; 2882 } 2883 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2884 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2885 *p++ = (Py_UNICODE)ch; 2886 break; 2887 2888 case 4: 2889 if ((s[1] & 0xc0) != 0x80 || 2890 (s[2] & 0xc0) != 0x80 || 2891 (s[3] & 0xc0) != 0x80 || 2892 ((unsigned char)s[0] == 0xF0 && 2893 (unsigned char)s[1] < 0x90) || 2894 ((unsigned char)s[0] == 0xF4 && 2895 (unsigned char)s[1] > 0x8F)) { 2896 goto surrogateescape; 2897 } 2898 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2899 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2900 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2901 2902#if SIZEOF_WCHAR_T == 4 2903 *p++ = (wchar_t)ch; 2904#else 2905 /* compute and append the two surrogates: */ 2906 2907 /* translate from 10000..10FFFF to 0..FFFF */ 2908 ch -= 0x10000; 2909 2910 /* high surrogate = top 10 bits added to D800 */ 2911 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 2912 2913 /* low surrogate = bottom 10 bits added to DC00 */ 2914 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 2915#endif 2916 break; 2917 } 2918 s += n; 2919 continue; 2920 2921 surrogateescape: 2922 *p++ = 0xDC00 + ch; 2923 s++; 2924 } 2925 *p = L'\0'; 2926 return unicode; 2927} 2928 2929#endif /* __APPLE__ */ 2930 2931/* Allocation strategy: if the string is short, convert into a stack buffer 2932 and allocate exactly as much space needed at the end. 
Else allocate the 2933 maximum possible needed (4 result bytes per Unicode character), and return 2934 the excess memory at the end. 2935*/ 2936PyObject * 2937PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2938 Py_ssize_t size, 2939 const char *errors) 2940{ 2941#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2942 2943 Py_ssize_t i; /* index into s of next input byte */ 2944 PyObject *result; /* result string object */ 2945 char *p; /* next free byte in output buffer */ 2946 Py_ssize_t nallocated; /* number of result bytes allocated */ 2947 Py_ssize_t nneeded; /* number of result bytes needed */ 2948 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2949 PyObject *errorHandler = NULL; 2950 PyObject *exc = NULL; 2951 2952 assert(s != NULL); 2953 assert(size >= 0); 2954 2955 if (size <= MAX_SHORT_UNICHARS) { 2956 /* Write into the stack buffer; nallocated can't overflow. 2957 * At the end, we'll allocate exactly as much heap space as it 2958 * turns out we need. 2959 */ 2960 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2961 result = NULL; /* will allocate after we're done */ 2962 p = stackbuf; 2963 } 2964 else { 2965 /* Overallocate on the heap, and give the excess back at the end. */ 2966 nallocated = size * 4; 2967 if (nallocated / 4 != size) /* overflow! 
*/ 2968 return PyErr_NoMemory(); 2969 result = PyBytes_FromStringAndSize(NULL, nallocated); 2970 if (result == NULL) 2971 return NULL; 2972 p = PyBytes_AS_STRING(result); 2973 } 2974 2975 for (i = 0; i < size;) { 2976 Py_UCS4 ch = s[i++]; 2977 2978 if (ch < 0x80) 2979 /* Encode ASCII */ 2980 *p++ = (char) ch; 2981 2982 else if (ch < 0x0800) { 2983 /* Encode Latin-1 */ 2984 *p++ = (char)(0xc0 | (ch >> 6)); 2985 *p++ = (char)(0x80 | (ch & 0x3f)); 2986 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2987#ifndef Py_UNICODE_WIDE 2988 /* Special case: check for high and low surrogate */ 2989 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2990 Py_UCS4 ch2 = s[i]; 2991 /* Combine the two surrogates to form a UCS4 value */ 2992 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2993 i++; 2994 2995 /* Encode UCS4 Unicode ordinals */ 2996 *p++ = (char)(0xf0 | (ch >> 18)); 2997 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2998 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2999 *p++ = (char)(0x80 | (ch & 0x3f)); 3000 } else { 3001#endif 3002 Py_ssize_t newpos; 3003 PyObject *rep; 3004 Py_ssize_t repsize, k; 3005 rep = unicode_encode_call_errorhandler 3006 (errors, &errorHandler, "utf-8", "surrogates not allowed", 3007 s, size, &exc, i-1, i, &newpos); 3008 if (!rep) 3009 goto error; 3010 3011 if (PyBytes_Check(rep)) 3012 repsize = PyBytes_GET_SIZE(rep); 3013 else 3014 repsize = PyUnicode_GET_SIZE(rep); 3015 3016 if (repsize > 4) { 3017 Py_ssize_t offset; 3018 3019 if (result == NULL) 3020 offset = p - stackbuf; 3021 else 3022 offset = p - PyBytes_AS_STRING(result); 3023 3024 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 3025 /* integer overflow */ 3026 PyErr_NoMemory(); 3027 goto error; 3028 } 3029 nallocated += repsize - 4; 3030 if (result != NULL) { 3031 if (_PyBytes_Resize(&result, nallocated) < 0) 3032 goto error; 3033 } else { 3034 result = PyBytes_FromStringAndSize(NULL, nallocated); 3035 if (result == NULL) 3036 goto error; 3037 
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 3038 } 3039 p = PyBytes_AS_STRING(result) + offset; 3040 } 3041 3042 if (PyBytes_Check(rep)) { 3043 char *prep = PyBytes_AS_STRING(rep); 3044 for(k = repsize; k > 0; k--) 3045 *p++ = *prep++; 3046 } else /* rep is unicode */ { 3047 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 3048 Py_UNICODE c; 3049 3050 for(k=0; k<repsize; k++) { 3051 c = prep[k]; 3052 if (0x80 <= c) { 3053 raise_encode_exception(&exc, "utf-8", s, size, 3054 i-1, i, "surrogates not allowed"); 3055 goto error; 3056 } 3057 *p++ = (char)prep[k]; 3058 } 3059 } 3060 Py_DECREF(rep); 3061#ifndef Py_UNICODE_WIDE 3062 } 3063#endif 3064 } else if (ch < 0x10000) { 3065 *p++ = (char)(0xe0 | (ch >> 12)); 3066 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3067 *p++ = (char)(0x80 | (ch & 0x3f)); 3068 } else /* ch >= 0x10000 */ { 3069 /* Encode UCS4 Unicode ordinals */ 3070 *p++ = (char)(0xf0 | (ch >> 18)); 3071 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 3072 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3073 *p++ = (char)(0x80 | (ch & 0x3f)); 3074 } 3075 } 3076 3077 if (result == NULL) { 3078 /* This was stack allocated. */ 3079 nneeded = p - stackbuf; 3080 assert(nneeded <= nallocated); 3081 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 3082 } 3083 else { 3084 /* Cut back to size actually needed. 
*/ 3085 nneeded = p - PyBytes_AS_STRING(result); 3086 assert(nneeded <= nallocated); 3087 _PyBytes_Resize(&result, nneeded); 3088 } 3089 Py_XDECREF(errorHandler); 3090 Py_XDECREF(exc); 3091 return result; 3092 error: 3093 Py_XDECREF(errorHandler); 3094 Py_XDECREF(exc); 3095 Py_XDECREF(result); 3096 return NULL; 3097 3098#undef MAX_SHORT_UNICHARS 3099} 3100 3101PyObject * 3102PyUnicode_AsUTF8String(PyObject *unicode) 3103{ 3104 PyObject *utf8; 3105 if (!PyUnicode_Check(unicode)) { 3106 PyErr_BadArgument(); 3107 return NULL; 3108 } 3109 utf8 = _PyUnicode_AsDefaultEncodedString(unicode); 3110 if (utf8 == NULL) 3111 return NULL; 3112 Py_INCREF(utf8); 3113 return utf8; 3114} 3115 3116/* --- UTF-32 Codec ------------------------------------------------------- */ 3117 3118PyObject * 3119PyUnicode_DecodeUTF32(const char *s, 3120 Py_ssize_t size, 3121 const char *errors, 3122 int *byteorder) 3123{ 3124 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 3125} 3126 3127PyObject * 3128PyUnicode_DecodeUTF32Stateful(const char *s, 3129 Py_ssize_t size, 3130 const char *errors, 3131 int *byteorder, 3132 Py_ssize_t *consumed) 3133{ 3134 const char *starts = s; 3135 Py_ssize_t startinpos; 3136 Py_ssize_t endinpos; 3137 Py_ssize_t outpos; 3138 PyUnicodeObject *unicode; 3139 Py_UNICODE *p; 3140#ifndef Py_UNICODE_WIDE 3141 int pairs = 0; 3142 const unsigned char *qq; 3143#else 3144 const int pairs = 0; 3145#endif 3146 const unsigned char *q, *e; 3147 int bo = 0; /* assume native ordering by default */ 3148 const char *errmsg = ""; 3149 /* Offsets from q for retrieving bytes in the right order. 
*/ 3150#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3151 int iorder[] = {0, 1, 2, 3}; 3152#else 3153 int iorder[] = {3, 2, 1, 0}; 3154#endif 3155 PyObject *errorHandler = NULL; 3156 PyObject *exc = NULL; 3157 3158 q = (unsigned char *)s; 3159 e = q + size; 3160 3161 if (byteorder) 3162 bo = *byteorder; 3163 3164 /* Check for BOM marks (U+FEFF) in the input and adjust current 3165 byte order setting accordingly. In native mode, the leading BOM 3166 mark is skipped, in all other modes, it is copied to the output 3167 stream as-is (giving a ZWNBSP character). */ 3168 if (bo == 0) { 3169 if (size >= 4) { 3170 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3171 (q[iorder[1]] << 8) | q[iorder[0]]; 3172#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3173 if (bom == 0x0000FEFF) { 3174 q += 4; 3175 bo = -1; 3176 } 3177 else if (bom == 0xFFFE0000) { 3178 q += 4; 3179 bo = 1; 3180 } 3181#else 3182 if (bom == 0x0000FEFF) { 3183 q += 4; 3184 bo = 1; 3185 } 3186 else if (bom == 0xFFFE0000) { 3187 q += 4; 3188 bo = -1; 3189 } 3190#endif 3191 } 3192 } 3193 3194 if (bo == -1) { 3195 /* force LE */ 3196 iorder[0] = 0; 3197 iorder[1] = 1; 3198 iorder[2] = 2; 3199 iorder[3] = 3; 3200 } 3201 else if (bo == 1) { 3202 /* force BE */ 3203 iorder[0] = 3; 3204 iorder[1] = 2; 3205 iorder[2] = 1; 3206 iorder[3] = 0; 3207 } 3208 3209 /* On narrow builds we split characters outside the BMP into two 3210 codepoints => count how much extra space we need. */ 3211#ifndef Py_UNICODE_WIDE 3212 for (qq = q; qq < e; qq += 4) 3213 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 3214 pairs++; 3215#endif 3216 3217 /* This might be one to much, because of a BOM */ 3218 unicode = _PyUnicode_New((size+3)/4+pairs); 3219 if (!unicode) 3220 return NULL; 3221 if (size == 0) 3222 return (PyObject *)unicode; 3223 3224 /* Unpack UTF-32 encoded data */ 3225 p = unicode->str; 3226 3227 while (q < e) { 3228 Py_UCS4 ch; 3229 /* remaining bytes at the end? 
(size should be divisible by 4) */ 3230 if (e-q<4) { 3231 if (consumed) 3232 break; 3233 errmsg = "truncated data"; 3234 startinpos = ((const char *)q)-starts; 3235 endinpos = ((const char *)e)-starts; 3236 goto utf32Error; 3237 /* The remaining input chars are ignored if the callback 3238 chooses to skip the input */ 3239 } 3240 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3241 (q[iorder[1]] << 8) | q[iorder[0]]; 3242 3243 if (ch >= 0x110000) 3244 { 3245 errmsg = "codepoint not in range(0x110000)"; 3246 startinpos = ((const char *)q)-starts; 3247 endinpos = startinpos+4; 3248 goto utf32Error; 3249 } 3250#ifndef Py_UNICODE_WIDE 3251 if (ch >= 0x10000) 3252 { 3253 *p++ = 0xD800 | ((ch-0x10000) >> 10); 3254 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 3255 } 3256 else 3257#endif 3258 *p++ = ch; 3259 q += 4; 3260 continue; 3261 utf32Error: 3262 outpos = p-PyUnicode_AS_UNICODE(unicode); 3263 if (unicode_decode_call_errorhandler( 3264 errors, &errorHandler, 3265 "utf32", errmsg, 3266 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 3267 &unicode, &outpos, &p)) 3268 goto onError; 3269 } 3270 3271 if (byteorder) 3272 *byteorder = bo; 3273 3274 if (consumed) 3275 *consumed = (const char *)q-starts; 3276 3277 /* Adjust length */ 3278 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3279 goto onError; 3280 3281 Py_XDECREF(errorHandler); 3282 Py_XDECREF(exc); 3283 return (PyObject *)unicode; 3284 3285 onError: 3286 Py_DECREF(unicode); 3287 Py_XDECREF(errorHandler); 3288 Py_XDECREF(exc); 3289 return NULL; 3290} 3291 3292PyObject * 3293PyUnicode_EncodeUTF32(const Py_UNICODE *s, 3294 Py_ssize_t size, 3295 const char *errors, 3296 int byteorder) 3297{ 3298 PyObject *v; 3299 unsigned char *p; 3300 Py_ssize_t nsize, bytesize; 3301#ifndef Py_UNICODE_WIDE 3302 Py_ssize_t i, pairs; 3303#else 3304 const int pairs = 0; 3305#endif 3306 /* Offsets from p for storing byte pairs in the right order. 
*/ 3307#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3308 int iorder[] = {0, 1, 2, 3}; 3309#else 3310 int iorder[] = {3, 2, 1, 0}; 3311#endif 3312 3313#define STORECHAR(CH) \ 3314 do { \ 3315 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 3316 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 3317 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 3318 p[iorder[0]] = (CH) & 0xff; \ 3319 p += 4; \ 3320 } while(0) 3321 3322 /* In narrow builds we can output surrogate pairs as one codepoint, 3323 so we need less space. */ 3324#ifndef Py_UNICODE_WIDE 3325 for (i = pairs = 0; i < size-1; i++) 3326 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 3327 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 3328 pairs++; 3329#endif 3330 nsize = (size - pairs + (byteorder == 0)); 3331 bytesize = nsize * 4; 3332 if (bytesize / 4 != nsize) 3333 return PyErr_NoMemory(); 3334 v = PyBytes_FromStringAndSize(NULL, bytesize); 3335 if (v == NULL) 3336 return NULL; 3337 3338 p = (unsigned char *)PyBytes_AS_STRING(v); 3339 if (byteorder == 0) 3340 STORECHAR(0xFEFF); 3341 if (size == 0) 3342 goto done; 3343 3344 if (byteorder == -1) { 3345 /* force LE */ 3346 iorder[0] = 0; 3347 iorder[1] = 1; 3348 iorder[2] = 2; 3349 iorder[3] = 3; 3350 } 3351 else if (byteorder == 1) { 3352 /* force BE */ 3353 iorder[0] = 3; 3354 iorder[1] = 2; 3355 iorder[2] = 1; 3356 iorder[3] = 0; 3357 } 3358 3359 while (size-- > 0) { 3360 Py_UCS4 ch = *s++; 3361#ifndef Py_UNICODE_WIDE 3362 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 3363 Py_UCS4 ch2 = *s; 3364 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3365 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3366 s++; 3367 size--; 3368 } 3369 } 3370#endif 3371 STORECHAR(ch); 3372 } 3373 3374 done: 3375 return v; 3376#undef STORECHAR 3377} 3378 3379PyObject * 3380PyUnicode_AsUTF32String(PyObject *unicode) 3381{ 3382 if (!PyUnicode_Check(unicode)) { 3383 PyErr_BadArgument(); 3384 return NULL; 3385 } 3386 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3387 PyUnicode_GET_SIZE(unicode), 3388 NULL, 3389 0); 3390} 3391 3392/* 
--- UTF-16 Codec ------------------------------------------------------- */ 3393 3394PyObject * 3395PyUnicode_DecodeUTF16(const char *s, 3396 Py_ssize_t size, 3397 const char *errors, 3398 int *byteorder) 3399{ 3400 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3401} 3402 3403/* Two masks for fast checking of whether a C 'long' may contain 3404 UTF16-encoded surrogate characters. This is an efficient heuristic, 3405 assuming that non-surrogate characters with a code point >= 0x8000 are 3406 rare in most input. 3407 FAST_CHAR_MASK is used when the input is in native byte ordering, 3408 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3409*/ 3410#if (SIZEOF_LONG == 8) 3411# define FAST_CHAR_MASK 0x8000800080008000L 3412# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3413#elif (SIZEOF_LONG == 4) 3414# define FAST_CHAR_MASK 0x80008000L 3415# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3416#else 3417# error C 'long' size should be either 4 or 8! 3418#endif 3419 3420PyObject * 3421PyUnicode_DecodeUTF16Stateful(const char *s, 3422 Py_ssize_t size, 3423 const char *errors, 3424 int *byteorder, 3425 Py_ssize_t *consumed) 3426{ 3427 const char *starts = s; 3428 Py_ssize_t startinpos; 3429 Py_ssize_t endinpos; 3430 Py_ssize_t outpos; 3431 PyUnicodeObject *unicode; 3432 Py_UNICODE *p; 3433 const unsigned char *q, *e, *aligned_end; 3434 int bo = 0; /* assume native ordering by default */ 3435 int native_ordering = 0; 3436 const char *errmsg = ""; 3437 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 3438#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3439 int ihi = 1, ilo = 0; 3440#else 3441 int ihi = 0, ilo = 1; 3442#endif 3443 PyObject *errorHandler = NULL; 3444 PyObject *exc = NULL; 3445 3446 /* Note: size will always be longer than the resulting Unicode 3447 character count */ 3448 unicode = _PyUnicode_New(size); 3449 if (!unicode) 3450 return NULL; 3451 if (size == 0) 3452 return (PyObject *)unicode; 3453 3454 /* Unpack UTF-16 encoded data */ 3455 p = unicode->str; 3456 q = (unsigned char *)s; 3457 e = q + size - 1; 3458 3459 if (byteorder) 3460 bo = *byteorder; 3461 3462 /* Check for BOM marks (U+FEFF) in the input and adjust current 3463 byte order setting accordingly. In native mode, the leading BOM 3464 mark is skipped, in all other modes, it is copied to the output 3465 stream as-is (giving a ZWNBSP character). */ 3466 if (bo == 0) { 3467 if (size >= 2) { 3468 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3469#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3470 if (bom == 0xFEFF) { 3471 q += 2; 3472 bo = -1; 3473 } 3474 else if (bom == 0xFFFE) { 3475 q += 2; 3476 bo = 1; 3477 } 3478#else 3479 if (bom == 0xFEFF) { 3480 q += 2; 3481 bo = 1; 3482 } 3483 else if (bom == 0xFFFE) { 3484 q += 2; 3485 bo = -1; 3486 } 3487#endif 3488 } 3489 } 3490 3491 if (bo == -1) { 3492 /* force LE */ 3493 ihi = 1; 3494 ilo = 0; 3495 } 3496 else if (bo == 1) { 3497 /* force BE */ 3498 ihi = 0; 3499 ilo = 1; 3500 } 3501#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3502 native_ordering = ilo < ihi; 3503#else 3504 native_ordering = ilo > ihi; 3505#endif 3506 3507 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3508 while (q < e) { 3509 Py_UNICODE ch; 3510 /* First check for possible aligned read of a C 'long'. Unaligned 3511 reads are more expensive, better to defer to another iteration. */ 3512 if (!((size_t) q & LONG_PTR_MASK)) { 3513 /* Fast path for runs of non-surrogate chars. 
*/ 3514 register const unsigned char *_q = q; 3515 Py_UNICODE *_p = p; 3516 if (native_ordering) { 3517 /* Native ordering is simple: as long as the input cannot 3518 possibly contain a surrogate char, do an unrolled copy 3519 of several 16-bit code points to the target object. 3520 The non-surrogate check is done on several input bytes 3521 at a time (as many as a C 'long' can contain). */ 3522 while (_q < aligned_end) { 3523 unsigned long data = * (unsigned long *) _q; 3524 if (data & FAST_CHAR_MASK) 3525 break; 3526 _p[0] = ((unsigned short *) _q)[0]; 3527 _p[1] = ((unsigned short *) _q)[1]; 3528#if (SIZEOF_LONG == 8) 3529 _p[2] = ((unsigned short *) _q)[2]; 3530 _p[3] = ((unsigned short *) _q)[3]; 3531#endif 3532 _q += SIZEOF_LONG; 3533 _p += SIZEOF_LONG / 2; 3534 } 3535 } 3536 else { 3537 /* Byteswapped ordering is similar, but we must decompose 3538 the copy bytewise, and take care of zero'ing out the 3539 upper bytes if the target object is in 32-bit units 3540 (that is, in UCS-4 builds). */ 3541 while (_q < aligned_end) { 3542 unsigned long data = * (unsigned long *) _q; 3543 if (data & SWAPPED_FAST_CHAR_MASK) 3544 break; 3545 /* Zero upper bytes in UCS-4 builds */ 3546#if (Py_UNICODE_SIZE > 2) 3547 _p[0] = 0; 3548 _p[1] = 0; 3549#if (SIZEOF_LONG == 8) 3550 _p[2] = 0; 3551 _p[3] = 0; 3552#endif 3553#endif 3554 /* Issue #4916; UCS-4 builds on big endian machines must 3555 fill the two last bytes of each 4-byte unit. 
*/ 3556#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3557# define OFF 2 3558#else 3559# define OFF 0 3560#endif 3561 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3562 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3563 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3564 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3565#if (SIZEOF_LONG == 8) 3566 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3567 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3568 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3569 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3570#endif 3571#undef OFF 3572 _q += SIZEOF_LONG; 3573 _p += SIZEOF_LONG / 2; 3574 } 3575 } 3576 p = _p; 3577 q = _q; 3578 if (q >= e) 3579 break; 3580 } 3581 ch = (q[ihi] << 8) | q[ilo]; 3582 3583 q += 2; 3584 3585 if (ch < 0xD800 || ch > 0xDFFF) { 3586 *p++ = ch; 3587 continue; 3588 } 3589 3590 /* UTF-16 code pair: */ 3591 if (q > e) { 3592 errmsg = "unexpected end of data"; 3593 startinpos = (((const char *)q) - 2) - starts; 3594 endinpos = ((const char *)e) + 1 - starts; 3595 goto utf16Error; 3596 } 3597 if (0xD800 <= ch && ch <= 0xDBFF) { 3598 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3599 q += 2; 3600 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3601#ifndef Py_UNICODE_WIDE 3602 *p++ = ch; 3603 *p++ = ch2; 3604#else 3605 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3606#endif 3607 continue; 3608 } 3609 else { 3610 errmsg = "illegal UTF-16 surrogate"; 3611 startinpos = (((const char *)q)-4)-starts; 3612 endinpos = startinpos+2; 3613 goto utf16Error; 3614 } 3615 3616 } 3617 errmsg = "illegal encoding"; 3618 startinpos = (((const char *)q)-2)-starts; 3619 endinpos = startinpos+2; 3620 /* Fall through to report the error */ 3621 3622 utf16Error: 3623 outpos = p - PyUnicode_AS_UNICODE(unicode); 3624 if (unicode_decode_call_errorhandler( 3625 errors, 3626 &errorHandler, 3627 "utf16", errmsg, 3628 &starts, 3629 (const char 
**)&e, 3630 &startinpos, 3631 &endinpos, 3632 &exc, 3633 (const char **)&q, 3634 &unicode, 3635 &outpos, 3636 &p)) 3637 goto onError; 3638 } 3639 /* remaining byte at the end? (size should be even) */ 3640 if (e == q) { 3641 if (!consumed) { 3642 errmsg = "truncated data"; 3643 startinpos = ((const char *)q) - starts; 3644 endinpos = ((const char *)e) + 1 - starts; 3645 outpos = p - PyUnicode_AS_UNICODE(unicode); 3646 if (unicode_decode_call_errorhandler( 3647 errors, 3648 &errorHandler, 3649 "utf16", errmsg, 3650 &starts, 3651 (const char **)&e, 3652 &startinpos, 3653 &endinpos, 3654 &exc, 3655 (const char **)&q, 3656 &unicode, 3657 &outpos, 3658 &p)) 3659 goto onError; 3660 /* The remaining input chars are ignored if the callback 3661 chooses to skip the input */ 3662 } 3663 } 3664 3665 if (byteorder) 3666 *byteorder = bo; 3667 3668 if (consumed) 3669 *consumed = (const char *)q-starts; 3670 3671 /* Adjust length */ 3672 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3673 goto onError; 3674 3675 Py_XDECREF(errorHandler); 3676 Py_XDECREF(exc); 3677 return (PyObject *)unicode; 3678 3679 onError: 3680 Py_DECREF(unicode); 3681 Py_XDECREF(errorHandler); 3682 Py_XDECREF(exc); 3683 return NULL; 3684} 3685 3686#undef FAST_CHAR_MASK 3687#undef SWAPPED_FAST_CHAR_MASK 3688 3689PyObject * 3690PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3691 Py_ssize_t size, 3692 const char *errors, 3693 int byteorder) 3694{ 3695 PyObject *v; 3696 unsigned char *p; 3697 Py_ssize_t nsize, bytesize; 3698#ifdef Py_UNICODE_WIDE 3699 Py_ssize_t i, pairs; 3700#else 3701 const int pairs = 0; 3702#endif 3703 /* Offsets from p for storing byte pairs in the right order. 
*/ 3704#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3705 int ihi = 1, ilo = 0; 3706#else 3707 int ihi = 0, ilo = 1; 3708#endif 3709 3710#define STORECHAR(CH) \ 3711 do { \ 3712 p[ihi] = ((CH) >> 8) & 0xff; \ 3713 p[ilo] = (CH) & 0xff; \ 3714 p += 2; \ 3715 } while(0) 3716 3717#ifdef Py_UNICODE_WIDE 3718 for (i = pairs = 0; i < size; i++) 3719 if (s[i] >= 0x10000) 3720 pairs++; 3721#endif 3722 /* 2 * (size + pairs + (byteorder == 0)) */ 3723 if (size > PY_SSIZE_T_MAX || 3724 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3725 return PyErr_NoMemory(); 3726 nsize = size + pairs + (byteorder == 0); 3727 bytesize = nsize * 2; 3728 if (bytesize / 2 != nsize) 3729 return PyErr_NoMemory(); 3730 v = PyBytes_FromStringAndSize(NULL, bytesize); 3731 if (v == NULL) 3732 return NULL; 3733 3734 p = (unsigned char *)PyBytes_AS_STRING(v); 3735 if (byteorder == 0) 3736 STORECHAR(0xFEFF); 3737 if (size == 0) 3738 goto done; 3739 3740 if (byteorder == -1) { 3741 /* force LE */ 3742 ihi = 1; 3743 ilo = 0; 3744 } 3745 else if (byteorder == 1) { 3746 /* force BE */ 3747 ihi = 0; 3748 ilo = 1; 3749 } 3750 3751 while (size-- > 0) { 3752 Py_UNICODE ch = *s++; 3753 Py_UNICODE ch2 = 0; 3754#ifdef Py_UNICODE_WIDE 3755 if (ch >= 0x10000) { 3756 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3757 ch = 0xD800 | ((ch-0x10000) >> 10); 3758 } 3759#endif 3760 STORECHAR(ch); 3761 if (ch2) 3762 STORECHAR(ch2); 3763 } 3764 3765 done: 3766 return v; 3767#undef STORECHAR 3768} 3769 3770PyObject * 3771PyUnicode_AsUTF16String(PyObject *unicode) 3772{ 3773 if (!PyUnicode_Check(unicode)) { 3774 PyErr_BadArgument(); 3775 return NULL; 3776 } 3777 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3778 PyUnicode_GET_SIZE(unicode), 3779 NULL, 3780 0); 3781} 3782 3783/* --- Unicode Escape Codec ----------------------------------------------- */ 3784 3785static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3786 3787PyObject * 3788PyUnicode_DecodeUnicodeEscape(const char *s, 3789 Py_ssize_t size, 3790 const char *errors) 
3791{ 3792 const char *starts = s; 3793 Py_ssize_t startinpos; 3794 Py_ssize_t endinpos; 3795 Py_ssize_t outpos; 3796 int i; 3797 PyUnicodeObject *v; 3798 Py_UNICODE *p; 3799 const char *end; 3800 char* message; 3801 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3802 PyObject *errorHandler = NULL; 3803 PyObject *exc = NULL; 3804 3805 /* Escaped strings will always be longer than the resulting 3806 Unicode string, so we start with size here and then reduce the 3807 length after conversion to the true value. 3808 (but if the error callback returns a long replacement string 3809 we'll have to allocate more space) */ 3810 v = _PyUnicode_New(size); 3811 if (v == NULL) 3812 goto onError; 3813 if (size == 0) 3814 return (PyObject *)v; 3815 3816 p = PyUnicode_AS_UNICODE(v); 3817 end = s + size; 3818 3819 while (s < end) { 3820 unsigned char c; 3821 Py_UNICODE x; 3822 int digits; 3823 3824 /* Non-escape characters are interpreted as Unicode ordinals */ 3825 if (*s != '\\') { 3826 *p++ = (unsigned char) *s++; 3827 continue; 3828 } 3829 3830 startinpos = s-starts; 3831 /* \ - Escapes */ 3832 s++; 3833 c = *s++; 3834 if (s > end) 3835 c = '\0'; /* Invalid after \ */ 3836 switch (c) { 3837 3838 /* \x escapes */ 3839 case '\n': break; 3840 case '\\': *p++ = '\\'; break; 3841 case '\'': *p++ = '\''; break; 3842 case '\"': *p++ = '\"'; break; 3843 case 'b': *p++ = '\b'; break; 3844 case 'f': *p++ = '\014'; break; /* FF */ 3845 case 't': *p++ = '\t'; break; 3846 case 'n': *p++ = '\n'; break; 3847 case 'r': *p++ = '\r'; break; 3848 case 'v': *p++ = '\013'; break; /* VT */ 3849 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3850 3851 /* \OOO (octal) escapes */ 3852 case '0': case '1': case '2': case '3': 3853 case '4': case '5': case '6': case '7': 3854 x = s[-1] - '0'; 3855 if (s < end && '0' <= *s && *s <= '7') { 3856 x = (x<<3) + *s++ - '0'; 3857 if (s < end && '0' <= *s && *s <= '7') 3858 x = (x<<3) + *s++ - '0'; 3859 } 3860 *p++ = x; 3861 break; 3862 3863 
/* hex escapes */ 3864 /* \xXX */ 3865 case 'x': 3866 digits = 2; 3867 message = "truncated \\xXX escape"; 3868 goto hexescape; 3869 3870 /* \uXXXX */ 3871 case 'u': 3872 digits = 4; 3873 message = "truncated \\uXXXX escape"; 3874 goto hexescape; 3875 3876 /* \UXXXXXXXX */ 3877 case 'U': 3878 digits = 8; 3879 message = "truncated \\UXXXXXXXX escape"; 3880 hexescape: 3881 chr = 0; 3882 outpos = p-PyUnicode_AS_UNICODE(v); 3883 if (s+digits>end) { 3884 endinpos = size; 3885 if (unicode_decode_call_errorhandler( 3886 errors, &errorHandler, 3887 "unicodeescape", "end of string in escape sequence", 3888 &starts, &end, &startinpos, &endinpos, &exc, &s, 3889 &v, &outpos, &p)) 3890 goto onError; 3891 goto nextByte; 3892 } 3893 for (i = 0; i < digits; ++i) { 3894 c = (unsigned char) s[i]; 3895 if (!Py_ISXDIGIT(c)) { 3896 endinpos = (s+i+1)-starts; 3897 if (unicode_decode_call_errorhandler( 3898 errors, &errorHandler, 3899 "unicodeescape", message, 3900 &starts, &end, &startinpos, &endinpos, &exc, &s, 3901 &v, &outpos, &p)) 3902 goto onError; 3903 goto nextByte; 3904 } 3905 chr = (chr<<4) & ~0xF; 3906 if (c >= '0' && c <= '9') 3907 chr += c - '0'; 3908 else if (c >= 'a' && c <= 'f') 3909 chr += 10 + c - 'a'; 3910 else 3911 chr += 10 + c - 'A'; 3912 } 3913 s += i; 3914 if (chr == 0xffffffff && PyErr_Occurred()) 3915 /* _decoding_error will have already written into the 3916 target buffer. */ 3917 break; 3918 store: 3919 /* when we get here, chr is a 32-bit unicode character */ 3920 if (chr <= 0xffff) 3921 /* UCS-2 character */ 3922 *p++ = (Py_UNICODE) chr; 3923 else if (chr <= 0x10ffff) { 3924 /* UCS-4 character. Either store directly, or as 3925 surrogate pair. 
*/ 3926#ifdef Py_UNICODE_WIDE 3927 *p++ = chr; 3928#else 3929 chr -= 0x10000L; 3930 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3931 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3932#endif 3933 } else { 3934 endinpos = s-starts; 3935 outpos = p-PyUnicode_AS_UNICODE(v); 3936 if (unicode_decode_call_errorhandler( 3937 errors, &errorHandler, 3938 "unicodeescape", "illegal Unicode character", 3939 &starts, &end, &startinpos, &endinpos, &exc, &s, 3940 &v, &outpos, &p)) 3941 goto onError; 3942 } 3943 break; 3944 3945 /* \N{name} */ 3946 case 'N': 3947 message = "malformed \\N character escape"; 3948 if (ucnhash_CAPI == NULL) { 3949 /* load the unicode data module */ 3950 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3951 if (ucnhash_CAPI == NULL) 3952 goto ucnhashError; 3953 } 3954 if (*s == '{') { 3955 const char *start = s+1; 3956 /* look for the closing brace */ 3957 while (*s != '}' && s < end) 3958 s++; 3959 if (s > start && s < end && *s == '}') { 3960 /* found a name. 
look it up in the unicode database */ 3961 message = "unknown Unicode character name"; 3962 s++; 3963 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3964 goto store; 3965 } 3966 } 3967 endinpos = s-starts; 3968 outpos = p-PyUnicode_AS_UNICODE(v); 3969 if (unicode_decode_call_errorhandler( 3970 errors, &errorHandler, 3971 "unicodeescape", message, 3972 &starts, &end, &startinpos, &endinpos, &exc, &s, 3973 &v, &outpos, &p)) 3974 goto onError; 3975 break; 3976 3977 default: 3978 if (s > end) { 3979 message = "\\ at end of string"; 3980 s--; 3981 endinpos = s-starts; 3982 outpos = p-PyUnicode_AS_UNICODE(v); 3983 if (unicode_decode_call_errorhandler( 3984 errors, &errorHandler, 3985 "unicodeescape", message, 3986 &starts, &end, &startinpos, &endinpos, &exc, &s, 3987 &v, &outpos, &p)) 3988 goto onError; 3989 } 3990 else { 3991 *p++ = '\\'; 3992 *p++ = (unsigned char)s[-1]; 3993 } 3994 break; 3995 } 3996 nextByte: 3997 ; 3998 } 3999 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4000 goto onError; 4001 Py_XDECREF(errorHandler); 4002 Py_XDECREF(exc); 4003 return (PyObject *)v; 4004 4005 ucnhashError: 4006 PyErr_SetString( 4007 PyExc_UnicodeError, 4008 "\\N escapes not supported (can't load unicodedata module)" 4009 ); 4010 Py_XDECREF(v); 4011 Py_XDECREF(errorHandler); 4012 Py_XDECREF(exc); 4013 return NULL; 4014 4015 onError: 4016 Py_XDECREF(v); 4017 Py_XDECREF(errorHandler); 4018 Py_XDECREF(exc); 4019 return NULL; 4020} 4021 4022/* Return a Unicode-Escape string version of the Unicode object. 4023 4024 If quotes is true, the string is enclosed in u"" or u'' quotes as 4025 appropriate. 
4026 4027*/ 4028 4029Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 4030 Py_ssize_t size, 4031 Py_UNICODE ch) 4032{ 4033 /* like wcschr, but doesn't stop at NULL characters */ 4034 4035 while (size-- > 0) { 4036 if (*s == ch) 4037 return s; 4038 s++; 4039 } 4040 4041 return NULL; 4042} 4043 4044static const char *hexdigits = "0123456789abcdef"; 4045 4046PyObject * 4047PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 4048 Py_ssize_t size) 4049{ 4050 PyObject *repr; 4051 char *p; 4052 4053#ifdef Py_UNICODE_WIDE 4054 const Py_ssize_t expandsize = 10; 4055#else 4056 const Py_ssize_t expandsize = 6; 4057#endif 4058 4059 /* XXX(nnorwitz): rather than over-allocating, it would be 4060 better to choose a different scheme. Perhaps scan the 4061 first N-chars of the string and allocate based on that size. 4062 */ 4063 /* Initial allocation is based on the longest-possible unichr 4064 escape. 4065 4066 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 4067 unichr, so in this case it's the longest unichr escape. In 4068 narrow (UTF-16) builds this is five chars per source unichr 4069 since there are two unichrs in the surrogate pair, so in narrow 4070 (UTF-16) builds it's not the longest unichr escape. 4071 4072 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 4073 so in the narrow (UTF-16) build case it's the longest unichr 4074 escape. 
4075 */ 4076 4077 if (size == 0) 4078 return PyBytes_FromStringAndSize(NULL, 0); 4079 4080 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 4081 return PyErr_NoMemory(); 4082 4083 repr = PyBytes_FromStringAndSize(NULL, 4084 2 4085 + expandsize*size 4086 + 1); 4087 if (repr == NULL) 4088 return NULL; 4089 4090 p = PyBytes_AS_STRING(repr); 4091 4092 while (size-- > 0) { 4093 Py_UNICODE ch = *s++; 4094 4095 /* Escape backslashes */ 4096 if (ch == '\\') { 4097 *p++ = '\\'; 4098 *p++ = (char) ch; 4099 continue; 4100 } 4101 4102#ifdef Py_UNICODE_WIDE 4103 /* Map 21-bit characters to '\U00xxxxxx' */ 4104 else if (ch >= 0x10000) { 4105 *p++ = '\\'; 4106 *p++ = 'U'; 4107 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 4108 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 4109 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 4110 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 4111 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 4112 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 4113 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 4114 *p++ = hexdigits[ch & 0x0000000F]; 4115 continue; 4116 } 4117#else 4118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4119 else if (ch >= 0xD800 && ch < 0xDC00) { 4120 Py_UNICODE ch2; 4121 Py_UCS4 ucs; 4122 4123 ch2 = *s++; 4124 size--; 4125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4127 *p++ = '\\'; 4128 *p++ = 'U'; 4129 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 4130 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 4131 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 4132 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 4133 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 4134 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 4135 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 4136 *p++ = hexdigits[ucs & 0x0000000F]; 4137 continue; 4138 } 4139 /* Fall through: isolated surrogates are copied as-is */ 4140 s--; 4141 size++; 4142 } 4143#endif 4144 4145 /* Map 16-bit characters to '\uxxxx' */ 4146 if (ch >= 256) { 4147 *p++ = '\\'; 4148 *p++ = 
'u'; 4149 *p++ = hexdigits[(ch >> 12) & 0x000F]; 4150 *p++ = hexdigits[(ch >> 8) & 0x000F]; 4151 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4152 *p++ = hexdigits[ch & 0x000F]; 4153 } 4154 4155 /* Map special whitespace to '\t', \n', '\r' */ 4156 else if (ch == '\t') { 4157 *p++ = '\\'; 4158 *p++ = 't'; 4159 } 4160 else if (ch == '\n') { 4161 *p++ = '\\'; 4162 *p++ = 'n'; 4163 } 4164 else if (ch == '\r') { 4165 *p++ = '\\'; 4166 *p++ = 'r'; 4167 } 4168 4169 /* Map non-printable US ASCII to '\xhh' */ 4170 else if (ch < ' ' || ch >= 0x7F) { 4171 *p++ = '\\'; 4172 *p++ = 'x'; 4173 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4174 *p++ = hexdigits[ch & 0x000F]; 4175 } 4176 4177 /* Copy everything else as-is */ 4178 else 4179 *p++ = (char) ch; 4180 } 4181 4182 assert(p - PyBytes_AS_STRING(repr) > 0); 4183 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 4184 return NULL; 4185 return repr; 4186} 4187 4188PyObject * 4189PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 4190{ 4191 PyObject *s; 4192 if (!PyUnicode_Check(unicode)) { 4193 PyErr_BadArgument(); 4194 return NULL; 4195 } 4196 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4197 PyUnicode_GET_SIZE(unicode)); 4198 return s; 4199} 4200 4201/* --- Raw Unicode Escape Codec ------------------------------------------- */ 4202 4203PyObject * 4204PyUnicode_DecodeRawUnicodeEscape(const char *s, 4205 Py_ssize_t size, 4206 const char *errors) 4207{ 4208 const char *starts = s; 4209 Py_ssize_t startinpos; 4210 Py_ssize_t endinpos; 4211 Py_ssize_t outpos; 4212 PyUnicodeObject *v; 4213 Py_UNICODE *p; 4214 const char *end; 4215 const char *bs; 4216 PyObject *errorHandler = NULL; 4217 PyObject *exc = NULL; 4218 4219 /* Escaped strings will always be longer than the resulting 4220 Unicode string, so we start with size here and then reduce the 4221 length after conversion to the true value. 
(But decoding error 4222 handler might have to resize the string) */ 4223 v = _PyUnicode_New(size); 4224 if (v == NULL) 4225 goto onError; 4226 if (size == 0) 4227 return (PyObject *)v; 4228 p = PyUnicode_AS_UNICODE(v); 4229 end = s + size; 4230 while (s < end) { 4231 unsigned char c; 4232 Py_UCS4 x; 4233 int i; 4234 int count; 4235 4236 /* Non-escape characters are interpreted as Unicode ordinals */ 4237 if (*s != '\\') { 4238 *p++ = (unsigned char)*s++; 4239 continue; 4240 } 4241 startinpos = s-starts; 4242 4243 /* \u-escapes are only interpreted iff the number of leading 4244 backslashes if odd */ 4245 bs = s; 4246 for (;s < end;) { 4247 if (*s != '\\') 4248 break; 4249 *p++ = (unsigned char)*s++; 4250 } 4251 if (((s - bs) & 1) == 0 || 4252 s >= end || 4253 (*s != 'u' && *s != 'U')) { 4254 continue; 4255 } 4256 p--; 4257 count = *s=='u' ? 4 : 8; 4258 s++; 4259 4260 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 4261 outpos = p-PyUnicode_AS_UNICODE(v); 4262 for (x = 0, i = 0; i < count; ++i, ++s) { 4263 c = (unsigned char)*s; 4264 if (!Py_ISXDIGIT(c)) { 4265 endinpos = s-starts; 4266 if (unicode_decode_call_errorhandler( 4267 errors, &errorHandler, 4268 "rawunicodeescape", "truncated \\uXXXX", 4269 &starts, &end, &startinpos, &endinpos, &exc, &s, 4270 &v, &outpos, &p)) 4271 goto onError; 4272 goto nextByte; 4273 } 4274 x = (x<<4) & ~0xF; 4275 if (c >= '0' && c <= '9') 4276 x += c - '0'; 4277 else if (c >= 'a' && c <= 'f') 4278 x += 10 + c - 'a'; 4279 else 4280 x += 10 + c - 'A'; 4281 } 4282 if (x <= 0xffff) 4283 /* UCS-2 character */ 4284 *p++ = (Py_UNICODE) x; 4285 else if (x <= 0x10ffff) { 4286 /* UCS-4 character. Either store directly, or as 4287 surrogate pair. 
*/ 4288#ifdef Py_UNICODE_WIDE 4289 *p++ = (Py_UNICODE) x; 4290#else 4291 x -= 0x10000L; 4292 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 4293 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 4294#endif 4295 } else { 4296 endinpos = s-starts; 4297 outpos = p-PyUnicode_AS_UNICODE(v); 4298 if (unicode_decode_call_errorhandler( 4299 errors, &errorHandler, 4300 "rawunicodeescape", "\\Uxxxxxxxx out of range", 4301 &starts, &end, &startinpos, &endinpos, &exc, &s, 4302 &v, &outpos, &p)) 4303 goto onError; 4304 } 4305 nextByte: 4306 ; 4307 } 4308 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4309 goto onError; 4310 Py_XDECREF(errorHandler); 4311 Py_XDECREF(exc); 4312 return (PyObject *)v; 4313 4314 onError: 4315 Py_XDECREF(v); 4316 Py_XDECREF(errorHandler); 4317 Py_XDECREF(exc); 4318 return NULL; 4319} 4320 4321PyObject * 4322PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 4323 Py_ssize_t size) 4324{ 4325 PyObject *repr; 4326 char *p; 4327 char *q; 4328 4329#ifdef Py_UNICODE_WIDE 4330 const Py_ssize_t expandsize = 10; 4331#else 4332 const Py_ssize_t expandsize = 6; 4333#endif 4334 4335 if (size > PY_SSIZE_T_MAX / expandsize) 4336 return PyErr_NoMemory(); 4337 4338 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 4339 if (repr == NULL) 4340 return NULL; 4341 if (size == 0) 4342 return repr; 4343 4344 p = q = PyBytes_AS_STRING(repr); 4345 while (size-- > 0) { 4346 Py_UNICODE ch = *s++; 4347#ifdef Py_UNICODE_WIDE 4348 /* Map 32-bit characters to '\Uxxxxxxxx' */ 4349 if (ch >= 0x10000) { 4350 *p++ = '\\'; 4351 *p++ = 'U'; 4352 *p++ = hexdigits[(ch >> 28) & 0xf]; 4353 *p++ = hexdigits[(ch >> 24) & 0xf]; 4354 *p++ = hexdigits[(ch >> 20) & 0xf]; 4355 *p++ = hexdigits[(ch >> 16) & 0xf]; 4356 *p++ = hexdigits[(ch >> 12) & 0xf]; 4357 *p++ = hexdigits[(ch >> 8) & 0xf]; 4358 *p++ = hexdigits[(ch >> 4) & 0xf]; 4359 *p++ = hexdigits[ch & 15]; 4360 } 4361 else 4362#else 4363 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4364 if (ch >= 0xD800 && ch < 0xDC00) { 4365 
Py_UNICODE ch2; 4366 Py_UCS4 ucs; 4367 4368 ch2 = *s++; 4369 size--; 4370 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4371 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4372 *p++ = '\\'; 4373 *p++ = 'U'; 4374 *p++ = hexdigits[(ucs >> 28) & 0xf]; 4375 *p++ = hexdigits[(ucs >> 24) & 0xf]; 4376 *p++ = hexdigits[(ucs >> 20) & 0xf]; 4377 *p++ = hexdigits[(ucs >> 16) & 0xf]; 4378 *p++ = hexdigits[(ucs >> 12) & 0xf]; 4379 *p++ = hexdigits[(ucs >> 8) & 0xf]; 4380 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4381 *p++ = hexdigits[ucs & 0xf]; 4382 continue; 4383 } 4384 /* Fall through: isolated surrogates are copied as-is */ 4385 s--; 4386 size++; 4387 } 4388#endif 4389 /* Map 16-bit characters to '\uxxxx' */ 4390 if (ch >= 256) { 4391 *p++ = '\\'; 4392 *p++ = 'u'; 4393 *p++ = hexdigits[(ch >> 12) & 0xf]; 4394 *p++ = hexdigits[(ch >> 8) & 0xf]; 4395 *p++ = hexdigits[(ch >> 4) & 0xf]; 4396 *p++ = hexdigits[ch & 15]; 4397 } 4398 /* Copy everything else as-is */ 4399 else 4400 *p++ = (char) ch; 4401 } 4402 size = p - q; 4403 4404 assert(size > 0); 4405 if (_PyBytes_Resize(&repr, size) < 0) 4406 return NULL; 4407 return repr; 4408} 4409 4410PyObject * 4411PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4412{ 4413 PyObject *s; 4414 if (!PyUnicode_Check(unicode)) { 4415 PyErr_BadArgument(); 4416 return NULL; 4417 } 4418 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4419 PyUnicode_GET_SIZE(unicode)); 4420 4421 return s; 4422} 4423 4424/* --- Unicode Internal Codec ------------------------------------------- */ 4425 4426PyObject * 4427_PyUnicode_DecodeUnicodeInternal(const char *s, 4428 Py_ssize_t size, 4429 const char *errors) 4430{ 4431 const char *starts = s; 4432 Py_ssize_t startinpos; 4433 Py_ssize_t endinpos; 4434 Py_ssize_t outpos; 4435 PyUnicodeObject *v; 4436 Py_UNICODE *p; 4437 const char *end; 4438 const char *reason; 4439 PyObject *errorHandler = NULL; 4440 PyObject *exc = NULL; 4441 4442#ifdef Py_UNICODE_WIDE 4443 Py_UNICODE unimax = 
PyUnicode_GetMax(); 4444#endif 4445 4446 /* XXX overflow detection missing */ 4447 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4448 if (v == NULL) 4449 goto onError; 4450 if (PyUnicode_GetSize((PyObject *)v) == 0) 4451 return (PyObject *)v; 4452 p = PyUnicode_AS_UNICODE(v); 4453 end = s + size; 4454 4455 while (s < end) { 4456 memcpy(p, s, sizeof(Py_UNICODE)); 4457 /* We have to sanity check the raw data, otherwise doom looms for 4458 some malformed UCS-4 data. */ 4459 if ( 4460#ifdef Py_UNICODE_WIDE 4461 *p > unimax || *p < 0 || 4462#endif 4463 end-s < Py_UNICODE_SIZE 4464 ) 4465 { 4466 startinpos = s - starts; 4467 if (end-s < Py_UNICODE_SIZE) { 4468 endinpos = end-starts; 4469 reason = "truncated input"; 4470 } 4471 else { 4472 endinpos = s - starts + Py_UNICODE_SIZE; 4473 reason = "illegal code point (> 0x10FFFF)"; 4474 } 4475 outpos = p - PyUnicode_AS_UNICODE(v); 4476 if (unicode_decode_call_errorhandler( 4477 errors, &errorHandler, 4478 "unicode_internal", reason, 4479 &starts, &end, &startinpos, &endinpos, &exc, &s, 4480 &v, &outpos, &p)) { 4481 goto onError; 4482 } 4483 } 4484 else { 4485 p++; 4486 s += Py_UNICODE_SIZE; 4487 } 4488 } 4489 4490 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4491 goto onError; 4492 Py_XDECREF(errorHandler); 4493 Py_XDECREF(exc); 4494 return (PyObject *)v; 4495 4496 onError: 4497 Py_XDECREF(v); 4498 Py_XDECREF(errorHandler); 4499 Py_XDECREF(exc); 4500 return NULL; 4501} 4502 4503/* --- Latin-1 Codec ------------------------------------------------------ */ 4504 4505PyObject * 4506PyUnicode_DecodeLatin1(const char *s, 4507 Py_ssize_t size, 4508 const char *errors) 4509{ 4510 PyUnicodeObject *v; 4511 Py_UNICODE *p; 4512 const char *e, *unrolled_end; 4513 4514 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 4515 if (size == 1) { 4516 Py_UNICODE r = *(unsigned char*)s; 4517 return PyUnicode_FromUnicode(&r, 1); 4518 } 4519 4520 v = _PyUnicode_New(size); 4521 if (v == NULL) 4522 goto onError; 4523 if (size == 0) 4524 return (PyObject *)v; 4525 p = PyUnicode_AS_UNICODE(v); 4526 e = s + size; 4527 /* Unrolling the copy makes it much faster by reducing the looping 4528 overhead. This is similar to what many memcpy() implementations do. */ 4529 unrolled_end = e - 4; 4530 while (s < unrolled_end) { 4531 p[0] = (unsigned char) s[0]; 4532 p[1] = (unsigned char) s[1]; 4533 p[2] = (unsigned char) s[2]; 4534 p[3] = (unsigned char) s[3]; 4535 s += 4; 4536 p += 4; 4537 } 4538 while (s < e) 4539 *p++ = (unsigned char) *s++; 4540 return (PyObject *)v; 4541 4542 onError: 4543 Py_XDECREF(v); 4544 return NULL; 4545} 4546 4547/* create or adjust a UnicodeEncodeError */ 4548static void 4549make_encode_exception(PyObject **exceptionObject, 4550 const char *encoding, 4551 const Py_UNICODE *unicode, Py_ssize_t size, 4552 Py_ssize_t startpos, Py_ssize_t endpos, 4553 const char *reason) 4554{ 4555 if (*exceptionObject == NULL) { 4556 *exceptionObject = PyUnicodeEncodeError_Create( 4557 encoding, unicode, size, startpos, endpos, reason); 4558 } 4559 else { 4560 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4561 goto onError; 4562 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4563 goto onError; 4564 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4565 goto onError; 4566 return; 4567 onError: 4568 Py_DECREF(*exceptionObject); 4569 *exceptionObject = NULL; 4570 } 4571} 4572 4573/* raises a UnicodeEncodeError */ 4574static void 4575raise_encode_exception(PyObject **exceptionObject, 4576 const char *encoding, 4577 const Py_UNICODE *unicode, Py_ssize_t size, 4578 Py_ssize_t startpos, Py_ssize_t endpos, 4579 const char *reason) 4580{ 4581 make_encode_exception(exceptionObject, 4582 encoding, unicode, size, startpos, endpos, reason); 4583 if 
(*exceptionObject != NULL) 4584 PyCodec_StrictErrors(*exceptionObject); 4585} 4586 4587/* error handling callback helper: 4588 build arguments, call the callback and check the arguments, 4589 put the result into newpos and return the replacement string, which 4590 has to be freed by the caller */ 4591static PyObject * 4592unicode_encode_call_errorhandler(const char *errors, 4593 PyObject **errorHandler, 4594 const char *encoding, const char *reason, 4595 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4596 Py_ssize_t startpos, Py_ssize_t endpos, 4597 Py_ssize_t *newpos) 4598{ 4599 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4600 4601 PyObject *restuple; 4602 PyObject *resunicode; 4603 4604 if (*errorHandler == NULL) { 4605 *errorHandler = PyCodec_LookupError(errors); 4606 if (*errorHandler == NULL) 4607 return NULL; 4608 } 4609 4610 make_encode_exception(exceptionObject, 4611 encoding, unicode, size, startpos, endpos, reason); 4612 if (*exceptionObject == NULL) 4613 return NULL; 4614 4615 restuple = PyObject_CallFunctionObjArgs( 4616 *errorHandler, *exceptionObject, NULL); 4617 if (restuple == NULL) 4618 return NULL; 4619 if (!PyTuple_Check(restuple)) { 4620 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4621 Py_DECREF(restuple); 4622 return NULL; 4623 } 4624 if (!PyArg_ParseTuple(restuple, argparse, 4625 &resunicode, newpos)) { 4626 Py_DECREF(restuple); 4627 return NULL; 4628 } 4629 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4630 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4631 Py_DECREF(restuple); 4632 return NULL; 4633 } 4634 if (*newpos<0) 4635 *newpos = size+*newpos; 4636 if (*newpos<0 || *newpos>size) { 4637 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4638 Py_DECREF(restuple); 4639 return NULL; 4640 } 4641 Py_INCREF(resunicode); 4642 Py_DECREF(restuple); 4643 return resunicode; 4644} 4645 4646static PyObject * 
4647unicode_encode_ucs1(const Py_UNICODE *p, 4648 Py_ssize_t size, 4649 const char *errors, 4650 int limit) 4651{ 4652 /* output object */ 4653 PyObject *res; 4654 /* pointers to the beginning and end+1 of input */ 4655 const Py_UNICODE *startp = p; 4656 const Py_UNICODE *endp = p + size; 4657 /* pointer to the beginning of the unencodable characters */ 4658 /* const Py_UNICODE *badp = NULL; */ 4659 /* pointer into the output */ 4660 char *str; 4661 /* current output position */ 4662 Py_ssize_t ressize; 4663 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4664 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4665 PyObject *errorHandler = NULL; 4666 PyObject *exc = NULL; 4667 /* the following variable is used for caching string comparisons 4668 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4669 int known_errorHandler = -1; 4670 4671 /* allocate enough for a simple encoding without 4672 replacements, if we need more, we'll resize */ 4673 if (size == 0) 4674 return PyBytes_FromStringAndSize(NULL, 0); 4675 res = PyBytes_FromStringAndSize(NULL, size); 4676 if (res == NULL) 4677 return NULL; 4678 str = PyBytes_AS_STRING(res); 4679 ressize = size; 4680 4681 while (p<endp) { 4682 Py_UNICODE c = *p; 4683 4684 /* can we encode this? */ 4685 if (c<limit) { 4686 /* no overflow check, because we know that the space is enough */ 4687 *str++ = (char)c; 4688 ++p; 4689 } 4690 else { 4691 Py_ssize_t unicodepos = p-startp; 4692 Py_ssize_t requiredsize; 4693 PyObject *repunicode; 4694 Py_ssize_t repsize; 4695 Py_ssize_t newpos; 4696 Py_ssize_t respos; 4697 Py_UNICODE *uni2; 4698 /* startpos for collecting unencodable chars */ 4699 const Py_UNICODE *collstart = p; 4700 const Py_UNICODE *collend = p; 4701 /* find all unecodable characters */ 4702 while ((collend < endp) && ((*collend)>=limit)) 4703 ++collend; 4704 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 4705 if (known_errorHandler==-1) { 4706 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4707 known_errorHandler = 1; 4708 else if (!strcmp(errors, "replace")) 4709 known_errorHandler = 2; 4710 else if (!strcmp(errors, "ignore")) 4711 known_errorHandler = 3; 4712 else if (!strcmp(errors, "xmlcharrefreplace")) 4713 known_errorHandler = 4; 4714 else 4715 known_errorHandler = 0; 4716 } 4717 switch (known_errorHandler) { 4718 case 1: /* strict */ 4719 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4720 goto onError; 4721 case 2: /* replace */ 4722 while (collstart++<collend) 4723 *str++ = '?'; /* fall through */ 4724 case 3: /* ignore */ 4725 p = collend; 4726 break; 4727 case 4: /* xmlcharrefreplace */ 4728 respos = str - PyBytes_AS_STRING(res); 4729 /* determine replacement size (temporarily (mis)uses p) */ 4730 for (p = collstart, repsize = 0; p < collend; ++p) { 4731 if (*p<10) 4732 repsize += 2+1+1; 4733 else if (*p<100) 4734 repsize += 2+2+1; 4735 else if (*p<1000) 4736 repsize += 2+3+1; 4737 else if (*p<10000) 4738 repsize += 2+4+1; 4739#ifndef Py_UNICODE_WIDE 4740 else 4741 repsize += 2+5+1; 4742#else 4743 else if (*p<100000) 4744 repsize += 2+5+1; 4745 else if (*p<1000000) 4746 repsize += 2+6+1; 4747 else 4748 repsize += 2+7+1; 4749#endif 4750 } 4751 requiredsize = respos+repsize+(endp-collend); 4752 if (requiredsize > ressize) { 4753 if (requiredsize<2*ressize) 4754 requiredsize = 2*ressize; 4755 if (_PyBytes_Resize(&res, requiredsize)) 4756 goto onError; 4757 str = PyBytes_AS_STRING(res) + respos; 4758 ressize = requiredsize; 4759 } 4760 /* generate replacement (temporarily (mis)uses p) */ 4761 for (p = collstart; p < collend; ++p) { 4762 str += sprintf(str, "&#%d;", (int)*p); 4763 } 4764 p = collend; 4765 break; 4766 default: 4767 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4768 encoding, reason, startp, size, &exc, 4769 collstart-startp, collend-startp, 
&newpos); 4770 if (repunicode == NULL) 4771 goto onError; 4772 if (PyBytes_Check(repunicode)) { 4773 /* Directly copy bytes result to output. */ 4774 repsize = PyBytes_Size(repunicode); 4775 if (repsize > 1) { 4776 /* Make room for all additional bytes. */ 4777 respos = str - PyBytes_AS_STRING(res); 4778 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 4779 Py_DECREF(repunicode); 4780 goto onError; 4781 } 4782 str = PyBytes_AS_STRING(res) + respos; 4783 ressize += repsize-1; 4784 } 4785 memcpy(str, PyBytes_AsString(repunicode), repsize); 4786 str += repsize; 4787 p = startp + newpos; 4788 Py_DECREF(repunicode); 4789 break; 4790 } 4791 /* need more space? (at least enough for what we 4792 have+the replacement+the rest of the string, so 4793 we won't have to check space for encodable characters) */ 4794 respos = str - PyBytes_AS_STRING(res); 4795 repsize = PyUnicode_GET_SIZE(repunicode); 4796 requiredsize = respos+repsize+(endp-collend); 4797 if (requiredsize > ressize) { 4798 if (requiredsize<2*ressize) 4799 requiredsize = 2*ressize; 4800 if (_PyBytes_Resize(&res, requiredsize)) { 4801 Py_DECREF(repunicode); 4802 goto onError; 4803 } 4804 str = PyBytes_AS_STRING(res) + respos; 4805 ressize = requiredsize; 4806 } 4807 /* check if there is anything unencodable in the replacement 4808 and copy it to the output */ 4809 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4810 c = *uni2; 4811 if (c >= limit) { 4812 raise_encode_exception(&exc, encoding, startp, size, 4813 unicodepos, unicodepos+1, reason); 4814 Py_DECREF(repunicode); 4815 goto onError; 4816 } 4817 *str = (char)c; 4818 } 4819 p = startp + newpos; 4820 Py_DECREF(repunicode); 4821 } 4822 } 4823 } 4824 /* Resize if we allocated to much */ 4825 size = str - PyBytes_AS_STRING(res); 4826 if (size < ressize) { /* If this falls res will be NULL */ 4827 assert(size >= 0); 4828 if (_PyBytes_Resize(&res, size) < 0) 4829 goto onError; 4830 } 4831 4832 Py_XDECREF(errorHandler); 4833 
Py_XDECREF(exc); 4834 return res; 4835 4836 onError: 4837 Py_XDECREF(res); 4838 Py_XDECREF(errorHandler); 4839 Py_XDECREF(exc); 4840 return NULL; 4841} 4842 4843PyObject * 4844PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4845 Py_ssize_t size, 4846 const char *errors) 4847{ 4848 return unicode_encode_ucs1(p, size, errors, 256); 4849} 4850 4851PyObject * 4852PyUnicode_AsLatin1String(PyObject *unicode) 4853{ 4854 if (!PyUnicode_Check(unicode)) { 4855 PyErr_BadArgument(); 4856 return NULL; 4857 } 4858 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4859 PyUnicode_GET_SIZE(unicode), 4860 NULL); 4861} 4862 4863/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4864 4865PyObject * 4866PyUnicode_DecodeASCII(const char *s, 4867 Py_ssize_t size, 4868 const char *errors) 4869{ 4870 const char *starts = s; 4871 PyUnicodeObject *v; 4872 Py_UNICODE *p; 4873 Py_ssize_t startinpos; 4874 Py_ssize_t endinpos; 4875 Py_ssize_t outpos; 4876 const char *e; 4877 PyObject *errorHandler = NULL; 4878 PyObject *exc = NULL; 4879 4880 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4881 if (size == 1 && *(unsigned char*)s < 128) { 4882 Py_UNICODE r = *(unsigned char*)s; 4883 return PyUnicode_FromUnicode(&r, 1); 4884 } 4885 4886 v = _PyUnicode_New(size); 4887 if (v == NULL) 4888 goto onError; 4889 if (size == 0) 4890 return (PyObject *)v; 4891 p = PyUnicode_AS_UNICODE(v); 4892 e = s + size; 4893 while (s < e) { 4894 register unsigned char c = (unsigned char)*s; 4895 if (c < 128) { 4896 *p++ = c; 4897 ++s; 4898 } 4899 else { 4900 startinpos = s-starts; 4901 endinpos = startinpos + 1; 4902 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4903 if (unicode_decode_call_errorhandler( 4904 errors, &errorHandler, 4905 "ascii", "ordinal not in range(128)", 4906 &starts, &e, &startinpos, &endinpos, &exc, &s, 4907 &v, &outpos, &p)) 4908 goto onError; 4909 } 4910 } 4911 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4912 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4913 goto onError; 4914 Py_XDECREF(errorHandler); 4915 Py_XDECREF(exc); 4916 return (PyObject *)v; 4917 4918 onError: 4919 Py_XDECREF(v); 4920 Py_XDECREF(errorHandler); 4921 Py_XDECREF(exc); 4922 return NULL; 4923} 4924 4925PyObject * 4926PyUnicode_EncodeASCII(const Py_UNICODE *p, 4927 Py_ssize_t size, 4928 const char *errors) 4929{ 4930 return unicode_encode_ucs1(p, size, errors, 128); 4931} 4932 4933PyObject * 4934PyUnicode_AsASCIIString(PyObject *unicode) 4935{ 4936 if (!PyUnicode_Check(unicode)) { 4937 PyErr_BadArgument(); 4938 return NULL; 4939 } 4940 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4941 PyUnicode_GET_SIZE(unicode), 4942 NULL); 4943} 4944 4945#ifdef HAVE_MBCS 4946 4947/* --- MBCS codecs for Windows -------------------------------------------- */ 4948 4949#if SIZEOF_INT < SIZEOF_SIZE_T 4950#define NEED_RETRY 4951#endif 4952 4953/* XXX This code is limited to "true" double-byte encodings, as 4954 a) it assumes an incomplete character consists of a single byte, and 4955 b) IsDBCSLeadByte (probably) does not work for non-DBCS 
multi-byte 4956 encodings, see IsDBCSLeadByteEx documentation. */ 4957 4958static int 4959is_dbcs_lead_byte(const char *s, int offset) 4960{ 4961 const char *curr = s + offset; 4962 4963 if (IsDBCSLeadByte(*curr)) { 4964 const char *prev = CharPrev(s, curr); 4965 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4966 } 4967 return 0; 4968} 4969 4970/* 4971 * Decode MBCS string into unicode object. If 'final' is set, converts 4972 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4973 */ 4974static int 4975decode_mbcs(PyUnicodeObject **v, 4976 const char *s, /* MBCS string */ 4977 int size, /* sizeof MBCS string */ 4978 int final, 4979 const char *errors) 4980{ 4981 Py_UNICODE *p; 4982 Py_ssize_t n; 4983 DWORD usize; 4984 DWORD flags; 4985 4986 assert(size >= 0); 4987 4988 /* check and handle 'errors' arg */ 4989 if (errors==NULL || strcmp(errors, "strict")==0) 4990 flags = MB_ERR_INVALID_CHARS; 4991 else if (strcmp(errors, "ignore")==0) 4992 flags = 0; 4993 else { 4994 PyErr_Format(PyExc_ValueError, 4995 "mbcs encoding does not support errors='%s'", 4996 errors); 4997 return -1; 4998 } 4999 5000 /* Skip trailing lead-byte unless 'final' is set */ 5001 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 5002 --size; 5003 5004 /* First get the size of the result */ 5005 if (size > 0) { 5006 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 5007 if (usize==0) 5008 goto mbcs_decode_error; 5009 } else 5010 usize = 0; 5011 5012 if (*v == NULL) { 5013 /* Create unicode object */ 5014 *v = _PyUnicode_New(usize); 5015 if (*v == NULL) 5016 return -1; 5017 n = 0; 5018 } 5019 else { 5020 /* Extend unicode object */ 5021 n = PyUnicode_GET_SIZE(*v); 5022 if (_PyUnicode_Resize(v, n + usize) < 0) 5023 return -1; 5024 } 5025 5026 /* Do the conversion */ 5027 if (usize > 0) { 5028 p = PyUnicode_AS_UNICODE(*v) + n; 5029 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 5030 goto mbcs_decode_error; 5031 } 
5032 } 5033 return size; 5034 5035mbcs_decode_error: 5036 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 5037 we raise a UnicodeDecodeError - else it is a 'generic' 5038 windows error 5039 */ 5040 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 5041 /* Ideally, we should get reason from FormatMessage - this 5042 is the Windows 2000 English version of the message 5043 */ 5044 PyObject *exc = NULL; 5045 const char *reason = "No mapping for the Unicode character exists " 5046 "in the target multi-byte code page."; 5047 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 5048 if (exc != NULL) { 5049 PyCodec_StrictErrors(exc); 5050 Py_DECREF(exc); 5051 } 5052 } else { 5053 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5054 } 5055 return -1; 5056} 5057 5058PyObject * 5059PyUnicode_DecodeMBCSStateful(const char *s, 5060 Py_ssize_t size, 5061 const char *errors, 5062 Py_ssize_t *consumed) 5063{ 5064 PyUnicodeObject *v = NULL; 5065 int done; 5066 5067 if (consumed) 5068 *consumed = 0; 5069 5070#ifdef NEED_RETRY 5071 retry: 5072 if (size > INT_MAX) 5073 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 5074 else 5075#endif 5076 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 5077 5078 if (done < 0) { 5079 Py_XDECREF(v); 5080 return NULL; 5081 } 5082 5083 if (consumed) 5084 *consumed += done; 5085 5086#ifdef NEED_RETRY 5087 if (size > INT_MAX) { 5088 s += done; 5089 size -= done; 5090 goto retry; 5091 } 5092#endif 5093 5094 return (PyObject *)v; 5095} 5096 5097PyObject * 5098PyUnicode_DecodeMBCS(const char *s, 5099 Py_ssize_t size, 5100 const char *errors) 5101{ 5102 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 5103} 5104 5105/* 5106 * Convert unicode into string object (MBCS). 5107 * Returns 0 if succeed, -1 otherwise. 
5108 */ 5109static int 5110encode_mbcs(PyObject **repr, 5111 const Py_UNICODE *p, /* unicode */ 5112 int size, /* size of unicode */ 5113 const char* errors) 5114{ 5115 BOOL usedDefaultChar = FALSE; 5116 BOOL *pusedDefaultChar; 5117 int mbcssize; 5118 Py_ssize_t n; 5119 PyObject *exc = NULL; 5120 DWORD flags; 5121 5122 assert(size >= 0); 5123 5124 /* check and handle 'errors' arg */ 5125 if (errors==NULL || strcmp(errors, "strict")==0) { 5126 flags = WC_NO_BEST_FIT_CHARS; 5127 pusedDefaultChar = &usedDefaultChar; 5128 } else if (strcmp(errors, "replace")==0) { 5129 flags = 0; 5130 pusedDefaultChar = NULL; 5131 } else { 5132 PyErr_Format(PyExc_ValueError, 5133 "mbcs encoding does not support errors='%s'", 5134 errors); 5135 return -1; 5136 } 5137 5138 /* First get the size of the result */ 5139 if (size > 0) { 5140 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 5141 NULL, pusedDefaultChar); 5142 if (mbcssize == 0) { 5143 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5144 return -1; 5145 } 5146 /* If we used a default char, then we failed! 
*/ 5147 if (pusedDefaultChar && *pusedDefaultChar) 5148 goto mbcs_encode_error; 5149 } else { 5150 mbcssize = 0; 5151 } 5152 5153 if (*repr == NULL) { 5154 /* Create string object */ 5155 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 5156 if (*repr == NULL) 5157 return -1; 5158 n = 0; 5159 } 5160 else { 5161 /* Extend string object */ 5162 n = PyBytes_Size(*repr); 5163 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 5164 return -1; 5165 } 5166 5167 /* Do the conversion */ 5168 if (size > 0) { 5169 char *s = PyBytes_AS_STRING(*repr) + n; 5170 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 5171 NULL, pusedDefaultChar)) { 5172 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5173 return -1; 5174 } 5175 if (pusedDefaultChar && *pusedDefaultChar) 5176 goto mbcs_encode_error; 5177 } 5178 return 0; 5179 5180mbcs_encode_error: 5181 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 5182 Py_XDECREF(exc); 5183 return -1; 5184} 5185 5186PyObject * 5187PyUnicode_EncodeMBCS(const Py_UNICODE *p, 5188 Py_ssize_t size, 5189 const char *errors) 5190{ 5191 PyObject *repr = NULL; 5192 int ret; 5193 5194#ifdef NEED_RETRY 5195 retry: 5196 if (size > INT_MAX) 5197 ret = encode_mbcs(&repr, p, INT_MAX, errors); 5198 else 5199#endif 5200 ret = encode_mbcs(&repr, p, (int)size, errors); 5201 5202 if (ret < 0) { 5203 Py_XDECREF(repr); 5204 return NULL; 5205 } 5206 5207#ifdef NEED_RETRY 5208 if (size > INT_MAX) { 5209 p += INT_MAX; 5210 size -= INT_MAX; 5211 goto retry; 5212 } 5213#endif 5214 5215 return repr; 5216} 5217 5218PyObject * 5219PyUnicode_AsMBCSString(PyObject *unicode) 5220{ 5221 if (!PyUnicode_Check(unicode)) { 5222 PyErr_BadArgument(); 5223 return NULL; 5224 } 5225 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 5226 PyUnicode_GET_SIZE(unicode), 5227 NULL); 5228} 5229 5230#undef NEED_RETRY 5231 5232#endif /* HAVE_MBCS */ 5233 5234/* --- Character Mapping Codec -------------------------------------------- */ 5235 5236PyObject * 
5237PyUnicode_DecodeCharmap(const char *s, 5238 Py_ssize_t size, 5239 PyObject *mapping, 5240 const char *errors) 5241{ 5242 const char *starts = s; 5243 Py_ssize_t startinpos; 5244 Py_ssize_t endinpos; 5245 Py_ssize_t outpos; 5246 const char *e; 5247 PyUnicodeObject *v; 5248 Py_UNICODE *p; 5249 Py_ssize_t extrachars = 0; 5250 PyObject *errorHandler = NULL; 5251 PyObject *exc = NULL; 5252 Py_UNICODE *mapstring = NULL; 5253 Py_ssize_t maplen = 0; 5254 5255 /* Default to Latin-1 */ 5256 if (mapping == NULL) 5257 return PyUnicode_DecodeLatin1(s, size, errors); 5258 5259 v = _PyUnicode_New(size); 5260 if (v == NULL) 5261 goto onError; 5262 if (size == 0) 5263 return (PyObject *)v; 5264 p = PyUnicode_AS_UNICODE(v); 5265 e = s + size; 5266 if (PyUnicode_CheckExact(mapping)) { 5267 mapstring = PyUnicode_AS_UNICODE(mapping); 5268 maplen = PyUnicode_GET_SIZE(mapping); 5269 while (s < e) { 5270 unsigned char ch = *s; 5271 Py_UNICODE x = 0xfffe; /* illegal value */ 5272 5273 if (ch < maplen) 5274 x = mapstring[ch]; 5275 5276 if (x == 0xfffe) { 5277 /* undefined mapping */ 5278 outpos = p-PyUnicode_AS_UNICODE(v); 5279 startinpos = s-starts; 5280 endinpos = startinpos+1; 5281 if (unicode_decode_call_errorhandler( 5282 errors, &errorHandler, 5283 "charmap", "character maps to <undefined>", 5284 &starts, &e, &startinpos, &endinpos, &exc, &s, 5285 &v, &outpos, &p)) { 5286 goto onError; 5287 } 5288 continue; 5289 } 5290 *p++ = x; 5291 ++s; 5292 } 5293 } 5294 else { 5295 while (s < e) { 5296 unsigned char ch = *s; 5297 PyObject *w, *x; 5298 5299 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 5300 w = PyLong_FromLong((long)ch); 5301 if (w == NULL) 5302 goto onError; 5303 x = PyObject_GetItem(mapping, w); 5304 Py_DECREF(w); 5305 if (x == NULL) { 5306 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5307 /* No mapping found means: mapping is undefined. 
*/ 5308 PyErr_Clear(); 5309 x = Py_None; 5310 Py_INCREF(x); 5311 } else 5312 goto onError; 5313 } 5314 5315 /* Apply mapping */ 5316 if (PyLong_Check(x)) { 5317 long value = PyLong_AS_LONG(x); 5318 if (value < 0 || value > 65535) { 5319 PyErr_SetString(PyExc_TypeError, 5320 "character mapping must be in range(65536)"); 5321 Py_DECREF(x); 5322 goto onError; 5323 } 5324 *p++ = (Py_UNICODE)value; 5325 } 5326 else if (x == Py_None) { 5327 /* undefined mapping */ 5328 outpos = p-PyUnicode_AS_UNICODE(v); 5329 startinpos = s-starts; 5330 endinpos = startinpos+1; 5331 if (unicode_decode_call_errorhandler( 5332 errors, &errorHandler, 5333 "charmap", "character maps to <undefined>", 5334 &starts, &e, &startinpos, &endinpos, &exc, &s, 5335 &v, &outpos, &p)) { 5336 Py_DECREF(x); 5337 goto onError; 5338 } 5339 Py_DECREF(x); 5340 continue; 5341 } 5342 else if (PyUnicode_Check(x)) { 5343 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 5344 5345 if (targetsize == 1) 5346 /* 1-1 mapping */ 5347 *p++ = *PyUnicode_AS_UNICODE(x); 5348 5349 else if (targetsize > 1) { 5350 /* 1-n mapping */ 5351 if (targetsize > extrachars) { 5352 /* resize first */ 5353 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 5354 Py_ssize_t needed = (targetsize - extrachars) + \ 5355 (targetsize << 2); 5356 extrachars += needed; 5357 /* XXX overflow detection missing */ 5358 if (_PyUnicode_Resize(&v, 5359 PyUnicode_GET_SIZE(v) + needed) < 0) { 5360 Py_DECREF(x); 5361 goto onError; 5362 } 5363 p = PyUnicode_AS_UNICODE(v) + oldpos; 5364 } 5365 Py_UNICODE_COPY(p, 5366 PyUnicode_AS_UNICODE(x), 5367 targetsize); 5368 p += targetsize; 5369 extrachars -= targetsize; 5370 } 5371 /* 1-0 mapping: skip the character */ 5372 } 5373 else { 5374 /* wrong return value */ 5375 PyErr_SetString(PyExc_TypeError, 5376 "character mapping must return integer, None or str"); 5377 Py_DECREF(x); 5378 goto onError; 5379 } 5380 Py_DECREF(x); 5381 ++s; 5382 } 5383 } 5384 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 5385 if 
(_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5386 goto onError; 5387 Py_XDECREF(errorHandler); 5388 Py_XDECREF(exc); 5389 return (PyObject *)v; 5390 5391 onError: 5392 Py_XDECREF(errorHandler); 5393 Py_XDECREF(exc); 5394 Py_XDECREF(v); 5395 return NULL; 5396} 5397 5398/* Charmap encoding: the lookup table */ 5399 5400struct encoding_map { 5401 PyObject_HEAD 5402 unsigned char level1[32]; 5403 int count2, count3; 5404 unsigned char level23[1]; 5405}; 5406 5407static PyObject* 5408encoding_map_size(PyObject *obj, PyObject* args) 5409{ 5410 struct encoding_map *map = (struct encoding_map*)obj; 5411 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5412 128*map->count3); 5413} 5414 5415static PyMethodDef encoding_map_methods[] = { 5416 {"size", encoding_map_size, METH_NOARGS, 5417 PyDoc_STR("Return the size (in bytes) of this object") }, 5418 { 0 } 5419}; 5420 5421static void 5422encoding_map_dealloc(PyObject* o) 5423{ 5424 PyObject_FREE(o); 5425} 5426 5427static PyTypeObject EncodingMapType = { 5428 PyVarObject_HEAD_INIT(NULL, 0) 5429 "EncodingMap", /*tp_name*/ 5430 sizeof(struct encoding_map), /*tp_basicsize*/ 5431 0, /*tp_itemsize*/ 5432 /* methods */ 5433 encoding_map_dealloc, /*tp_dealloc*/ 5434 0, /*tp_print*/ 5435 0, /*tp_getattr*/ 5436 0, /*tp_setattr*/ 5437 0, /*tp_reserved*/ 5438 0, /*tp_repr*/ 5439 0, /*tp_as_number*/ 5440 0, /*tp_as_sequence*/ 5441 0, /*tp_as_mapping*/ 5442 0, /*tp_hash*/ 5443 0, /*tp_call*/ 5444 0, /*tp_str*/ 5445 0, /*tp_getattro*/ 5446 0, /*tp_setattro*/ 5447 0, /*tp_as_buffer*/ 5448 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5449 0, /*tp_doc*/ 5450 0, /*tp_traverse*/ 5451 0, /*tp_clear*/ 5452 0, /*tp_richcompare*/ 5453 0, /*tp_weaklistoffset*/ 5454 0, /*tp_iter*/ 5455 0, /*tp_iternext*/ 5456 encoding_map_methods, /*tp_methods*/ 5457 0, /*tp_members*/ 5458 0, /*tp_getset*/ 5459 0, /*tp_base*/ 5460 0, /*tp_dict*/ 5461 0, /*tp_descr_get*/ 5462 0, /*tp_descr_set*/ 5463 0, /*tp_dictoffset*/ 5464 0, /*tp_init*/ 5465 0, /*tp_alloc*/ 
5466 0, /*tp_new*/ 5467 0, /*tp_free*/ 5468 0, /*tp_is_gc*/ 5469}; 5470 5471PyObject* 5472PyUnicode_BuildEncodingMap(PyObject* string) 5473{ 5474 Py_UNICODE *decode; 5475 PyObject *result; 5476 struct encoding_map *mresult; 5477 int i; 5478 int need_dict = 0; 5479 unsigned char level1[32]; 5480 unsigned char level2[512]; 5481 unsigned char *mlevel1, *mlevel2, *mlevel3; 5482 int count2 = 0, count3 = 0; 5483 5484 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5485 PyErr_BadArgument(); 5486 return NULL; 5487 } 5488 decode = PyUnicode_AS_UNICODE(string); 5489 memset(level1, 0xFF, sizeof level1); 5490 memset(level2, 0xFF, sizeof level2); 5491 5492 /* If there isn't a one-to-one mapping of NULL to \0, 5493 or if there are non-BMP characters, we need to use 5494 a mapping dictionary. */ 5495 if (decode[0] != 0) 5496 need_dict = 1; 5497 for (i = 1; i < 256; i++) { 5498 int l1, l2; 5499 if (decode[i] == 0 5500#ifdef Py_UNICODE_WIDE 5501 || decode[i] > 0xFFFF 5502#endif 5503 ) { 5504 need_dict = 1; 5505 break; 5506 } 5507 if (decode[i] == 0xFFFE) 5508 /* unmapped character */ 5509 continue; 5510 l1 = decode[i] >> 11; 5511 l2 = decode[i] >> 7; 5512 if (level1[l1] == 0xFF) 5513 level1[l1] = count2++; 5514 if (level2[l2] == 0xFF) 5515 level2[l2] = count3++; 5516 } 5517 5518 if (count2 >= 0xFF || count3 >= 0xFF) 5519 need_dict = 1; 5520 5521 if (need_dict) { 5522 PyObject *result = PyDict_New(); 5523 PyObject *key, *value; 5524 if (!result) 5525 return NULL; 5526 for (i = 0; i < 256; i++) { 5527 key = PyLong_FromLong(decode[i]); 5528 value = PyLong_FromLong(i); 5529 if (!key || !value) 5530 goto failed1; 5531 if (PyDict_SetItem(result, key, value) == -1) 5532 goto failed1; 5533 Py_DECREF(key); 5534 Py_DECREF(value); 5535 } 5536 return result; 5537 failed1: 5538 Py_XDECREF(key); 5539 Py_XDECREF(value); 5540 Py_DECREF(result); 5541 return NULL; 5542 } 5543 5544 /* Create a three-level trie */ 5545 result = PyObject_MALLOC(sizeof(struct encoding_map) + 5546 
16*count2 + 128*count3 - 1); 5547 if (!result) 5548 return PyErr_NoMemory(); 5549 PyObject_Init(result, &EncodingMapType); 5550 mresult = (struct encoding_map*)result; 5551 mresult->count2 = count2; 5552 mresult->count3 = count3; 5553 mlevel1 = mresult->level1; 5554 mlevel2 = mresult->level23; 5555 mlevel3 = mresult->level23 + 16*count2; 5556 memcpy(mlevel1, level1, 32); 5557 memset(mlevel2, 0xFF, 16*count2); 5558 memset(mlevel3, 0, 128*count3); 5559 count3 = 0; 5560 for (i = 1; i < 256; i++) { 5561 int o1, o2, o3, i2, i3; 5562 if (decode[i] == 0xFFFE) 5563 /* unmapped character */ 5564 continue; 5565 o1 = decode[i]>>11; 5566 o2 = (decode[i]>>7) & 0xF; 5567 i2 = 16*mlevel1[o1] + o2; 5568 if (mlevel2[i2] == 0xFF) 5569 mlevel2[i2] = count3++; 5570 o3 = decode[i] & 0x7F; 5571 i3 = 128*mlevel2[i2] + o3; 5572 mlevel3[i3] = i; 5573 } 5574 return result; 5575} 5576 5577static int 5578encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5579{ 5580 struct encoding_map *map = (struct encoding_map*)mapping; 5581 int l1 = c>>11; 5582 int l2 = (c>>7) & 0xF; 5583 int l3 = c & 0x7F; 5584 int i; 5585 5586#ifdef Py_UNICODE_WIDE 5587 if (c > 0xFFFF) { 5588 return -1; 5589 } 5590#endif 5591 if (c == 0) 5592 return 0; 5593 /* level 1*/ 5594 i = map->level1[l1]; 5595 if (i == 0xFF) { 5596 return -1; 5597 } 5598 /* level 2*/ 5599 i = map->level23[16*i+l2]; 5600 if (i == 0xFF) { 5601 return -1; 5602 } 5603 /* level 3 */ 5604 i = map->level23[16*map->count2 + 128*i + l3]; 5605 if (i == 0) { 5606 return -1; 5607 } 5608 return i; 5609} 5610 5611/* Lookup the character ch in the mapping. If the character 5612 can't be found, Py_None is returned (or NULL, if another 5613 error occurred). 
*/ 5614static PyObject * 5615charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5616{ 5617 PyObject *w = PyLong_FromLong((long)c); 5618 PyObject *x; 5619 5620 if (w == NULL) 5621 return NULL; 5622 x = PyObject_GetItem(mapping, w); 5623 Py_DECREF(w); 5624 if (x == NULL) { 5625 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5626 /* No mapping found means: mapping is undefined. */ 5627 PyErr_Clear(); 5628 x = Py_None; 5629 Py_INCREF(x); 5630 return x; 5631 } else 5632 return NULL; 5633 } 5634 else if (x == Py_None) 5635 return x; 5636 else if (PyLong_Check(x)) { 5637 long value = PyLong_AS_LONG(x); 5638 if (value < 0 || value > 255) { 5639 PyErr_SetString(PyExc_TypeError, 5640 "character mapping must be in range(256)"); 5641 Py_DECREF(x); 5642 return NULL; 5643 } 5644 return x; 5645 } 5646 else if (PyBytes_Check(x)) 5647 return x; 5648 else { 5649 /* wrong return value */ 5650 PyErr_Format(PyExc_TypeError, 5651 "character mapping must return integer, bytes or None, not %.400s", 5652 x->ob_type->tp_name); 5653 Py_DECREF(x); 5654 return NULL; 5655 } 5656} 5657 5658static int 5659charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5660{ 5661 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5662 /* exponentially overallocate to minimize reallocations */ 5663 if (requiredsize < 2*outsize) 5664 requiredsize = 2*outsize; 5665 if (_PyBytes_Resize(outobj, requiredsize)) 5666 return -1; 5667 return 0; 5668} 5669 5670typedef enum charmapencode_result { 5671 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5672} charmapencode_result; 5673/* lookup the character, put the result in the output string and adjust 5674 various state variables. Resize the output bytes object if not enough 5675 space is available. Return a new reference to the object that 5676 was put in the output buffer, or Py_None, if the mapping was undefined 5677 (in which case no character was written) or NULL, if a 5678 reallocation error occurred. 
The caller must decref the result */ 5679static charmapencode_result 5680charmapencode_output(Py_UNICODE c, PyObject *mapping, 5681 PyObject **outobj, Py_ssize_t *outpos) 5682{ 5683 PyObject *rep; 5684 char *outstart; 5685 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5686 5687 if (Py_TYPE(mapping) == &EncodingMapType) { 5688 int res = encoding_map_lookup(c, mapping); 5689 Py_ssize_t requiredsize = *outpos+1; 5690 if (res == -1) 5691 return enc_FAILED; 5692 if (outsize<requiredsize) 5693 if (charmapencode_resize(outobj, outpos, requiredsize)) 5694 return enc_EXCEPTION; 5695 outstart = PyBytes_AS_STRING(*outobj); 5696 outstart[(*outpos)++] = (char)res; 5697 return enc_SUCCESS; 5698 } 5699 5700 rep = charmapencode_lookup(c, mapping); 5701 if (rep==NULL) 5702 return enc_EXCEPTION; 5703 else if (rep==Py_None) { 5704 Py_DECREF(rep); 5705 return enc_FAILED; 5706 } else { 5707 if (PyLong_Check(rep)) { 5708 Py_ssize_t requiredsize = *outpos+1; 5709 if (outsize<requiredsize) 5710 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5711 Py_DECREF(rep); 5712 return enc_EXCEPTION; 5713 } 5714 outstart = PyBytes_AS_STRING(*outobj); 5715 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5716 } 5717 else { 5718 const char *repchars = PyBytes_AS_STRING(rep); 5719 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5720 Py_ssize_t requiredsize = *outpos+repsize; 5721 if (outsize<requiredsize) 5722 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5723 Py_DECREF(rep); 5724 return enc_EXCEPTION; 5725 } 5726 outstart = PyBytes_AS_STRING(*outobj); 5727 memcpy(outstart + *outpos, repchars, repsize); 5728 *outpos += repsize; 5729 } 5730 } 5731 Py_DECREF(rep); 5732 return enc_SUCCESS; 5733} 5734 5735/* handle an error in PyUnicode_EncodeCharmap 5736 Return 0 on success, -1 on error */ 5737static int 5738charmap_encoding_error( 5739 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5740 PyObject **exceptionObject, 5741 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 5742 PyObject **res, Py_ssize_t *respos) 5743{ 5744 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5745 Py_ssize_t repsize; 5746 Py_ssize_t newpos; 5747 Py_UNICODE *uni2; 5748 /* startpos for collecting unencodable chars */ 5749 Py_ssize_t collstartpos = *inpos; 5750 Py_ssize_t collendpos = *inpos+1; 5751 Py_ssize_t collpos; 5752 char *encoding = "charmap"; 5753 char *reason = "character maps to <undefined>"; 5754 charmapencode_result x; 5755 5756 /* find all unencodable characters */ 5757 while (collendpos < size) { 5758 PyObject *rep; 5759 if (Py_TYPE(mapping) == &EncodingMapType) { 5760 int res = encoding_map_lookup(p[collendpos], mapping); 5761 if (res != -1) 5762 break; 5763 ++collendpos; 5764 continue; 5765 } 5766 5767 rep = charmapencode_lookup(p[collendpos], mapping); 5768 if (rep==NULL) 5769 return -1; 5770 else if (rep!=Py_None) { 5771 Py_DECREF(rep); 5772 break; 5773 } 5774 Py_DECREF(rep); 5775 ++collendpos; 5776 } 5777 /* cache callback name lookup 5778 * (if not done yet, i.e. 
it's the first error) */ 5779 if (*known_errorHandler==-1) { 5780 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5781 *known_errorHandler = 1; 5782 else if (!strcmp(errors, "replace")) 5783 *known_errorHandler = 2; 5784 else if (!strcmp(errors, "ignore")) 5785 *known_errorHandler = 3; 5786 else if (!strcmp(errors, "xmlcharrefreplace")) 5787 *known_errorHandler = 4; 5788 else 5789 *known_errorHandler = 0; 5790 } 5791 switch (*known_errorHandler) { 5792 case 1: /* strict */ 5793 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5794 return -1; 5795 case 2: /* replace */ 5796 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5797 x = charmapencode_output('?', mapping, res, respos); 5798 if (x==enc_EXCEPTION) { 5799 return -1; 5800 } 5801 else if (x==enc_FAILED) { 5802 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5803 return -1; 5804 } 5805 } 5806 /* fall through */ 5807 case 3: /* ignore */ 5808 *inpos = collendpos; 5809 break; 5810 case 4: /* xmlcharrefreplace */ 5811 /* generate replacement (temporarily (mis)uses p) */ 5812 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5813 char buffer[2+29+1+1]; 5814 char *cp; 5815 sprintf(buffer, "&#%d;", (int)p[collpos]); 5816 for (cp = buffer; *cp; ++cp) { 5817 x = charmapencode_output(*cp, mapping, res, respos); 5818 if (x==enc_EXCEPTION) 5819 return -1; 5820 else if (x==enc_FAILED) { 5821 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5822 return -1; 5823 } 5824 } 5825 } 5826 *inpos = collendpos; 5827 break; 5828 default: 5829 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5830 encoding, reason, p, size, exceptionObject, 5831 collstartpos, collendpos, &newpos); 5832 if (repunicode == NULL) 5833 return -1; 5834 if (PyBytes_Check(repunicode)) { 5835 /* Directly copy bytes result to output. 
*/ 5836 Py_ssize_t outsize = PyBytes_Size(*res); 5837 Py_ssize_t requiredsize; 5838 repsize = PyBytes_Size(repunicode); 5839 requiredsize = *respos + repsize; 5840 if (requiredsize > outsize) 5841 /* Make room for all additional bytes. */ 5842 if (charmapencode_resize(res, respos, requiredsize)) { 5843 Py_DECREF(repunicode); 5844 return -1; 5845 } 5846 memcpy(PyBytes_AsString(*res) + *respos, 5847 PyBytes_AsString(repunicode), repsize); 5848 *respos += repsize; 5849 *inpos = newpos; 5850 Py_DECREF(repunicode); 5851 break; 5852 } 5853 /* generate replacement */ 5854 repsize = PyUnicode_GET_SIZE(repunicode); 5855 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5856 x = charmapencode_output(*uni2, mapping, res, respos); 5857 if (x==enc_EXCEPTION) { 5858 return -1; 5859 } 5860 else if (x==enc_FAILED) { 5861 Py_DECREF(repunicode); 5862 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5863 return -1; 5864 } 5865 } 5866 *inpos = newpos; 5867 Py_DECREF(repunicode); 5868 } 5869 return 0; 5870} 5871 5872PyObject * 5873PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5874 Py_ssize_t size, 5875 PyObject *mapping, 5876 const char *errors) 5877{ 5878 /* output object */ 5879 PyObject *res = NULL; 5880 /* current input position */ 5881 Py_ssize_t inpos = 0; 5882 /* current output position */ 5883 Py_ssize_t respos = 0; 5884 PyObject *errorHandler = NULL; 5885 PyObject *exc = NULL; 5886 /* the following variable is used for caching string comparisons 5887 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5888 * 3=ignore, 4=xmlcharrefreplace */ 5889 int known_errorHandler = -1; 5890 5891 /* Default to Latin-1 */ 5892 if (mapping == NULL) 5893 return PyUnicode_EncodeLatin1(p, size, errors); 5894 5895 /* allocate enough for a simple encoding without 5896 replacements, if we need more, we'll resize */ 5897 res = PyBytes_FromStringAndSize(NULL, size); 5898 if (res == NULL) 5899 goto onError; 5900 if (size == 0) 5901 
return res; 5902 5903 while (inpos<size) { 5904 /* try to encode it */ 5905 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5906 if (x==enc_EXCEPTION) /* error */ 5907 goto onError; 5908 if (x==enc_FAILED) { /* unencodable character */ 5909 if (charmap_encoding_error(p, size, &inpos, mapping, 5910 &exc, 5911 &known_errorHandler, &errorHandler, errors, 5912 &res, &respos)) { 5913 goto onError; 5914 } 5915 } 5916 else 5917 /* done with this character => adjust input position */ 5918 ++inpos; 5919 } 5920 5921 /* Resize if we allocated to much */ 5922 if (respos<PyBytes_GET_SIZE(res)) 5923 if (_PyBytes_Resize(&res, respos) < 0) 5924 goto onError; 5925 5926 Py_XDECREF(exc); 5927 Py_XDECREF(errorHandler); 5928 return res; 5929 5930 onError: 5931 Py_XDECREF(res); 5932 Py_XDECREF(exc); 5933 Py_XDECREF(errorHandler); 5934 return NULL; 5935} 5936 5937PyObject * 5938PyUnicode_AsCharmapString(PyObject *unicode, 5939 PyObject *mapping) 5940{ 5941 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5942 PyErr_BadArgument(); 5943 return NULL; 5944 } 5945 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5946 PyUnicode_GET_SIZE(unicode), 5947 mapping, 5948 NULL); 5949} 5950 5951/* create or adjust a UnicodeTranslateError */ 5952static void 5953make_translate_exception(PyObject **exceptionObject, 5954 const Py_UNICODE *unicode, Py_ssize_t size, 5955 Py_ssize_t startpos, Py_ssize_t endpos, 5956 const char *reason) 5957{ 5958 if (*exceptionObject == NULL) { 5959 *exceptionObject = PyUnicodeTranslateError_Create( 5960 unicode, size, startpos, endpos, reason); 5961 } 5962 else { 5963 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5964 goto onError; 5965 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5966 goto onError; 5967 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5968 goto onError; 5969 return; 5970 onError: 5971 Py_DECREF(*exceptionObject); 5972 *exceptionObject = NULL; 5973 } 5974} 
5975 5976/* raises a UnicodeTranslateError */ 5977static void 5978raise_translate_exception(PyObject **exceptionObject, 5979 const Py_UNICODE *unicode, Py_ssize_t size, 5980 Py_ssize_t startpos, Py_ssize_t endpos, 5981 const char *reason) 5982{ 5983 make_translate_exception(exceptionObject, 5984 unicode, size, startpos, endpos, reason); 5985 if (*exceptionObject != NULL) 5986 PyCodec_StrictErrors(*exceptionObject); 5987} 5988 5989/* error handling callback helper: 5990 build arguments, call the callback and check the arguments, 5991 put the result into newpos and return the replacement string, which 5992 has to be freed by the caller */ 5993static PyObject * 5994unicode_translate_call_errorhandler(const char *errors, 5995 PyObject **errorHandler, 5996 const char *reason, 5997 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5998 Py_ssize_t startpos, Py_ssize_t endpos, 5999 Py_ssize_t *newpos) 6000{ 6001 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 6002 6003 Py_ssize_t i_newpos; 6004 PyObject *restuple; 6005 PyObject *resunicode; 6006 6007 if (*errorHandler == NULL) { 6008 *errorHandler = PyCodec_LookupError(errors); 6009 if (*errorHandler == NULL) 6010 return NULL; 6011 } 6012 6013 make_translate_exception(exceptionObject, 6014 unicode, size, startpos, endpos, reason); 6015 if (*exceptionObject == NULL) 6016 return NULL; 6017 6018 restuple = PyObject_CallFunctionObjArgs( 6019 *errorHandler, *exceptionObject, NULL); 6020 if (restuple == NULL) 6021 return NULL; 6022 if (!PyTuple_Check(restuple)) { 6023 PyErr_SetString(PyExc_TypeError, &argparse[4]); 6024 Py_DECREF(restuple); 6025 return NULL; 6026 } 6027 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 6028 &resunicode, &i_newpos)) { 6029 Py_DECREF(restuple); 6030 return NULL; 6031 } 6032 if (i_newpos<0) 6033 *newpos = size+i_newpos; 6034 else 6035 *newpos = i_newpos; 6036 if (*newpos<0 || *newpos>size) { 6037 PyErr_Format(PyExc_IndexError, 
"position %zd from error handler out of bounds", *newpos); 6038 Py_DECREF(restuple); 6039 return NULL; 6040 } 6041 Py_INCREF(resunicode); 6042 Py_DECREF(restuple); 6043 return resunicode; 6044} 6045 6046/* Lookup the character ch in the mapping and put the result in result, 6047 which must be decrefed by the caller. 6048 Return 0 on success, -1 on error */ 6049static int 6050charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 6051{ 6052 PyObject *w = PyLong_FromLong((long)c); 6053 PyObject *x; 6054 6055 if (w == NULL) 6056 return -1; 6057 x = PyObject_GetItem(mapping, w); 6058 Py_DECREF(w); 6059 if (x == NULL) { 6060 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 6061 /* No mapping found means: use 1:1 mapping. */ 6062 PyErr_Clear(); 6063 *result = NULL; 6064 return 0; 6065 } else 6066 return -1; 6067 } 6068 else if (x == Py_None) { 6069 *result = x; 6070 return 0; 6071 } 6072 else if (PyLong_Check(x)) { 6073 long value = PyLong_AS_LONG(x); 6074 long max = PyUnicode_GetMax(); 6075 if (value < 0 || value > max) { 6076 PyErr_Format(PyExc_TypeError, 6077 "character mapping must be in range(0x%x)", max+1); 6078 Py_DECREF(x); 6079 return -1; 6080 } 6081 *result = x; 6082 return 0; 6083 } 6084 else if (PyUnicode_Check(x)) { 6085 *result = x; 6086 return 0; 6087 } 6088 else { 6089 /* wrong return value */ 6090 PyErr_SetString(PyExc_TypeError, 6091 "character mapping must return integer, None or str"); 6092 Py_DECREF(x); 6093 return -1; 6094 } 6095} 6096/* ensure that *outobj is at least requiredsize characters long, 6097 if not reallocate and adjust various state variables. 
6098 Return 0 on success, -1 on error */ 6099static int 6100charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 6101 Py_ssize_t requiredsize) 6102{ 6103 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 6104 if (requiredsize > oldsize) { 6105 /* remember old output position */ 6106 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 6107 /* exponentially overallocate to minimize reallocations */ 6108 if (requiredsize < 2 * oldsize) 6109 requiredsize = 2 * oldsize; 6110 if (PyUnicode_Resize(outobj, requiredsize) < 0) 6111 return -1; 6112 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 6113 } 6114 return 0; 6115} 6116/* lookup the character, put the result in the output string and adjust 6117 various state variables. Return a new reference to the object that 6118 was put in the output buffer in *result, or Py_None, if the mapping was 6119 undefined (in which case no character was written). 6120 The called must decref result. 6121 Return 0 on success, -1 on error. */ 6122static int 6123charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 6124 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 6125 PyObject **res) 6126{ 6127 if (charmaptranslate_lookup(*curinp, mapping, res)) 6128 return -1; 6129 if (*res==NULL) { 6130 /* not found => default to 1:1 mapping */ 6131 *(*outp)++ = *curinp; 6132 } 6133 else if (*res==Py_None) 6134 ; 6135 else if (PyLong_Check(*res)) { 6136 /* no overflow check, because we know that the space is enough */ 6137 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 6138 } 6139 else if (PyUnicode_Check(*res)) { 6140 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 6141 if (repsize==1) { 6142 /* no overflow check, because we know that the space is enough */ 6143 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 6144 } 6145 else if (repsize!=0) { 6146 /* more than one character */ 6147 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 6148 (insize - (curinp-startinp)) + 6149 
repsize - 1; 6150 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 6151 return -1; 6152 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 6153 *outp += repsize; 6154 } 6155 } 6156 else 6157 return -1; 6158 return 0; 6159} 6160 6161PyObject * 6162PyUnicode_TranslateCharmap(const Py_UNICODE *p, 6163 Py_ssize_t size, 6164 PyObject *mapping, 6165 const char *errors) 6166{ 6167 /* output object */ 6168 PyObject *res = NULL; 6169 /* pointers to the beginning and end+1 of input */ 6170 const Py_UNICODE *startp = p; 6171 const Py_UNICODE *endp = p + size; 6172 /* pointer into the output */ 6173 Py_UNICODE *str; 6174 /* current output position */ 6175 Py_ssize_t respos = 0; 6176 char *reason = "character maps to <undefined>"; 6177 PyObject *errorHandler = NULL; 6178 PyObject *exc = NULL; 6179 /* the following variable is used for caching string comparisons 6180 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 6181 * 3=ignore, 4=xmlcharrefreplace */ 6182 int known_errorHandler = -1; 6183 6184 if (mapping == NULL) { 6185 PyErr_BadArgument(); 6186 return NULL; 6187 } 6188 6189 /* allocate enough for a simple 1:1 translation without 6190 replacements, if we need more, we'll resize */ 6191 res = PyUnicode_FromUnicode(NULL, size); 6192 if (res == NULL) 6193 goto onError; 6194 if (size == 0) 6195 return res; 6196 str = PyUnicode_AS_UNICODE(res); 6197 6198 while (p<endp) { 6199 /* try to encode it */ 6200 PyObject *x = NULL; 6201 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 6202 Py_XDECREF(x); 6203 goto onError; 6204 } 6205 Py_XDECREF(x); 6206 if (x!=Py_None) /* it worked => adjust input pointer */ 6207 ++p; 6208 else { /* untranslatable character */ 6209 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 6210 Py_ssize_t repsize; 6211 Py_ssize_t newpos; 6212 Py_UNICODE *uni2; 6213 /* startpos for collecting untranslatable chars */ 6214 const Py_UNICODE *collstart = p; 6215 const Py_UNICODE *collend = 
p+1; 6216 const Py_UNICODE *coll; 6217 6218 /* find all untranslatable characters */ 6219 while (collend < endp) { 6220 if (charmaptranslate_lookup(*collend, mapping, &x)) 6221 goto onError; 6222 Py_XDECREF(x); 6223 if (x!=Py_None) 6224 break; 6225 ++collend; 6226 } 6227 /* cache callback name lookup 6228 * (if not done yet, i.e. it's the first error) */ 6229 if (known_errorHandler==-1) { 6230 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6231 known_errorHandler = 1; 6232 else if (!strcmp(errors, "replace")) 6233 known_errorHandler = 2; 6234 else if (!strcmp(errors, "ignore")) 6235 known_errorHandler = 3; 6236 else if (!strcmp(errors, "xmlcharrefreplace")) 6237 known_errorHandler = 4; 6238 else 6239 known_errorHandler = 0; 6240 } 6241 switch (known_errorHandler) { 6242 case 1: /* strict */ 6243 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 6244 goto onError; 6245 case 2: /* replace */ 6246 /* No need to check for space, this is a 1:1 replacement */ 6247 for (coll = collstart; coll<collend; ++coll) 6248 *str++ = '?'; 6249 /* fall through */ 6250 case 3: /* ignore */ 6251 p = collend; 6252 break; 6253 case 4: /* xmlcharrefreplace */ 6254 /* generate replacement (temporarily (mis)uses p) */ 6255 for (p = collstart; p < collend; ++p) { 6256 char buffer[2+29+1+1]; 6257 char *cp; 6258 sprintf(buffer, "&#%d;", (int)*p); 6259 if (charmaptranslate_makespace(&res, &str, 6260 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 6261 goto onError; 6262 for (cp = buffer; *cp; ++cp) 6263 *str++ = *cp; 6264 } 6265 p = collend; 6266 break; 6267 default: 6268 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 6269 reason, startp, size, &exc, 6270 collstart-startp, collend-startp, &newpos); 6271 if (repunicode == NULL) 6272 goto onError; 6273 /* generate replacement */ 6274 repsize = PyUnicode_GET_SIZE(repunicode); 6275 if (charmaptranslate_makespace(&res, &str, 6276 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 6277 Py_DECREF(repunicode); 6278 goto onError; 6279 } 6280 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 6281 *str++ = *uni2; 6282 p = startp + newpos; 6283 Py_DECREF(repunicode); 6284 } 6285 } 6286 } 6287 /* Resize if we allocated to much */ 6288 respos = str-PyUnicode_AS_UNICODE(res); 6289 if (respos<PyUnicode_GET_SIZE(res)) { 6290 if (PyUnicode_Resize(&res, respos) < 0) 6291 goto onError; 6292 } 6293 Py_XDECREF(exc); 6294 Py_XDECREF(errorHandler); 6295 return res; 6296 6297 onError: 6298 Py_XDECREF(res); 6299 Py_XDECREF(exc); 6300 Py_XDECREF(errorHandler); 6301 return NULL; 6302} 6303 6304PyObject * 6305PyUnicode_Translate(PyObject *str, 6306 PyObject *mapping, 6307 const char *errors) 6308{ 6309 PyObject *result; 6310 6311 str = PyUnicode_FromObject(str); 6312 if (str == NULL) 6313 goto onError; 6314 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 6315 PyUnicode_GET_SIZE(str), 6316 mapping, 6317 errors); 6318 Py_DECREF(str); 6319 return result; 6320 6321 onError: 6322 Py_XDECREF(str); 6323 return NULL; 6324} 6325 6326PyObject * 6327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 6328 Py_ssize_t length) 6329{ 6330 PyObject *result; 6331 Py_UNICODE *p; /* write pointer into result */ 6332 Py_ssize_t i; 6333 /* Copy to a new string */ 6334 result = (PyObject *)_PyUnicode_New(length); 6335 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 6336 if (result == NULL) 6337 return result; 6338 p = PyUnicode_AS_UNICODE(result); 6339 /* Iterate over code points */ 6340 for (i = 0; i < length; i++) { 6341 Py_UNICODE ch =s[i]; 6342 if (ch > 127) { 6343 int decimal = Py_UNICODE_TODECIMAL(ch); 6344 if (decimal >= 0) 6345 p[i] = '0' + decimal; 6346 } 6347 } 6348 return result; 6349} 6350/* --- Decimal Encoder ---------------------------------------------------- */ 6351 6352int 6353PyUnicode_EncodeDecimal(Py_UNICODE *s, 6354 Py_ssize_t length, 6355 char *output, 6356 const 
char *errors)
{
    /* Encode the `length` code units at `s` into the char buffer
       `output`: whitespace becomes ' ', characters with a decimal digit
       value become '0'..'9', code units 1..255 are copied as-is, and
       anything else goes through the error handling selected by
       `errors`.  The output is NUL-terminated.  Returns 0 on success,
       -1 on error.
       NOTE(review): nothing in this function bounds writes to `output`;
       presumably the caller must size the buffer -- confirm, especially
       for the xmlcharrefreplace and custom-handler paths, which can
       write more than one char per input code unit. */
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable */
        collstart = p;
        collend = p+1;
        /* NOTE(review): this loop never advances collend -- its body is
           only a conditional break -- so it would spin forever if the
           condition were ever false.  The condition also does not
           mirror the three "encodable" tests above (ISSPACE is negated,
           TODECIMAL is tested for non-zero rather than >= 0).
           Confirm the intended run-collection logic. */
        while (collend < end) {
            if ((0 < *collend && *collend < 256) ||
                !Py_UNICODE_ISSPACE(*collend) ||
                Py_UNICODE_TODECIMAL(*collend))
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend; ++p)
                output += sprintf(output, "&#%d;", (int)*p);
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            if (!PyUnicode_Check(repunicode)) {
                /* Byte results not supported, since they have no decimal property. */
                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
                Py_DECREF(repunicode);
                goto onError;
            }
            /* generate replacement: the handler's string is encoded with
               the same three rules as the main loop; a character that
               still cannot be encoded is a hard error */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE ch = *uni2;
                if (Py_UNICODE_ISSPACE(ch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(ch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < ch && ch < 256)
                        *output++ = (char)ch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            /* resume at the position returned by the handler */
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}

/* --- Helpers ------------------------------------------------------------ */

#include "stringlib/unicodedefs.h"
#include "stringlib/fastsearch.h"

#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"

#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
#include "stringlib/localeutil.h"

/* helper macro to fixup start/end slice values:
   clamps `end` to `len`, adds `len` to negative `start`/`end`, and
   clamps anything still negative to 0.  Expands to bare statements that
   assign to its arguments, so it must be used as a statement on plain
   lvalues. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }

/* Count occurrences of `substr` in str[start:end]; returns the count,
   or -1 if either argument cannot be converted to a Unicode object. */
Py_ssize_t
PyUnicode_Count(PyObject *str,
                PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end)
{
    Py_ssize_t result;
6524 PyUnicodeObject* str_obj; 6525 PyUnicodeObject* sub_obj; 6526 6527 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6528 if (!str_obj) 6529 return -1; 6530 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6531 if (!sub_obj) { 6532 Py_DECREF(str_obj); 6533 return -1; 6534 } 6535 6536 ADJUST_INDICES(start, end, str_obj->length); 6537 result = stringlib_count( 6538 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6539 PY_SSIZE_T_MAX 6540 ); 6541 6542 Py_DECREF(sub_obj); 6543 Py_DECREF(str_obj); 6544 6545 return result; 6546} 6547 6548Py_ssize_t 6549PyUnicode_Find(PyObject *str, 6550 PyObject *sub, 6551 Py_ssize_t start, 6552 Py_ssize_t end, 6553 int direction) 6554{ 6555 Py_ssize_t result; 6556 6557 str = PyUnicode_FromObject(str); 6558 if (!str) 6559 return -2; 6560 sub = PyUnicode_FromObject(sub); 6561 if (!sub) { 6562 Py_DECREF(str); 6563 return -2; 6564 } 6565 6566 if (direction > 0) 6567 result = stringlib_find_slice( 6568 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6569 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6570 start, end 6571 ); 6572 else 6573 result = stringlib_rfind_slice( 6574 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6575 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6576 start, end 6577 ); 6578 6579 Py_DECREF(str); 6580 Py_DECREF(sub); 6581 6582 return result; 6583} 6584 6585static int 6586tailmatch(PyUnicodeObject *self, 6587 PyUnicodeObject *substring, 6588 Py_ssize_t start, 6589 Py_ssize_t end, 6590 int direction) 6591{ 6592 if (substring->length == 0) 6593 return 1; 6594 6595 ADJUST_INDICES(start, end, self->length); 6596 end -= substring->length; 6597 if (end < start) 6598 return 0; 6599 6600 if (direction > 0) { 6601 if (Py_UNICODE_MATCH(self, end, substring)) 6602 return 1; 6603 } else { 6604 if (Py_UNICODE_MATCH(self, start, substring)) 6605 return 1; 6606 } 6607 6608 return 0; 6609} 6610 6611Py_ssize_t 6612PyUnicode_Tailmatch(PyObject *str, 6613 PyObject *substr, 6614 
Py_ssize_t start, 6615 Py_ssize_t end, 6616 int direction) 6617{ 6618 Py_ssize_t result; 6619 6620 str = PyUnicode_FromObject(str); 6621 if (str == NULL) 6622 return -1; 6623 substr = PyUnicode_FromObject(substr); 6624 if (substr == NULL) { 6625 Py_DECREF(str); 6626 return -1; 6627 } 6628 6629 result = tailmatch((PyUnicodeObject *)str, 6630 (PyUnicodeObject *)substr, 6631 start, end, direction); 6632 Py_DECREF(str); 6633 Py_DECREF(substr); 6634 return result; 6635} 6636 6637/* Apply fixfct filter to the Unicode object self and return a 6638 reference to the modified object */ 6639 6640static PyObject * 6641fixup(PyUnicodeObject *self, 6642 int (*fixfct)(PyUnicodeObject *s)) 6643{ 6644 6645 PyUnicodeObject *u; 6646 6647 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6648 if (u == NULL) 6649 return NULL; 6650 6651 Py_UNICODE_COPY(u->str, self->str, self->length); 6652 6653 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6654 /* fixfct should return TRUE if it modified the buffer. 
If 6655 FALSE, return a reference to the original buffer instead 6656 (to save space, not time) */ 6657 Py_INCREF(self); 6658 Py_DECREF(u); 6659 return (PyObject*) self; 6660 } 6661 return (PyObject*) u; 6662} 6663 6664static int 6665fixupper(PyUnicodeObject *self) 6666{ 6667 Py_ssize_t len = self->length; 6668 Py_UNICODE *s = self->str; 6669 int status = 0; 6670 6671 while (len-- > 0) { 6672 register Py_UNICODE ch; 6673 6674 ch = Py_UNICODE_TOUPPER(*s); 6675 if (ch != *s) { 6676 status = 1; 6677 *s = ch; 6678 } 6679 s++; 6680 } 6681 6682 return status; 6683} 6684 6685static int 6686fixlower(PyUnicodeObject *self) 6687{ 6688 Py_ssize_t len = self->length; 6689 Py_UNICODE *s = self->str; 6690 int status = 0; 6691 6692 while (len-- > 0) { 6693 register Py_UNICODE ch; 6694 6695 ch = Py_UNICODE_TOLOWER(*s); 6696 if (ch != *s) { 6697 status = 1; 6698 *s = ch; 6699 } 6700 s++; 6701 } 6702 6703 return status; 6704} 6705 6706static int 6707fixswapcase(PyUnicodeObject *self) 6708{ 6709 Py_ssize_t len = self->length; 6710 Py_UNICODE *s = self->str; 6711 int status = 0; 6712 6713 while (len-- > 0) { 6714 if (Py_UNICODE_ISUPPER(*s)) { 6715 *s = Py_UNICODE_TOLOWER(*s); 6716 status = 1; 6717 } else if (Py_UNICODE_ISLOWER(*s)) { 6718 *s = Py_UNICODE_TOUPPER(*s); 6719 status = 1; 6720 } 6721 s++; 6722 } 6723 6724 return status; 6725} 6726 6727static int 6728fixcapitalize(PyUnicodeObject *self) 6729{ 6730 Py_ssize_t len = self->length; 6731 Py_UNICODE *s = self->str; 6732 int status = 0; 6733 6734 if (len == 0) 6735 return 0; 6736 if (!Py_UNICODE_ISUPPER(*s)) { 6737 *s = Py_UNICODE_TOUPPER(*s); 6738 status = 1; 6739 } 6740 s++; 6741 while (--len > 0) { 6742 if (!Py_UNICODE_ISLOWER(*s)) { 6743 *s = Py_UNICODE_TOLOWER(*s); 6744 status = 1; 6745 } 6746 s++; 6747 } 6748 return status; 6749} 6750 6751static int 6752fixtitle(PyUnicodeObject *self) 6753{ 6754 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6755 register Py_UNICODE *e; 6756 int previous_is_cased; 6757 6758 /* Shortcut 
for single character strings */ 6759 if (PyUnicode_GET_SIZE(self) == 1) { 6760 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6761 if (*p != ch) { 6762 *p = ch; 6763 return 1; 6764 } 6765 else 6766 return 0; 6767 } 6768 6769 e = p + PyUnicode_GET_SIZE(self); 6770 previous_is_cased = 0; 6771 for (; p < e; p++) { 6772 register const Py_UNICODE ch = *p; 6773 6774 if (previous_is_cased) 6775 *p = Py_UNICODE_TOLOWER(ch); 6776 else 6777 *p = Py_UNICODE_TOTITLE(ch); 6778 6779 if (Py_UNICODE_ISLOWER(ch) || 6780 Py_UNICODE_ISUPPER(ch) || 6781 Py_UNICODE_ISTITLE(ch)) 6782 previous_is_cased = 1; 6783 else 6784 previous_is_cased = 0; 6785 } 6786 return 1; 6787} 6788 6789PyObject * 6790PyUnicode_Join(PyObject *separator, PyObject *seq) 6791{ 6792 const Py_UNICODE blank = ' '; 6793 const Py_UNICODE *sep = ␣ 6794 Py_ssize_t seplen = 1; 6795 PyUnicodeObject *res = NULL; /* the result */ 6796 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6797 PyObject *fseq; /* PySequence_Fast(seq) */ 6798 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6799 PyObject **items; 6800 PyObject *item; 6801 Py_ssize_t sz, i; 6802 6803 fseq = PySequence_Fast(seq, ""); 6804 if (fseq == NULL) { 6805 return NULL; 6806 } 6807 6808 /* NOTE: the following code can't call back into Python code, 6809 * so we are sure that fseq won't be mutated. 6810 */ 6811 6812 seqlen = PySequence_Fast_GET_SIZE(fseq); 6813 /* If empty sequence, return u"". */ 6814 if (seqlen == 0) { 6815 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6816 goto Done; 6817 } 6818 items = PySequence_Fast_ITEMS(fseq); 6819 /* If singleton sequence with an exact Unicode, return that. 
*/ 6820 if (seqlen == 1) { 6821 item = items[0]; 6822 if (PyUnicode_CheckExact(item)) { 6823 Py_INCREF(item); 6824 res = (PyUnicodeObject *)item; 6825 goto Done; 6826 } 6827 } 6828 else { 6829 /* Set up sep and seplen */ 6830 if (separator == NULL) { 6831 sep = ␣ 6832 seplen = 1; 6833 } 6834 else { 6835 if (!PyUnicode_Check(separator)) { 6836 PyErr_Format(PyExc_TypeError, 6837 "separator: expected str instance," 6838 " %.80s found", 6839 Py_TYPE(separator)->tp_name); 6840 goto onError; 6841 } 6842 sep = PyUnicode_AS_UNICODE(separator); 6843 seplen = PyUnicode_GET_SIZE(separator); 6844 } 6845 } 6846 6847 /* There are at least two things to join, or else we have a subclass 6848 * of str in the sequence. 6849 * Do a pre-pass to figure out the total amount of space we'll 6850 * need (sz), and see whether all argument are strings. 6851 */ 6852 sz = 0; 6853 for (i = 0; i < seqlen; i++) { 6854 const Py_ssize_t old_sz = sz; 6855 item = items[i]; 6856 if (!PyUnicode_Check(item)) { 6857 PyErr_Format(PyExc_TypeError, 6858 "sequence item %zd: expected str instance," 6859 " %.80s found", 6860 i, Py_TYPE(item)->tp_name); 6861 goto onError; 6862 } 6863 sz += PyUnicode_GET_SIZE(item); 6864 if (i != 0) 6865 sz += seplen; 6866 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6867 PyErr_SetString(PyExc_OverflowError, 6868 "join() result is too long for a Python string"); 6869 goto onError; 6870 } 6871 } 6872 6873 res = _PyUnicode_New(sz); 6874 if (res == NULL) 6875 goto onError; 6876 6877 /* Catenate everything. */ 6878 res_p = PyUnicode_AS_UNICODE(res); 6879 for (i = 0; i < seqlen; ++i) { 6880 Py_ssize_t itemlen; 6881 item = items[i]; 6882 itemlen = PyUnicode_GET_SIZE(item); 6883 /* Copy item, and maybe the separator. 
*/ 6884 if (i) { 6885 Py_UNICODE_COPY(res_p, sep, seplen); 6886 res_p += seplen; 6887 } 6888 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6889 res_p += itemlen; 6890 } 6891 6892 Done: 6893 Py_DECREF(fseq); 6894 return (PyObject *)res; 6895 6896 onError: 6897 Py_DECREF(fseq); 6898 Py_XDECREF(res); 6899 return NULL; 6900} 6901 6902static PyUnicodeObject * 6903pad(PyUnicodeObject *self, 6904 Py_ssize_t left, 6905 Py_ssize_t right, 6906 Py_UNICODE fill) 6907{ 6908 PyUnicodeObject *u; 6909 6910 if (left < 0) 6911 left = 0; 6912 if (right < 0) 6913 right = 0; 6914 6915 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6916 Py_INCREF(self); 6917 return self; 6918 } 6919 6920 if (left > PY_SSIZE_T_MAX - self->length || 6921 right > PY_SSIZE_T_MAX - (left + self->length)) { 6922 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6923 return NULL; 6924 } 6925 u = _PyUnicode_New(left + self->length + right); 6926 if (u) { 6927 if (left) 6928 Py_UNICODE_FILL(u->str, fill, left); 6929 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6930 if (right) 6931 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6932 } 6933 6934 return u; 6935} 6936 6937PyObject * 6938PyUnicode_Splitlines(PyObject *string, int keepends) 6939{ 6940 PyObject *list; 6941 6942 string = PyUnicode_FromObject(string); 6943 if (string == NULL) 6944 return NULL; 6945 6946 list = stringlib_splitlines( 6947 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6948 PyUnicode_GET_SIZE(string), keepends); 6949 6950 Py_DECREF(string); 6951 return list; 6952} 6953 6954static PyObject * 6955split(PyUnicodeObject *self, 6956 PyUnicodeObject *substring, 6957 Py_ssize_t maxcount) 6958{ 6959 if (maxcount < 0) 6960 maxcount = PY_SSIZE_T_MAX; 6961 6962 if (substring == NULL) 6963 return stringlib_split_whitespace( 6964 (PyObject*) self, self->str, self->length, maxcount 6965 ); 6966 6967 return stringlib_split( 6968 (PyObject*) self, self->str, self->length, 6969 
        substring->str, substring->length,
        maxcount
        );
}

/* Return a copy of `self` with at most `maxcount` occurrences of `str1`
   replaced by `str2` (a negative `maxcount` means PY_SSIZE_T_MAX).

   Three strategies are used:
   - equal lengths, one character: copy and patch single code units;
   - equal lengths, longer: copy and overwrite each match in place;
   - different lengths: count the matches, allocate the final size and
     assemble the result piecewise (an empty `str1` interleaves `str2`
     between the characters of `self`).

   When there is nothing to replace, an exact Unicode `self` is returned
   with a new reference; otherwise a copy is returned.  Returns NULL on
   allocation failure or, with OverflowError set, when the result size
   overflows. */
static PyObject *
replace(PyUnicodeObject *self,
        PyUnicodeObject *str1,
        PyUnicodeObject *str2,
        Py_ssize_t maxcount)
{
    PyUnicodeObject *u;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || self->length == 0)
        goto nothing;

    if (str1->length == str2->length) {
        Py_ssize_t i;
        /* same length */
        if (str1->length == 0)
            goto nothing;
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
        } else {
            i = stringlib_find(
                self->str, self->length, str1->str, str1->length, 0
                );
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);

            /* change everything in-place, starting with this one */
            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
            i += str1->length;

            /* the first match was already consumed above, hence the
               pre-decrement */
            while ( --maxcount > 0) {
                i = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (i == -1)
                    break;
                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                i += str1->length;
            }
        }
    } else {

        Py_ssize_t n, i, j;
        Py_ssize_t product, new_size, delta;
        Py_UNICODE *p;

        /* replace strings */
        n = stringlib_count(self->str, self->length, str1->str, str1->length,
                            maxcount);
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
        delta = (str2->length - str1->length);
        if (delta == 0) {
            new_size = self->length;
        } else {
            /* NOTE(review): the overflow tests below run after the
               signed multiplication/addition has already happened;
               consider checking against PY_SSIZE_T_MAX beforehand. */
            product = n * (str2->length - str1->length);
            if ((product / (str2->length - str1->length)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + product;
            if (new_size < 0) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        i = 0;
        p = u->str;
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
                }
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
            }
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
        }
    }
    return (PyObject *) u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
}

/* --- Unicode Object Methods --------------------------------------------- */

PyDoc_STRVAR(title__doc__,
             "S.title() -> str\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* str.title(): apply the fixtitle filter through fixup(). */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}

PyDoc_STRVAR(capitalize__doc__,
             "S.capitalize() -> str\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case and the rest lower case.");

/* str.capitalize(): apply the fixcapitalize filter through fixup(). */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}

#if 0
/* Disabled code: split on whitespace, capitalize every word, and join
   the words again with single spaces. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif

/* Argument converter.
Coerces to a single unicode character */ 7199 7200static int 7201convert_uc(PyObject *obj, void *addr) 7202{ 7203 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 7204 PyObject *uniobj; 7205 Py_UNICODE *unistr; 7206 7207 uniobj = PyUnicode_FromObject(obj); 7208 if (uniobj == NULL) { 7209 PyErr_SetString(PyExc_TypeError, 7210 "The fill character cannot be converted to Unicode"); 7211 return 0; 7212 } 7213 if (PyUnicode_GET_SIZE(uniobj) != 1) { 7214 PyErr_SetString(PyExc_TypeError, 7215 "The fill character must be exactly one character long"); 7216 Py_DECREF(uniobj); 7217 return 0; 7218 } 7219 unistr = PyUnicode_AS_UNICODE(uniobj); 7220 *fillcharloc = unistr[0]; 7221 Py_DECREF(uniobj); 7222 return 1; 7223} 7224 7225PyDoc_STRVAR(center__doc__, 7226 "S.center(width[, fillchar]) -> str\n\ 7227\n\ 7228Return S centered in a string of length width. Padding is\n\ 7229done using the specified fill character (default is a space)"); 7230 7231static PyObject * 7232unicode_center(PyUnicodeObject *self, PyObject *args) 7233{ 7234 Py_ssize_t marg, left; 7235 Py_ssize_t width; 7236 Py_UNICODE fillchar = ' '; 7237 7238 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 7239 return NULL; 7240 7241 if (self->length >= width && PyUnicode_CheckExact(self)) { 7242 Py_INCREF(self); 7243 return (PyObject*) self; 7244 } 7245 7246 marg = width - self->length; 7247 left = marg / 2 + (marg & width & 1); 7248 7249 return (PyObject*) pad(self, left, marg - left, fillchar); 7250} 7251 7252#if 0 7253 7254/* This code should go into some future Unicode collation support 7255 module. The basic comparison should compare ordinals on a naive 7256 basis (this is what Java does and thus Jython too). 
*/ 7257 7258/* speedy UTF-16 code point order comparison */ 7259/* gleaned from: */ 7260/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 7261 7262static short utf16Fixup[32] = 7263{ 7264 0, 0, 0, 0, 0, 0, 0, 0, 7265 0, 0, 0, 0, 0, 0, 0, 0, 7266 0, 0, 0, 0, 0, 0, 0, 0, 7267 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 7268}; 7269 7270static int 7271unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7272{ 7273 Py_ssize_t len1, len2; 7274 7275 Py_UNICODE *s1 = str1->str; 7276 Py_UNICODE *s2 = str2->str; 7277 7278 len1 = str1->length; 7279 len2 = str2->length; 7280 7281 while (len1 > 0 && len2 > 0) { 7282 Py_UNICODE c1, c2; 7283 7284 c1 = *s1++; 7285 c2 = *s2++; 7286 7287 if (c1 > (1<<11) * 26) 7288 c1 += utf16Fixup[c1>>11]; 7289 if (c2 > (1<<11) * 26) 7290 c2 += utf16Fixup[c2>>11]; 7291 /* now c1 and c2 are in UTF-32-compatible order */ 7292 7293 if (c1 != c2) 7294 return (c1 < c2) ? -1 : 1; 7295 7296 len1--; len2--; 7297 } 7298 7299 return (len1 < len2) ? -1 : (len1 != len2); 7300} 7301 7302#else 7303 7304static int 7305unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7306{ 7307 register Py_ssize_t len1, len2; 7308 7309 Py_UNICODE *s1 = str1->str; 7310 Py_UNICODE *s2 = str2->str; 7311 7312 len1 = str1->length; 7313 len2 = str2->length; 7314 7315 while (len1 > 0 && len2 > 0) { 7316 Py_UNICODE c1, c2; 7317 7318 c1 = *s1++; 7319 c2 = *s2++; 7320 7321 if (c1 != c2) 7322 return (c1 < c2) ? -1 : 1; 7323 7324 len1--; len2--; 7325 } 7326 7327 return (len1 < len2) ? 
-1 : (len1 != len2); 7328} 7329 7330#endif 7331 7332int 7333PyUnicode_Compare(PyObject *left, PyObject *right) 7334{ 7335 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 7336 return unicode_compare((PyUnicodeObject *)left, 7337 (PyUnicodeObject *)right); 7338 PyErr_Format(PyExc_TypeError, 7339 "Can't compare %.100s and %.100s", 7340 left->ob_type->tp_name, 7341 right->ob_type->tp_name); 7342 return -1; 7343} 7344 7345int 7346PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 7347{ 7348 int i; 7349 Py_UNICODE *id; 7350 assert(PyUnicode_Check(uni)); 7351 id = PyUnicode_AS_UNICODE(uni); 7352 /* Compare Unicode string and source character set string */ 7353 for (i = 0; id[i] && str[i]; i++) 7354 if (id[i] != str[i]) 7355 return ((int)id[i] < (int)str[i]) ? -1 : 1; 7356 /* This check keeps Python strings that end in '\0' from comparing equal 7357 to C strings identical up to that point. */ 7358 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 7359 return 1; /* uni is longer */ 7360 if (str[i]) 7361 return -1; /* str is longer */ 7362 return 0; 7363} 7364 7365 7366#define TEST_COND(cond) \ 7367 ((cond) ? 
Py_True : Py_False) 7368 7369PyObject * 7370PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 7371{ 7372 int result; 7373 7374 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 7375 PyObject *v; 7376 if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) { 7377 if (op == Py_EQ) { 7378 Py_INCREF(Py_False); 7379 return Py_False; 7380 } 7381 if (op == Py_NE) { 7382 Py_INCREF(Py_True); 7383 return Py_True; 7384 } 7385 } 7386 if (left == right) 7387 result = 0; 7388 else 7389 result = unicode_compare((PyUnicodeObject *)left, 7390 (PyUnicodeObject *)right); 7391 7392 /* Convert the return value to a Boolean */ 7393 switch (op) { 7394 case Py_EQ: 7395 v = TEST_COND(result == 0); 7396 break; 7397 case Py_NE: 7398 v = TEST_COND(result != 0); 7399 break; 7400 case Py_LE: 7401 v = TEST_COND(result <= 0); 7402 break; 7403 case Py_GE: 7404 v = TEST_COND(result >= 0); 7405 break; 7406 case Py_LT: 7407 v = TEST_COND(result == -1); 7408 break; 7409 case Py_GT: 7410 v = TEST_COND(result == 1); 7411 break; 7412 default: 7413 PyErr_BadArgument(); 7414 return NULL; 7415 } 7416 Py_INCREF(v); 7417 return v; 7418 } 7419 7420 Py_RETURN_NOTIMPLEMENTED; 7421} 7422 7423int 7424PyUnicode_Contains(PyObject *container, PyObject *element) 7425{ 7426 PyObject *str, *sub; 7427 int result; 7428 7429 /* Coerce the two arguments */ 7430 sub = PyUnicode_FromObject(element); 7431 if (!sub) { 7432 PyErr_Format(PyExc_TypeError, 7433 "'in <string>' requires string as left operand, not %s", 7434 element->ob_type->tp_name); 7435 return -1; 7436 } 7437 7438 str = PyUnicode_FromObject(container); 7439 if (!str) { 7440 Py_DECREF(sub); 7441 return -1; 7442 } 7443 7444 result = stringlib_contains_obj(str, sub); 7445 7446 Py_DECREF(str); 7447 Py_DECREF(sub); 7448 7449 return result; 7450} 7451 7452/* Concat to string or Unicode object giving a new Unicode object. 
*/ 7453 7454PyObject * 7455PyUnicode_Concat(PyObject *left, PyObject *right) 7456{ 7457 PyUnicodeObject *u = NULL, *v = NULL, *w; 7458 7459 /* Coerce the two arguments */ 7460 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7461 if (u == NULL) 7462 goto onError; 7463 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7464 if (v == NULL) 7465 goto onError; 7466 7467 /* Shortcuts */ 7468 if (v == unicode_empty) { 7469 Py_DECREF(v); 7470 return (PyObject *)u; 7471 } 7472 if (u == unicode_empty) { 7473 Py_DECREF(u); 7474 return (PyObject *)v; 7475 } 7476 7477 /* Concat the two Unicode strings */ 7478 w = _PyUnicode_New(u->length + v->length); 7479 if (w == NULL) 7480 goto onError; 7481 Py_UNICODE_COPY(w->str, u->str, u->length); 7482 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7483 7484 Py_DECREF(u); 7485 Py_DECREF(v); 7486 return (PyObject *)w; 7487 7488 onError: 7489 Py_XDECREF(u); 7490 Py_XDECREF(v); 7491 return NULL; 7492} 7493 7494void 7495PyUnicode_Append(PyObject **pleft, PyObject *right) 7496{ 7497 PyObject *new; 7498 if (*pleft == NULL) 7499 return; 7500 if (right == NULL || !PyUnicode_Check(*pleft)) { 7501 Py_DECREF(*pleft); 7502 *pleft = NULL; 7503 return; 7504 } 7505 new = PyUnicode_Concat(*pleft, right); 7506 Py_DECREF(*pleft); 7507 *pleft = new; 7508} 7509 7510void 7511PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7512{ 7513 PyUnicode_Append(pleft, right); 7514 Py_XDECREF(right); 7515} 7516 7517PyDoc_STRVAR(count__doc__, 7518 "S.count(sub[, start[, end]]) -> int\n\ 7519\n\ 7520Return the number of non-overlapping occurrences of substring sub in\n\ 7521string S[start:end]. 
Optional arguments start and end are\n\ 7522interpreted as in slice notation."); 7523 7524static PyObject * 7525unicode_count(PyUnicodeObject *self, PyObject *args) 7526{ 7527 PyUnicodeObject *substring; 7528 Py_ssize_t start = 0; 7529 Py_ssize_t end = PY_SSIZE_T_MAX; 7530 PyObject *result; 7531 7532 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 7533 &start, &end)) 7534 return NULL; 7535 7536 ADJUST_INDICES(start, end, self->length); 7537 result = PyLong_FromSsize_t( 7538 stringlib_count(self->str + start, end - start, 7539 substring->str, substring->length, 7540 PY_SSIZE_T_MAX) 7541 ); 7542 7543 Py_DECREF(substring); 7544 7545 return result; 7546} 7547 7548PyDoc_STRVAR(encode__doc__, 7549 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 7550\n\ 7551Encode S using the codec registered for encoding. Default encoding\n\ 7552is 'utf-8'. errors may be given to set a different error\n\ 7553handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7554a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 7555'xmlcharrefreplace' as well as any other name registered with\n\ 7556codecs.register_error that can handle UnicodeEncodeErrors."); 7557 7558static PyObject * 7559unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7560{ 7561 static char *kwlist[] = {"encoding", "errors", 0}; 7562 char *encoding = NULL; 7563 char *errors = NULL; 7564 7565 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7566 kwlist, &encoding, &errors)) 7567 return NULL; 7568 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7569} 7570 7571PyDoc_STRVAR(expandtabs__doc__, 7572 "S.expandtabs([tabsize]) -> str\n\ 7573\n\ 7574Return a copy of S where all tab characters are expanded using spaces.\n\ 7575If tabsize is not given, a tab size of 8 characters is assumed."); 7576 7577static PyObject* 7578unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7579{ 7580 Py_UNICODE *e; 7581 Py_UNICODE *p; 7582 Py_UNICODE *q; 7583 Py_UNICODE *qe; 7584 Py_ssize_t i, j, incr; 7585 PyUnicodeObject *u; 7586 int tabsize = 8; 7587 7588 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7589 return NULL; 7590 7591 /* First pass: determine size of output string */ 7592 i = 0; /* chars up to and including most recent \n or \r */ 7593 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7594 e = self->str + self->length; /* end of input */ 7595 for (p = self->str; p < e; p++) 7596 if (*p == '\t') { 7597 if (tabsize > 0) { 7598 incr = tabsize - (j % tabsize); /* cannot overflow */ 7599 if (j > PY_SSIZE_T_MAX - incr) 7600 goto overflow1; 7601 j += incr; 7602 } 7603 } 7604 else { 7605 if (j > PY_SSIZE_T_MAX - 1) 7606 goto overflow1; 7607 j++; 7608 if (*p == '\n' || *p == '\r') { 7609 if (i > PY_SSIZE_T_MAX - j) 7610 goto overflow1; 7611 i += j; 7612 j = 0; 7613 } 7614 } 7615 7616 if (i > PY_SSIZE_T_MAX - j) 7617 goto overflow1; 7618 7619 /* Second pass: create output string and fill it */ 
7620 u = _PyUnicode_New(i + j); 7621 if (!u) 7622 return NULL; 7623 7624 j = 0; /* same as in first pass */ 7625 q = u->str; /* next output char */ 7626 qe = u->str + u->length; /* end of output */ 7627 7628 for (p = self->str; p < e; p++) 7629 if (*p == '\t') { 7630 if (tabsize > 0) { 7631 i = tabsize - (j % tabsize); 7632 j += i; 7633 while (i--) { 7634 if (q >= qe) 7635 goto overflow2; 7636 *q++ = ' '; 7637 } 7638 } 7639 } 7640 else { 7641 if (q >= qe) 7642 goto overflow2; 7643 *q++ = *p; 7644 j++; 7645 if (*p == '\n' || *p == '\r') 7646 j = 0; 7647 } 7648 7649 return (PyObject*) u; 7650 7651 overflow2: 7652 Py_DECREF(u); 7653 overflow1: 7654 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7655 return NULL; 7656} 7657 7658PyDoc_STRVAR(find__doc__, 7659 "S.find(sub[, start[, end]]) -> int\n\ 7660\n\ 7661Return the lowest index in S where substring sub is found,\n\ 7662such that sub is contained within S[start:end]. Optional\n\ 7663arguments start and end are interpreted as in slice notation.\n\ 7664\n\ 7665Return -1 on failure."); 7666 7667static PyObject * 7668unicode_find(PyUnicodeObject *self, PyObject *args) 7669{ 7670 PyUnicodeObject *substring; 7671 Py_ssize_t start; 7672 Py_ssize_t end; 7673 Py_ssize_t result; 7674 7675 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 7676 &start, &end)) 7677 return NULL; 7678 7679 result = stringlib_find_slice( 7680 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7681 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7682 start, end 7683 ); 7684 7685 Py_DECREF(substring); 7686 7687 return PyLong_FromSsize_t(result); 7688} 7689 7690static PyObject * 7691unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7692{ 7693 if (index < 0 || index >= self->length) { 7694 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7695 return NULL; 7696 } 7697 7698 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7699} 7700 7701/* Believe it or not, this 
produces the same value for ASCII strings 7702 as string_hash(). */ 7703static Py_hash_t 7704unicode_hash(PyUnicodeObject *self) 7705{ 7706 Py_ssize_t len; 7707 Py_UNICODE *p; 7708 Py_hash_t x; 7709 7710 if (self->hash != -1) 7711 return self->hash; 7712 len = Py_SIZE(self); 7713 p = self->str; 7714 x = *p << 7; 7715 while (--len >= 0) 7716 x = (1000003*x) ^ *p++; 7717 x ^= Py_SIZE(self); 7718 if (x == -1) 7719 x = -2; 7720 self->hash = x; 7721 return x; 7722} 7723 7724PyDoc_STRVAR(index__doc__, 7725 "S.index(sub[, start[, end]]) -> int\n\ 7726\n\ 7727Like S.find() but raise ValueError when the substring is not found."); 7728 7729static PyObject * 7730unicode_index(PyUnicodeObject *self, PyObject *args) 7731{ 7732 Py_ssize_t result; 7733 PyUnicodeObject *substring; 7734 Py_ssize_t start; 7735 Py_ssize_t end; 7736 7737 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 7738 &start, &end)) 7739 return NULL; 7740 7741 result = stringlib_find_slice( 7742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7744 start, end 7745 ); 7746 7747 Py_DECREF(substring); 7748 7749 if (result < 0) { 7750 PyErr_SetString(PyExc_ValueError, "substring not found"); 7751 return NULL; 7752 } 7753 7754 return PyLong_FromSsize_t(result); 7755} 7756 7757PyDoc_STRVAR(islower__doc__, 7758 "S.islower() -> bool\n\ 7759\n\ 7760Return True if all cased characters in S are lowercase and there is\n\ 7761at least one cased character in S, False otherwise."); 7762 7763static PyObject* 7764unicode_islower(PyUnicodeObject *self) 7765{ 7766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7767 register const Py_UNICODE *e; 7768 int cased; 7769 7770 /* Shortcut for single character strings */ 7771 if (PyUnicode_GET_SIZE(self) == 1) 7772 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7773 7774 /* Special case for empty strings */ 7775 if (PyUnicode_GET_SIZE(self) == 0) 7776 return PyBool_FromLong(0); 7777 
7778 e = p + PyUnicode_GET_SIZE(self); 7779 cased = 0; 7780 for (; p < e; p++) { 7781 register const Py_UNICODE ch = *p; 7782 7783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7784 return PyBool_FromLong(0); 7785 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7786 cased = 1; 7787 } 7788 return PyBool_FromLong(cased); 7789} 7790 7791PyDoc_STRVAR(isupper__doc__, 7792 "S.isupper() -> bool\n\ 7793\n\ 7794Return True if all cased characters in S are uppercase and there is\n\ 7795at least one cased character in S, False otherwise."); 7796 7797static PyObject* 7798unicode_isupper(PyUnicodeObject *self) 7799{ 7800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7801 register const Py_UNICODE *e; 7802 int cased; 7803 7804 /* Shortcut for single character strings */ 7805 if (PyUnicode_GET_SIZE(self) == 1) 7806 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7807 7808 /* Special case for empty strings */ 7809 if (PyUnicode_GET_SIZE(self) == 0) 7810 return PyBool_FromLong(0); 7811 7812 e = p + PyUnicode_GET_SIZE(self); 7813 cased = 0; 7814 for (; p < e; p++) { 7815 register const Py_UNICODE ch = *p; 7816 7817 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7818 return PyBool_FromLong(0); 7819 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7820 cased = 1; 7821 } 7822 return PyBool_FromLong(cased); 7823} 7824 7825PyDoc_STRVAR(istitle__doc__, 7826 "S.istitle() -> bool\n\ 7827\n\ 7828Return True if S is a titlecased string and there is at least one\n\ 7829character in S, i.e. 
upper- and titlecase characters may only\n\ 7830follow uncased characters and lowercase characters only cased ones.\n\ 7831Return False otherwise."); 7832 7833static PyObject* 7834unicode_istitle(PyUnicodeObject *self) 7835{ 7836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7837 register const Py_UNICODE *e; 7838 int cased, previous_is_cased; 7839 7840 /* Shortcut for single character strings */ 7841 if (PyUnicode_GET_SIZE(self) == 1) 7842 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7843 (Py_UNICODE_ISUPPER(*p) != 0)); 7844 7845 /* Special case for empty strings */ 7846 if (PyUnicode_GET_SIZE(self) == 0) 7847 return PyBool_FromLong(0); 7848 7849 e = p + PyUnicode_GET_SIZE(self); 7850 cased = 0; 7851 previous_is_cased = 0; 7852 for (; p < e; p++) { 7853 register const Py_UNICODE ch = *p; 7854 7855 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7856 if (previous_is_cased) 7857 return PyBool_FromLong(0); 7858 previous_is_cased = 1; 7859 cased = 1; 7860 } 7861 else if (Py_UNICODE_ISLOWER(ch)) { 7862 if (!previous_is_cased) 7863 return PyBool_FromLong(0); 7864 previous_is_cased = 1; 7865 cased = 1; 7866 } 7867 else 7868 previous_is_cased = 0; 7869 } 7870 return PyBool_FromLong(cased); 7871} 7872 7873PyDoc_STRVAR(isspace__doc__, 7874 "S.isspace() -> bool\n\ 7875\n\ 7876Return True if all characters in S are whitespace\n\ 7877and there is at least one character in S, False otherwise."); 7878 7879static PyObject* 7880unicode_isspace(PyUnicodeObject *self) 7881{ 7882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7883 register const Py_UNICODE *e; 7884 7885 /* Shortcut for single character strings */ 7886 if (PyUnicode_GET_SIZE(self) == 1 && 7887 Py_UNICODE_ISSPACE(*p)) 7888 return PyBool_FromLong(1); 7889 7890 /* Special case for empty strings */ 7891 if (PyUnicode_GET_SIZE(self) == 0) 7892 return PyBool_FromLong(0); 7893 7894 e = p + PyUnicode_GET_SIZE(self); 7895 for (; p < e; p++) { 7896 if (!Py_UNICODE_ISSPACE(*p)) 7897 
return PyBool_FromLong(0); 7898 } 7899 return PyBool_FromLong(1); 7900} 7901 7902PyDoc_STRVAR(isalpha__doc__, 7903 "S.isalpha() -> bool\n\ 7904\n\ 7905Return True if all characters in S are alphabetic\n\ 7906and there is at least one character in S, False otherwise."); 7907 7908static PyObject* 7909unicode_isalpha(PyUnicodeObject *self) 7910{ 7911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7912 register const Py_UNICODE *e; 7913 7914 /* Shortcut for single character strings */ 7915 if (PyUnicode_GET_SIZE(self) == 1 && 7916 Py_UNICODE_ISALPHA(*p)) 7917 return PyBool_FromLong(1); 7918 7919 /* Special case for empty strings */ 7920 if (PyUnicode_GET_SIZE(self) == 0) 7921 return PyBool_FromLong(0); 7922 7923 e = p + PyUnicode_GET_SIZE(self); 7924 for (; p < e; p++) { 7925 if (!Py_UNICODE_ISALPHA(*p)) 7926 return PyBool_FromLong(0); 7927 } 7928 return PyBool_FromLong(1); 7929} 7930 7931PyDoc_STRVAR(isalnum__doc__, 7932 "S.isalnum() -> bool\n\ 7933\n\ 7934Return True if all characters in S are alphanumeric\n\ 7935and there is at least one character in S, False otherwise."); 7936 7937static PyObject* 7938unicode_isalnum(PyUnicodeObject *self) 7939{ 7940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7941 register const Py_UNICODE *e; 7942 7943 /* Shortcut for single character strings */ 7944 if (PyUnicode_GET_SIZE(self) == 1 && 7945 Py_UNICODE_ISALNUM(*p)) 7946 return PyBool_FromLong(1); 7947 7948 /* Special case for empty strings */ 7949 if (PyUnicode_GET_SIZE(self) == 0) 7950 return PyBool_FromLong(0); 7951 7952 e = p + PyUnicode_GET_SIZE(self); 7953 for (; p < e; p++) { 7954 if (!Py_UNICODE_ISALNUM(*p)) 7955 return PyBool_FromLong(0); 7956 } 7957 return PyBool_FromLong(1); 7958} 7959 7960PyDoc_STRVAR(isdecimal__doc__, 7961 "S.isdecimal() -> bool\n\ 7962\n\ 7963Return True if there are only decimal characters in S,\n\ 7964False otherwise."); 7965 7966static PyObject* 7967unicode_isdecimal(PyUnicodeObject *self) 7968{ 7969 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7970 register const Py_UNICODE *e; 7971 7972 /* Shortcut for single character strings */ 7973 if (PyUnicode_GET_SIZE(self) == 1 && 7974 Py_UNICODE_ISDECIMAL(*p)) 7975 return PyBool_FromLong(1); 7976 7977 /* Special case for empty strings */ 7978 if (PyUnicode_GET_SIZE(self) == 0) 7979 return PyBool_FromLong(0); 7980 7981 e = p + PyUnicode_GET_SIZE(self); 7982 for (; p < e; p++) { 7983 if (!Py_UNICODE_ISDECIMAL(*p)) 7984 return PyBool_FromLong(0); 7985 } 7986 return PyBool_FromLong(1); 7987} 7988 7989PyDoc_STRVAR(isdigit__doc__, 7990 "S.isdigit() -> bool\n\ 7991\n\ 7992Return True if all characters in S are digits\n\ 7993and there is at least one character in S, False otherwise."); 7994 7995static PyObject* 7996unicode_isdigit(PyUnicodeObject *self) 7997{ 7998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7999 register const Py_UNICODE *e; 8000 8001 /* Shortcut for single character strings */ 8002 if (PyUnicode_GET_SIZE(self) == 1 && 8003 Py_UNICODE_ISDIGIT(*p)) 8004 return PyBool_FromLong(1); 8005 8006 /* Special case for empty strings */ 8007 if (PyUnicode_GET_SIZE(self) == 0) 8008 return PyBool_FromLong(0); 8009 8010 e = p + PyUnicode_GET_SIZE(self); 8011 for (; p < e; p++) { 8012 if (!Py_UNICODE_ISDIGIT(*p)) 8013 return PyBool_FromLong(0); 8014 } 8015 return PyBool_FromLong(1); 8016} 8017 8018PyDoc_STRVAR(isnumeric__doc__, 8019 "S.isnumeric() -> bool\n\ 8020\n\ 8021Return True if there are only numeric characters in S,\n\ 8022False otherwise."); 8023 8024static PyObject* 8025unicode_isnumeric(PyUnicodeObject *self) 8026{ 8027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 8028 register const Py_UNICODE *e; 8029 8030 /* Shortcut for single character strings */ 8031 if (PyUnicode_GET_SIZE(self) == 1 && 8032 Py_UNICODE_ISNUMERIC(*p)) 8033 return PyBool_FromLong(1); 8034 8035 /* Special case for empty strings */ 8036 if (PyUnicode_GET_SIZE(self) == 0) 8037 return PyBool_FromLong(0); 8038 8039 e = p 
+ PyUnicode_GET_SIZE(self); 8040 for (; p < e; p++) { 8041 if (!Py_UNICODE_ISNUMERIC(*p)) 8042 return PyBool_FromLong(0); 8043 } 8044 return PyBool_FromLong(1); 8045} 8046 8047static Py_UCS4 8048decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size) 8049{ 8050 Py_UCS4 ch; 8051 assert(*i < size); 8052 ch = s[(*i)++]; 8053#ifndef Py_UNICODE_WIDE 8054 if ((ch & 0xfffffc00) == 0xd800 && 8055 *i < size 8056 && (s[*i] & 0xFFFFFC00) == 0xDC00) 8057 ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00; 8058#endif 8059 return ch; 8060} 8061 8062int 8063PyUnicode_IsIdentifier(PyObject *self) 8064{ 8065 Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self); 8066 Py_UCS4 first; 8067 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 8068 8069 /* Special case for empty strings */ 8070 if (!size) 8071 return 0; 8072 8073 /* PEP 3131 says that the first character must be in 8074 XID_Start and subsequent characters in XID_Continue, 8075 and for the ASCII range, the 2.x rules apply (i.e 8076 start with letters and underscore, continue with 8077 letters, digits, underscore). However, given the current 8078 definition of XID_Start and XID_Continue, it is sufficient 8079 to check just for these, except that _ must be allowed 8080 as starting an identifier. 
*/ 8081 first = decode_ucs4(p, &i, size); 8082 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 8083 return 0; 8084 8085 while (i < size) 8086 if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size))) 8087 return 0; 8088 return 1; 8089} 8090 8091PyDoc_STRVAR(isidentifier__doc__, 8092 "S.isidentifier() -> bool\n\ 8093\n\ 8094Return True if S is a valid identifier according\n\ 8095to the language definition."); 8096 8097static PyObject* 8098unicode_isidentifier(PyObject *self) 8099{ 8100 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 8101} 8102 8103PyDoc_STRVAR(isprintable__doc__, 8104 "S.isprintable() -> bool\n\ 8105\n\ 8106Return True if all characters in S are considered\n\ 8107printable in repr() or S is empty, False otherwise."); 8108 8109static PyObject* 8110unicode_isprintable(PyObject *self) 8111{ 8112 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 8113 register const Py_UNICODE *e; 8114 8115 /* Shortcut for single character strings */ 8116 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 8117 Py_RETURN_TRUE; 8118 } 8119 8120 e = p + PyUnicode_GET_SIZE(self); 8121 for (; p < e; p++) { 8122 if (!Py_UNICODE_ISPRINTABLE(*p)) { 8123 Py_RETURN_FALSE; 8124 } 8125 } 8126 Py_RETURN_TRUE; 8127} 8128 8129PyDoc_STRVAR(join__doc__, 8130 "S.join(iterable) -> str\n\ 8131\n\ 8132Return a string which is the concatenation of the strings in the\n\ 8133iterable. The separator between elements is S."); 8134 8135static PyObject* 8136unicode_join(PyObject *self, PyObject *data) 8137{ 8138 return PyUnicode_Join(self, data); 8139} 8140 8141static Py_ssize_t 8142unicode_length(PyUnicodeObject *self) 8143{ 8144 return self->length; 8145} 8146 8147PyDoc_STRVAR(ljust__doc__, 8148 "S.ljust(width[, fillchar]) -> str\n\ 8149\n\ 8150Return S left-justified in a Unicode string of length width. 
Padding is\n\ 8151done using the specified fill character (default is a space)."); 8152 8153static PyObject * 8154unicode_ljust(PyUnicodeObject *self, PyObject *args) 8155{ 8156 Py_ssize_t width; 8157 Py_UNICODE fillchar = ' '; 8158 8159 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 8160 return NULL; 8161 8162 if (self->length >= width && PyUnicode_CheckExact(self)) { 8163 Py_INCREF(self); 8164 return (PyObject*) self; 8165 } 8166 8167 return (PyObject*) pad(self, 0, width - self->length, fillchar); 8168} 8169 8170PyDoc_STRVAR(lower__doc__, 8171 "S.lower() -> str\n\ 8172\n\ 8173Return a copy of the string S converted to lowercase."); 8174 8175static PyObject* 8176unicode_lower(PyUnicodeObject *self) 8177{ 8178 return fixup(self, fixlower); 8179} 8180 8181#define LEFTSTRIP 0 8182#define RIGHTSTRIP 1 8183#define BOTHSTRIP 2 8184 8185/* Arrays indexed by above */ 8186static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 8187 8188#define STRIPNAME(i) (stripformat[i]+3) 8189 8190/* externally visible for str.strip(unicode) */ 8191PyObject * 8192_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 8193{ 8194 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8195 Py_ssize_t len = PyUnicode_GET_SIZE(self); 8196 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 8197 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 8198 Py_ssize_t i, j; 8199 8200 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 8201 8202 i = 0; 8203 if (striptype != RIGHTSTRIP) { 8204 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 8205 i++; 8206 } 8207 } 8208 8209 j = len; 8210 if (striptype != LEFTSTRIP) { 8211 do { 8212 j--; 8213 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 8214 j++; 8215 } 8216 8217 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8218 Py_INCREF(self); 8219 return (PyObject*)self; 8220 } 8221 else 8222 return PyUnicode_FromUnicode(s+i, j-i); 8223} 8224 8225 8226static PyObject * 
8227do_strip(PyUnicodeObject *self, int striptype) 8228{ 8229 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8230 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 8231 8232 i = 0; 8233 if (striptype != RIGHTSTRIP) { 8234 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 8235 i++; 8236 } 8237 } 8238 8239 j = len; 8240 if (striptype != LEFTSTRIP) { 8241 do { 8242 j--; 8243 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 8244 j++; 8245 } 8246 8247 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8248 Py_INCREF(self); 8249 return (PyObject*)self; 8250 } 8251 else 8252 return PyUnicode_FromUnicode(s+i, j-i); 8253} 8254 8255 8256static PyObject * 8257do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 8258{ 8259 PyObject *sep = NULL; 8260 8261 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 8262 return NULL; 8263 8264 if (sep != NULL && sep != Py_None) { 8265 if (PyUnicode_Check(sep)) 8266 return _PyUnicode_XStrip(self, striptype, sep); 8267 else { 8268 PyErr_Format(PyExc_TypeError, 8269 "%s arg must be None or str", 8270 STRIPNAME(striptype)); 8271 return NULL; 8272 } 8273 } 8274 8275 return do_strip(self, striptype); 8276} 8277 8278 8279PyDoc_STRVAR(strip__doc__, 8280 "S.strip([chars]) -> str\n\ 8281\n\ 8282Return a copy of the string S with leading and trailing\n\ 8283whitespace removed.\n\ 8284If chars is given and not None, remove characters in chars instead."); 8285 8286static PyObject * 8287unicode_strip(PyUnicodeObject *self, PyObject *args) 8288{ 8289 if (PyTuple_GET_SIZE(args) == 0) 8290 return do_strip(self, BOTHSTRIP); /* Common case */ 8291 else 8292 return do_argstrip(self, BOTHSTRIP, args); 8293} 8294 8295 8296PyDoc_STRVAR(lstrip__doc__, 8297 "S.lstrip([chars]) -> str\n\ 8298\n\ 8299Return a copy of the string S with leading whitespace removed.\n\ 8300If chars is given and not None, remove characters in chars instead."); 8301 8302static PyObject * 8303unicode_lstrip(PyUnicodeObject *self, PyObject *args) 8304{ 8305 if 
(PyTuple_GET_SIZE(args) == 0) 8306 return do_strip(self, LEFTSTRIP); /* Common case */ 8307 else 8308 return do_argstrip(self, LEFTSTRIP, args); 8309} 8310 8311 8312PyDoc_STRVAR(rstrip__doc__, 8313 "S.rstrip([chars]) -> str\n\ 8314\n\ 8315Return a copy of the string S with trailing whitespace removed.\n\ 8316If chars is given and not None, remove characters in chars instead."); 8317 8318static PyObject * 8319unicode_rstrip(PyUnicodeObject *self, PyObject *args) 8320{ 8321 if (PyTuple_GET_SIZE(args) == 0) 8322 return do_strip(self, RIGHTSTRIP); /* Common case */ 8323 else 8324 return do_argstrip(self, RIGHTSTRIP, args); 8325} 8326 8327 8328static PyObject* 8329unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 8330{ 8331 PyUnicodeObject *u; 8332 Py_UNICODE *p; 8333 Py_ssize_t nchars; 8334 size_t nbytes; 8335 8336 if (len < 1) { 8337 Py_INCREF(unicode_empty); 8338 return (PyObject *)unicode_empty; 8339 } 8340 8341 if (len == 1 && PyUnicode_CheckExact(str)) { 8342 /* no repeat, return original string */ 8343 Py_INCREF(str); 8344 return (PyObject*) str; 8345 } 8346 8347 /* ensure # of chars needed doesn't overflow int and # of bytes 8348 * needed doesn't overflow size_t 8349 */ 8350 nchars = len * str->length; 8351 if (nchars / len != str->length) { 8352 PyErr_SetString(PyExc_OverflowError, 8353 "repeated string is too long"); 8354 return NULL; 8355 } 8356 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 8357 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 8358 PyErr_SetString(PyExc_OverflowError, 8359 "repeated string is too long"); 8360 return NULL; 8361 } 8362 u = _PyUnicode_New(nchars); 8363 if (!u) 8364 return NULL; 8365 8366 p = u->str; 8367 8368 if (str->length == 1) { 8369 Py_UNICODE_FILL(p, str->str[0], len); 8370 } else { 8371 Py_ssize_t done = str->length; /* number of characters copied this far */ 8372 Py_UNICODE_COPY(p, str->str, str->length); 8373 while (done < nchars) { 8374 Py_ssize_t n = (done <= nchars-done) ? 
done : nchars-done; 8375 Py_UNICODE_COPY(p+done, p, n); 8376 done += n; 8377 } 8378 } 8379 8380 return (PyObject*) u; 8381} 8382 8383PyObject * 8384PyUnicode_Replace(PyObject *obj, 8385 PyObject *subobj, 8386 PyObject *replobj, 8387 Py_ssize_t maxcount) 8388{ 8389 PyObject *self; 8390 PyObject *str1; 8391 PyObject *str2; 8392 PyObject *result; 8393 8394 self = PyUnicode_FromObject(obj); 8395 if (self == NULL) 8396 return NULL; 8397 str1 = PyUnicode_FromObject(subobj); 8398 if (str1 == NULL) { 8399 Py_DECREF(self); 8400 return NULL; 8401 } 8402 str2 = PyUnicode_FromObject(replobj); 8403 if (str2 == NULL) { 8404 Py_DECREF(self); 8405 Py_DECREF(str1); 8406 return NULL; 8407 } 8408 result = replace((PyUnicodeObject *)self, 8409 (PyUnicodeObject *)str1, 8410 (PyUnicodeObject *)str2, 8411 maxcount); 8412 Py_DECREF(self); 8413 Py_DECREF(str1); 8414 Py_DECREF(str2); 8415 return result; 8416} 8417 8418PyDoc_STRVAR(replace__doc__, 8419 "S.replace(old, new[, count]) -> str\n\ 8420\n\ 8421Return a copy of S with all occurrences of substring\n\ 8422old replaced by new. 
If the optional argument count is\n\ 8423given, only the first count occurrences are replaced."); 8424 8425static PyObject* 8426unicode_replace(PyUnicodeObject *self, PyObject *args) 8427{ 8428 PyUnicodeObject *str1; 8429 PyUnicodeObject *str2; 8430 Py_ssize_t maxcount = -1; 8431 PyObject *result; 8432 8433 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8434 return NULL; 8435 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8436 if (str1 == NULL) 8437 return NULL; 8438 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8439 if (str2 == NULL) { 8440 Py_DECREF(str1); 8441 return NULL; 8442 } 8443 8444 result = replace(self, str1, str2, maxcount); 8445 8446 Py_DECREF(str1); 8447 Py_DECREF(str2); 8448 return result; 8449} 8450 8451static PyObject * 8452unicode_repr(PyObject *unicode) 8453{ 8454 PyObject *repr; 8455 Py_UNICODE *p; 8456 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8457 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8458 8459 /* XXX(nnorwitz): rather than over-allocating, it would be 8460 better to choose a different scheme. Perhaps scan the 8461 first N-chars of the string and allocate based on that size. 8462 */ 8463 /* Initial allocation is based on the longest-possible unichr 8464 escape. 8465 8466 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8467 unichr, so in this case it's the longest unichr escape. In 8468 narrow (UTF-16) builds this is five chars per source unichr 8469 since there are two unichrs in the surrogate pair, so in narrow 8470 (UTF-16) builds it's not the longest unichr escape. 8471 8472 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8473 so in the narrow (UTF-16) build case it's the longest unichr 8474 escape. 
8475 */ 8476 8477 repr = PyUnicode_FromUnicode(NULL, 8478 2 /* quotes */ 8479#ifdef Py_UNICODE_WIDE 8480 + 10*size 8481#else 8482 + 6*size 8483#endif 8484 + 1); 8485 if (repr == NULL) 8486 return NULL; 8487 8488 p = PyUnicode_AS_UNICODE(repr); 8489 8490 /* Add quote */ 8491 *p++ = (findchar(s, size, '\'') && 8492 !findchar(s, size, '"')) ? '"' : '\''; 8493 while (size-- > 0) { 8494 Py_UNICODE ch = *s++; 8495 8496 /* Escape quotes and backslashes */ 8497 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8498 *p++ = '\\'; 8499 *p++ = ch; 8500 continue; 8501 } 8502 8503 /* Map special whitespace to '\t', \n', '\r' */ 8504 if (ch == '\t') { 8505 *p++ = '\\'; 8506 *p++ = 't'; 8507 } 8508 else if (ch == '\n') { 8509 *p++ = '\\'; 8510 *p++ = 'n'; 8511 } 8512 else if (ch == '\r') { 8513 *p++ = '\\'; 8514 *p++ = 'r'; 8515 } 8516 8517 /* Map non-printable US ASCII to '\xhh' */ 8518 else if (ch < ' ' || ch == 0x7F) { 8519 *p++ = '\\'; 8520 *p++ = 'x'; 8521 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8522 *p++ = hexdigits[ch & 0x000F]; 8523 } 8524 8525 /* Copy ASCII characters as-is */ 8526 else if (ch < 0x7F) { 8527 *p++ = ch; 8528 } 8529 8530 /* Non-ASCII characters */ 8531 else { 8532 Py_UCS4 ucs = ch; 8533 8534#ifndef Py_UNICODE_WIDE 8535 Py_UNICODE ch2 = 0; 8536 /* Get code point from surrogate pair */ 8537 if (size > 0) { 8538 ch2 = *s; 8539 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8540 && ch2 <= 0xDFFF) { 8541 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8542 + 0x00010000; 8543 s++; 8544 size--; 8545 } 8546 } 8547#endif 8548 /* Map Unicode whitespace and control characters 8549 (categories Z* and C* except ASCII space) 8550 */ 8551 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8552 /* Map 8-bit characters to '\xhh' */ 8553 if (ucs <= 0xff) { 8554 *p++ = '\\'; 8555 *p++ = 'x'; 8556 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8557 *p++ = hexdigits[ch & 0x000F]; 8558 } 8559 /* Map 21-bit characters to '\U00xxxxxx' */ 8560 else if (ucs >= 0x10000) { 8561 *p++ = '\\'; 8562 
*p++ = 'U'; 8563 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8564 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8565 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8566 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8567 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8568 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8569 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8570 *p++ = hexdigits[ucs & 0x0000000F]; 8571 } 8572 /* Map 16-bit characters to '\uxxxx' */ 8573 else { 8574 *p++ = '\\'; 8575 *p++ = 'u'; 8576 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8577 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8578 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8579 *p++ = hexdigits[ucs & 0x000F]; 8580 } 8581 } 8582 /* Copy characters as-is */ 8583 else { 8584 *p++ = ch; 8585#ifndef Py_UNICODE_WIDE 8586 if (ucs >= 0x10000) 8587 *p++ = ch2; 8588#endif 8589 } 8590 } 8591 } 8592 /* Add quote */ 8593 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8594 8595 *p = '\0'; 8596 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8597 return repr; 8598} 8599 8600PyDoc_STRVAR(rfind__doc__, 8601 "S.rfind(sub[, start[, end]]) -> int\n\ 8602\n\ 8603Return the highest index in S where substring sub is found,\n\ 8604such that sub is contained within S[start:end]. 
Optional\n\ 8605arguments start and end are interpreted as in slice notation.\n\ 8606\n\ 8607Return -1 on failure."); 8608 8609static PyObject * 8610unicode_rfind(PyUnicodeObject *self, PyObject *args) 8611{ 8612 PyUnicodeObject *substring; 8613 Py_ssize_t start; 8614 Py_ssize_t end; 8615 Py_ssize_t result; 8616 8617 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 8618 &start, &end)) 8619 return NULL; 8620 8621 result = stringlib_rfind_slice( 8622 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8623 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8624 start, end 8625 ); 8626 8627 Py_DECREF(substring); 8628 8629 return PyLong_FromSsize_t(result); 8630} 8631 8632PyDoc_STRVAR(rindex__doc__, 8633 "S.rindex(sub[, start[, end]]) -> int\n\ 8634\n\ 8635Like S.rfind() but raise ValueError when the substring is not found."); 8636 8637static PyObject * 8638unicode_rindex(PyUnicodeObject *self, PyObject *args) 8639{ 8640 PyUnicodeObject *substring; 8641 Py_ssize_t start; 8642 Py_ssize_t end; 8643 Py_ssize_t result; 8644 8645 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 8646 &start, &end)) 8647 return NULL; 8648 8649 result = stringlib_rfind_slice( 8650 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8651 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8652 start, end 8653 ); 8654 8655 Py_DECREF(substring); 8656 8657 if (result < 0) { 8658 PyErr_SetString(PyExc_ValueError, "substring not found"); 8659 return NULL; 8660 } 8661 return PyLong_FromSsize_t(result); 8662} 8663 8664PyDoc_STRVAR(rjust__doc__, 8665 "S.rjust(width[, fillchar]) -> str\n\ 8666\n\ 8667Return S right-justified in a string of length width. 
Padding is\n\ 8668done using the specified fill character (default is a space)."); 8669 8670static PyObject * 8671unicode_rjust(PyUnicodeObject *self, PyObject *args) 8672{ 8673 Py_ssize_t width; 8674 Py_UNICODE fillchar = ' '; 8675 8676 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8677 return NULL; 8678 8679 if (self->length >= width && PyUnicode_CheckExact(self)) { 8680 Py_INCREF(self); 8681 return (PyObject*) self; 8682 } 8683 8684 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8685} 8686 8687PyObject * 8688PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 8689{ 8690 PyObject *result; 8691 8692 s = PyUnicode_FromObject(s); 8693 if (s == NULL) 8694 return NULL; 8695 if (sep != NULL) { 8696 sep = PyUnicode_FromObject(sep); 8697 if (sep == NULL) { 8698 Py_DECREF(s); 8699 return NULL; 8700 } 8701 } 8702 8703 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8704 8705 Py_DECREF(s); 8706 Py_XDECREF(sep); 8707 return result; 8708} 8709 8710PyDoc_STRVAR(split__doc__, 8711 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8712\n\ 8713Return a list of the words in S, using sep as the\n\ 8714delimiter string. If maxsplit is given, at most maxsplit\n\ 8715splits are done. 
If sep is not specified or is None, any\n\ 8716whitespace string is a separator and empty strings are\n\ 8717removed from the result."); 8718 8719static PyObject* 8720unicode_split(PyUnicodeObject *self, PyObject *args) 8721{ 8722 PyObject *substring = Py_None; 8723 Py_ssize_t maxcount = -1; 8724 8725 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8726 return NULL; 8727 8728 if (substring == Py_None) 8729 return split(self, NULL, maxcount); 8730 else if (PyUnicode_Check(substring)) 8731 return split(self, (PyUnicodeObject *)substring, maxcount); 8732 else 8733 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8734} 8735 8736PyObject * 8737PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8738{ 8739 PyObject* str_obj; 8740 PyObject* sep_obj; 8741 PyObject* out; 8742 8743 str_obj = PyUnicode_FromObject(str_in); 8744 if (!str_obj) 8745 return NULL; 8746 sep_obj = PyUnicode_FromObject(sep_in); 8747 if (!sep_obj) { 8748 Py_DECREF(str_obj); 8749 return NULL; 8750 } 8751 8752 out = stringlib_partition( 8753 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8754 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8755 ); 8756 8757 Py_DECREF(sep_obj); 8758 Py_DECREF(str_obj); 8759 8760 return out; 8761} 8762 8763 8764PyObject * 8765PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8766{ 8767 PyObject* str_obj; 8768 PyObject* sep_obj; 8769 PyObject* out; 8770 8771 str_obj = PyUnicode_FromObject(str_in); 8772 if (!str_obj) 8773 return NULL; 8774 sep_obj = PyUnicode_FromObject(sep_in); 8775 if (!sep_obj) { 8776 Py_DECREF(str_obj); 8777 return NULL; 8778 } 8779 8780 out = stringlib_rpartition( 8781 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8782 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8783 ); 8784 8785 Py_DECREF(sep_obj); 8786 Py_DECREF(str_obj); 8787 8788 return out; 8789} 8790 8791PyDoc_STRVAR(partition__doc__, 8792 "S.partition(sep) -> (head, 
sep, tail)\n\ 8793\n\ 8794Search for the separator sep in S, and return the part before it,\n\ 8795the separator itself, and the part after it. If the separator is not\n\ 8796found, return S and two empty strings."); 8797 8798static PyObject* 8799unicode_partition(PyUnicodeObject *self, PyObject *separator) 8800{ 8801 return PyUnicode_Partition((PyObject *)self, separator); 8802} 8803 8804PyDoc_STRVAR(rpartition__doc__, 8805 "S.rpartition(sep) -> (head, sep, tail)\n\ 8806\n\ 8807Search for the separator sep in S, starting at the end of S, and return\n\ 8808the part before it, the separator itself, and the part after it. If the\n\ 8809separator is not found, return two empty strings and S."); 8810 8811static PyObject* 8812unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8813{ 8814 return PyUnicode_RPartition((PyObject *)self, separator); 8815} 8816 8817PyObject * 8818PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 8819{ 8820 PyObject *result; 8821 8822 s = PyUnicode_FromObject(s); 8823 if (s == NULL) 8824 return NULL; 8825 if (sep != NULL) { 8826 sep = PyUnicode_FromObject(sep); 8827 if (sep == NULL) { 8828 Py_DECREF(s); 8829 return NULL; 8830 } 8831 } 8832 8833 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8834 8835 Py_DECREF(s); 8836 Py_XDECREF(sep); 8837 return result; 8838} 8839 8840PyDoc_STRVAR(rsplit__doc__, 8841 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8842\n\ 8843Return a list of the words in S, using sep as the\n\ 8844delimiter string, starting at the end of the string and\n\ 8845working to the front. If maxsplit is given, at most maxsplit\n\ 8846splits are done. 
If sep is not specified, any whitespace string\n\ 8847is a separator."); 8848 8849static PyObject* 8850unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8851{ 8852 PyObject *substring = Py_None; 8853 Py_ssize_t maxcount = -1; 8854 8855 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8856 return NULL; 8857 8858 if (substring == Py_None) 8859 return rsplit(self, NULL, maxcount); 8860 else if (PyUnicode_Check(substring)) 8861 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8862 else 8863 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8864} 8865 8866PyDoc_STRVAR(splitlines__doc__, 8867 "S.splitlines([keepends]) -> list of strings\n\ 8868\n\ 8869Return a list of the lines in S, breaking at line boundaries.\n\ 8870Line breaks are not included in the resulting list unless keepends\n\ 8871is given and true."); 8872 8873static PyObject* 8874unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8875{ 8876 int keepends = 0; 8877 8878 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8879 return NULL; 8880 8881 return PyUnicode_Splitlines((PyObject *)self, keepends); 8882} 8883 8884static 8885PyObject *unicode_str(PyObject *self) 8886{ 8887 if (PyUnicode_CheckExact(self)) { 8888 Py_INCREF(self); 8889 return self; 8890 } else 8891 /* Subtype -- return genuine unicode string with the same value. 
*/ 8892 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8893 PyUnicode_GET_SIZE(self)); 8894} 8895 8896PyDoc_STRVAR(swapcase__doc__, 8897 "S.swapcase() -> str\n\ 8898\n\ 8899Return a copy of S with uppercase characters converted to lowercase\n\ 8900and vice versa."); 8901 8902static PyObject* 8903unicode_swapcase(PyUnicodeObject *self) 8904{ 8905 return fixup(self, fixswapcase); 8906} 8907 8908PyDoc_STRVAR(maketrans__doc__, 8909 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8910\n\ 8911Return a translation table usable for str.translate().\n\ 8912If there is only one argument, it must be a dictionary mapping Unicode\n\ 8913ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8914Character keys will be then converted to ordinals.\n\ 8915If there are two arguments, they must be strings of equal length, and\n\ 8916in the resulting dictionary, each character in x will be mapped to the\n\ 8917character at the same position in y. If there is a third argument, it\n\ 8918must be a string, whose characters will be mapped to None in the result."); 8919 8920static PyObject* 8921unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8922{ 8923 PyObject *x, *y = NULL, *z = NULL; 8924 PyObject *new = NULL, *key, *value; 8925 Py_ssize_t i = 0; 8926 int res; 8927 8928 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8929 return NULL; 8930 new = PyDict_New(); 8931 if (!new) 8932 return NULL; 8933 if (y != NULL) { 8934 /* x must be a string too, of equal length */ 8935 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8936 if (!PyUnicode_Check(x)) { 8937 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8938 "be a string if there is a second argument"); 8939 goto err; 8940 } 8941 if (PyUnicode_GET_SIZE(x) != ylen) { 8942 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8943 "arguments must have equal length"); 8944 goto err; 8945 } 8946 /* create entries for translating chars in x to those in y */ 8947 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 8948 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8949 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8950 if (!key || !value) 8951 goto err; 8952 res = PyDict_SetItem(new, key, value); 8953 Py_DECREF(key); 8954 Py_DECREF(value); 8955 if (res < 0) 8956 goto err; 8957 } 8958 /* create entries for deleting chars in z */ 8959 if (z != NULL) { 8960 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8961 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8962 if (!key) 8963 goto err; 8964 res = PyDict_SetItem(new, key, Py_None); 8965 Py_DECREF(key); 8966 if (res < 0) 8967 goto err; 8968 } 8969 } 8970 } else { 8971 /* x must be a dict */ 8972 if (!PyDict_CheckExact(x)) { 8973 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8974 "to maketrans it must be a dict"); 8975 goto err; 8976 } 8977 /* copy entries into the new dict, converting string keys to int keys */ 8978 while (PyDict_Next(x, &i, &key, &value)) { 8979 if (PyUnicode_Check(key)) { 8980 /* convert string keys to integer keys */ 8981 PyObject *newkey; 8982 if (PyUnicode_GET_SIZE(key) != 1) { 8983 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8984 "table must be of length 1"); 8985 goto err; 8986 } 8987 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8988 if (!newkey) 8989 goto err; 8990 res = PyDict_SetItem(new, newkey, value); 8991 Py_DECREF(newkey); 8992 if (res < 0) 8993 goto err; 8994 } else if (PyLong_Check(key)) { 8995 /* just keep integer keys */ 8996 if (PyDict_SetItem(new, key, value) < 0) 8997 goto err; 8998 } else { 8999 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 9000 "be strings or integers"); 9001 goto err; 9002 } 9003 } 9004 } 9005 return new; 9006 err: 9007 Py_DECREF(new); 9008 return NULL; 9009} 9010 9011PyDoc_STRVAR(translate__doc__, 9012 "S.translate(table) -> str\n\ 9013\n\ 9014Return a copy of the string S, where all characters have been mapped\n\ 9015through the given translation table, 
which must be a mapping of\n\ 9016Unicode ordinals to Unicode ordinals, strings, or None.\n\ 9017Unmapped characters are left untouched. Characters mapped to None\n\ 9018are deleted."); 9019 9020static PyObject* 9021unicode_translate(PyUnicodeObject *self, PyObject *table) 9022{ 9023 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 9024} 9025 9026PyDoc_STRVAR(upper__doc__, 9027 "S.upper() -> str\n\ 9028\n\ 9029Return a copy of S converted to uppercase."); 9030 9031static PyObject* 9032unicode_upper(PyUnicodeObject *self) 9033{ 9034 return fixup(self, fixupper); 9035} 9036 9037PyDoc_STRVAR(zfill__doc__, 9038 "S.zfill(width) -> str\n\ 9039\n\ 9040Pad a numeric string S with zeros on the left, to fill a field\n\ 9041of the specified width. The string S is never truncated."); 9042 9043static PyObject * 9044unicode_zfill(PyUnicodeObject *self, PyObject *args) 9045{ 9046 Py_ssize_t fill; 9047 PyUnicodeObject *u; 9048 9049 Py_ssize_t width; 9050 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 9051 return NULL; 9052 9053 if (self->length >= width) { 9054 if (PyUnicode_CheckExact(self)) { 9055 Py_INCREF(self); 9056 return (PyObject*) self; 9057 } 9058 else 9059 return PyUnicode_FromUnicode( 9060 PyUnicode_AS_UNICODE(self), 9061 PyUnicode_GET_SIZE(self) 9062 ); 9063 } 9064 9065 fill = width - self->length; 9066 9067 u = pad(self, fill, 0, '0'); 9068 9069 if (u == NULL) 9070 return NULL; 9071 9072 if (u->str[fill] == '+' || u->str[fill] == '-') { 9073 /* move sign to beginning of string */ 9074 u->str[0] = u->str[fill]; 9075 u->str[fill] = '0'; 9076 } 9077 9078 return (PyObject*) u; 9079} 9080 9081#if 0 9082static PyObject* 9083unicode_freelistsize(PyUnicodeObject *self) 9084{ 9085 return PyLong_FromLong(numfree); 9086} 9087 9088static PyObject * 9089unicode__decimal2ascii(PyObject *self) 9090{ 9091 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), 9092 PyUnicode_GET_SIZE(self)); 9093} 9094#endif 9095 
9096PyDoc_STRVAR(startswith__doc__, 9097 "S.startswith(prefix[, start[, end]]) -> bool\n\ 9098\n\ 9099Return True if S starts with the specified prefix, False otherwise.\n\ 9100With optional start, test S beginning at that position.\n\ 9101With optional end, stop comparing S at that position.\n\ 9102prefix can also be a tuple of strings to try."); 9103 9104static PyObject * 9105unicode_startswith(PyUnicodeObject *self, 9106 PyObject *args) 9107{ 9108 PyObject *subobj; 9109 PyUnicodeObject *substring; 9110 Py_ssize_t start = 0; 9111 Py_ssize_t end = PY_SSIZE_T_MAX; 9112 int result; 9113 9114 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 9115 return NULL; 9116 if (PyTuple_Check(subobj)) { 9117 Py_ssize_t i; 9118 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9119 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9120 PyTuple_GET_ITEM(subobj, i)); 9121 if (substring == NULL) 9122 return NULL; 9123 result = tailmatch(self, substring, start, end, -1); 9124 Py_DECREF(substring); 9125 if (result) { 9126 Py_RETURN_TRUE; 9127 } 9128 } 9129 /* nothing matched */ 9130 Py_RETURN_FALSE; 9131 } 9132 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9133 if (substring == NULL) { 9134 if (PyErr_ExceptionMatches(PyExc_TypeError)) 9135 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 9136 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 9137 return NULL; 9138 } 9139 result = tailmatch(self, substring, start, end, -1); 9140 Py_DECREF(substring); 9141 return PyBool_FromLong(result); 9142} 9143 9144 9145PyDoc_STRVAR(endswith__doc__, 9146 "S.endswith(suffix[, start[, end]]) -> bool\n\ 9147\n\ 9148Return True if S ends with the specified suffix, False otherwise.\n\ 9149With optional start, test S beginning at that position.\n\ 9150With optional end, stop comparing S at that position.\n\ 9151suffix can also be a tuple of strings to try."); 9152 9153static PyObject * 9154unicode_endswith(PyUnicodeObject *self, 9155 
PyObject *args) 9156{ 9157 PyObject *subobj; 9158 PyUnicodeObject *substring; 9159 Py_ssize_t start = 0; 9160 Py_ssize_t end = PY_SSIZE_T_MAX; 9161 int result; 9162 9163 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 9164 return NULL; 9165 if (PyTuple_Check(subobj)) { 9166 Py_ssize_t i; 9167 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9168 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9169 PyTuple_GET_ITEM(subobj, i)); 9170 if (substring == NULL) 9171 return NULL; 9172 result = tailmatch(self, substring, start, end, +1); 9173 Py_DECREF(substring); 9174 if (result) { 9175 Py_RETURN_TRUE; 9176 } 9177 } 9178 Py_RETURN_FALSE; 9179 } 9180 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9181 if (substring == NULL) { 9182 if (PyErr_ExceptionMatches(PyExc_TypeError)) 9183 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 9184 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 9185 return NULL; 9186 } 9187 result = tailmatch(self, substring, start, end, +1); 9188 Py_DECREF(substring); 9189 return PyBool_FromLong(result); 9190} 9191 9192#include "stringlib/string_format.h" 9193 9194PyDoc_STRVAR(format__doc__, 9195 "S.format(*args, **kwargs) -> str\n\ 9196\n\ 9197Return a formatted version of S, using substitutions from args and kwargs.\n\ 9198The substitutions are identified by braces ('{' and '}')."); 9199 9200PyDoc_STRVAR(format_map__doc__, 9201 "S.format_map(mapping) -> str\n\ 9202\n\ 9203Return a formatted version of S, using substitutions from mapping.\n\ 9204The substitutions are identified by braces ('{' and '}')."); 9205 9206static PyObject * 9207unicode__format__(PyObject* self, PyObject* args) 9208{ 9209 PyObject *format_spec; 9210 9211 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 9212 return NULL; 9213 9214 return _PyUnicode_FormatAdvanced(self, 9215 PyUnicode_AS_UNICODE(format_spec), 9216 PyUnicode_GET_SIZE(format_spec)); 9217} 9218 9219PyDoc_STRVAR(p_format__doc__, 9220 
"S.__format__(format_spec) -> str\n\ 9221\n\ 9222Return a formatted version of S as described by format_spec."); 9223 9224static PyObject * 9225unicode__sizeof__(PyUnicodeObject *v) 9226{ 9227 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 9228 sizeof(Py_UNICODE) * (v->length + 1)); 9229} 9230 9231PyDoc_STRVAR(sizeof__doc__, 9232 "S.__sizeof__() -> size of S in memory, in bytes"); 9233 9234static PyObject * 9235unicode_getnewargs(PyUnicodeObject *v) 9236{ 9237 return Py_BuildValue("(u#)", v->str, v->length); 9238} 9239 9240static PyMethodDef unicode_methods[] = { 9241 9242 /* Order is according to common usage: often used methods should 9243 appear first, since lookup is done sequentially. */ 9244 9245 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 9246 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 9247 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 9248 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 9249 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 9250 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 9251 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 9252 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 9253 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 9254 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 9255 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 9256 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 9257 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 9258 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 9259 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 9260 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 9261 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, 
rfind__doc__}, 9262 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 9263 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 9264 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 9265 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 9266 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 9267 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 9268 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 9269 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 9270 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 9271 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 9272 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 9273 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 9274 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 9275 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 9276 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 9277 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 9278 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 9279 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 9280 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 9281 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 9282 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 9283 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 9284 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 9285 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 9286 {"format_map", (PyCFunction) 
do_string_format_map, METH_O, format_map__doc__}, 9287 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 9288 {"maketrans", (PyCFunction) unicode_maketrans, 9289 METH_VARARGS | METH_STATIC, maketrans__doc__}, 9290 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 9291#if 0 9292 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 9293#endif 9294 9295#if 0 9296 /* These methods are just used for debugging the implementation. */ 9297 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 9298 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 9299#endif 9300 9301 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 9302 {NULL, NULL} 9303}; 9304 9305static PyObject * 9306unicode_mod(PyObject *v, PyObject *w) 9307{ 9308 if (!PyUnicode_Check(v)) 9309 Py_RETURN_NOTIMPLEMENTED; 9310 return PyUnicode_Format(v, w); 9311} 9312 9313static PyNumberMethods unicode_as_number = { 9314 0, /*nb_add*/ 9315 0, /*nb_subtract*/ 9316 0, /*nb_multiply*/ 9317 unicode_mod, /*nb_remainder*/ 9318}; 9319 9320static PySequenceMethods unicode_as_sequence = { 9321 (lenfunc) unicode_length, /* sq_length */ 9322 PyUnicode_Concat, /* sq_concat */ 9323 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 9324 (ssizeargfunc) unicode_getitem, /* sq_item */ 9325 0, /* sq_slice */ 9326 0, /* sq_ass_item */ 9327 0, /* sq_ass_slice */ 9328 PyUnicode_Contains, /* sq_contains */ 9329}; 9330 9331static PyObject* 9332unicode_subscript(PyUnicodeObject* self, PyObject* item) 9333{ 9334 if (PyIndex_Check(item)) { 9335 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 9336 if (i == -1 && PyErr_Occurred()) 9337 return NULL; 9338 if (i < 0) 9339 i += PyUnicode_GET_SIZE(self); 9340 return unicode_getitem(self, i); 9341 } else if (PySlice_Check(item)) { 9342 Py_ssize_t start, stop, step, slicelength, cur, i; 9343 Py_UNICODE* source_buf; 9344 Py_UNICODE* result_buf; 9345 PyObject* result; 
9346 9347 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self), 9348 &start, &stop, &step, &slicelength) < 0) { 9349 return NULL; 9350 } 9351 9352 if (slicelength <= 0) { 9353 return PyUnicode_FromUnicode(NULL, 0); 9354 } else if (start == 0 && step == 1 && slicelength == self->length && 9355 PyUnicode_CheckExact(self)) { 9356 Py_INCREF(self); 9357 return (PyObject *)self; 9358 } else if (step == 1) { 9359 return PyUnicode_FromUnicode(self->str + start, slicelength); 9360 } else { 9361 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 9362 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 9363 sizeof(Py_UNICODE)); 9364 9365 if (result_buf == NULL) 9366 return PyErr_NoMemory(); 9367 9368 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 9369 result_buf[i] = source_buf[cur]; 9370 } 9371 9372 result = PyUnicode_FromUnicode(result_buf, slicelength); 9373 PyObject_FREE(result_buf); 9374 return result; 9375 } 9376 } else { 9377 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 9378 return NULL; 9379 } 9380} 9381 9382static PyMappingMethods unicode_as_mapping = { 9383 (lenfunc)unicode_length, /* mp_length */ 9384 (binaryfunc)unicode_subscript, /* mp_subscript */ 9385 (objobjargproc)0, /* mp_ass_subscript */ 9386}; 9387 9388 9389/* Helpers for PyUnicode_Format() */ 9390 9391static PyObject * 9392getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 9393{ 9394 Py_ssize_t argidx = *p_argidx; 9395 if (argidx < arglen) { 9396 (*p_argidx)++; 9397 if (arglen < 0) 9398 return args; 9399 else 9400 return PyTuple_GetItem(args, argidx); 9401 } 9402 PyErr_SetString(PyExc_TypeError, 9403 "not enough arguments for format string"); 9404 return NULL; 9405} 9406 9407/* Returns a new reference to a PyUnicode object, or NULL on failure. 
*/ 9408 9409static PyObject * 9410formatfloat(PyObject *v, int flags, int prec, int type) 9411{ 9412 char *p; 9413 PyObject *result; 9414 double x; 9415 9416 x = PyFloat_AsDouble(v); 9417 if (x == -1.0 && PyErr_Occurred()) 9418 return NULL; 9419 9420 if (prec < 0) 9421 prec = 6; 9422 9423 p = PyOS_double_to_string(x, type, prec, 9424 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 9425 if (p == NULL) 9426 return NULL; 9427 result = PyUnicode_FromStringAndSize(p, strlen(p)); 9428 PyMem_Free(p); 9429 return result; 9430} 9431 9432static PyObject* 9433formatlong(PyObject *val, int flags, int prec, int type) 9434{ 9435 char *buf; 9436 int len; 9437 PyObject *str; /* temporary string object. */ 9438 PyObject *result; 9439 9440 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 9441 if (!str) 9442 return NULL; 9443 result = PyUnicode_FromStringAndSize(buf, len); 9444 Py_DECREF(str); 9445 return result; 9446} 9447 9448static int 9449formatchar(Py_UNICODE *buf, 9450 size_t buflen, 9451 PyObject *v) 9452{ 9453 /* presume that the buffer is at least 3 characters long */ 9454 if (PyUnicode_Check(v)) { 9455 if (PyUnicode_GET_SIZE(v) == 1) { 9456 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9457 buf[1] = '\0'; 9458 return 1; 9459 } 9460#ifndef Py_UNICODE_WIDE 9461 if (PyUnicode_GET_SIZE(v) == 2) { 9462 /* Decode a valid surrogate pair */ 9463 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9464 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9465 if (0xD800 <= c0 && c0 <= 0xDBFF && 9466 0xDC00 <= c1 && c1 <= 0xDFFF) { 9467 buf[0] = c0; 9468 buf[1] = c1; 9469 buf[2] = '\0'; 9470 return 2; 9471 } 9472 } 9473#endif 9474 goto onError; 9475 } 9476 else { 9477 /* Integer input truncated to a character */ 9478 long x; 9479 x = PyLong_AsLong(v); 9480 if (x == -1 && PyErr_Occurred()) 9481 goto onError; 9482 9483 if (x < 0 || x > 0x10ffff) { 9484 PyErr_SetString(PyExc_OverflowError, 9485 "%c arg not in range(0x110000)"); 9486 return -1; 9487 } 9488 9489#ifndef Py_UNICODE_WIDE 9490 if (x > 0xffff) { 9491 x -= 
0x10000; 9492 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9493 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9494 return 2; 9495 } 9496#endif 9497 buf[0] = (Py_UNICODE) x; 9498 buf[1] = '\0'; 9499 return 1; 9500 } 9501 9502 onError: 9503 PyErr_SetString(PyExc_TypeError, 9504 "%c requires int or char"); 9505 return -1; 9506} 9507 9508/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 9509 FORMATBUFLEN is the length of the buffer in which chars are formatted. 9510*/ 9511#define FORMATBUFLEN (size_t)10 9512 9513PyObject * 9514PyUnicode_Format(PyObject *format, PyObject *args) 9515{ 9516 Py_UNICODE *fmt, *res; 9517 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9518 int args_owned = 0; 9519 PyUnicodeObject *result = NULL; 9520 PyObject *dict = NULL; 9521 PyObject *uformat; 9522 9523 if (format == NULL || args == NULL) { 9524 PyErr_BadInternalCall(); 9525 return NULL; 9526 } 9527 uformat = PyUnicode_FromObject(format); 9528 if (uformat == NULL) 9529 return NULL; 9530 fmt = PyUnicode_AS_UNICODE(uformat); 9531 fmtcnt = PyUnicode_GET_SIZE(uformat); 9532 9533 reslen = rescnt = fmtcnt + 100; 9534 result = _PyUnicode_New(reslen); 9535 if (result == NULL) 9536 goto onError; 9537 res = PyUnicode_AS_UNICODE(result); 9538 9539 if (PyTuple_Check(args)) { 9540 arglen = PyTuple_Size(args); 9541 argidx = 0; 9542 } 9543 else { 9544 arglen = -1; 9545 argidx = -2; 9546 } 9547 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9548 !PyUnicode_Check(args)) 9549 dict = args; 9550 9551 while (--fmtcnt >= 0) { 9552 if (*fmt != '%') { 9553 if (--rescnt < 0) { 9554 rescnt = fmtcnt + 100; 9555 reslen += rescnt; 9556 if (_PyUnicode_Resize(&result, reslen) < 0) 9557 goto onError; 9558 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9559 --rescnt; 9560 } 9561 *res++ = *fmt++; 9562 } 9563 else { 9564 /* Got a format specifier */ 9565 int flags = 0; 9566 Py_ssize_t width = -1; 9567 int prec = -1; 9568 Py_UNICODE c = '\0'; 9569 Py_UNICODE fill; 9570 int isnumok; 
9571 PyObject *v = NULL; 9572 PyObject *temp = NULL; 9573 Py_UNICODE *pbuf; 9574 Py_UNICODE sign; 9575 Py_ssize_t len; 9576 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9577 9578 fmt++; 9579 if (*fmt == '(') { 9580 Py_UNICODE *keystart; 9581 Py_ssize_t keylen; 9582 PyObject *key; 9583 int pcount = 1; 9584 9585 if (dict == NULL) { 9586 PyErr_SetString(PyExc_TypeError, 9587 "format requires a mapping"); 9588 goto onError; 9589 } 9590 ++fmt; 9591 --fmtcnt; 9592 keystart = fmt; 9593 /* Skip over balanced parentheses */ 9594 while (pcount > 0 && --fmtcnt >= 0) { 9595 if (*fmt == ')') 9596 --pcount; 9597 else if (*fmt == '(') 9598 ++pcount; 9599 fmt++; 9600 } 9601 keylen = fmt - keystart - 1; 9602 if (fmtcnt < 0 || pcount > 0) { 9603 PyErr_SetString(PyExc_ValueError, 9604 "incomplete format key"); 9605 goto onError; 9606 } 9607#if 0 9608 /* keys are converted to strings using UTF-8 and 9609 then looked up since Python uses strings to hold 9610 variables names etc. in its namespaces and we 9611 wouldn't want to break common idioms. 
*/ 9612 key = PyUnicode_EncodeUTF8(keystart, 9613 keylen, 9614 NULL); 9615#else 9616 key = PyUnicode_FromUnicode(keystart, keylen); 9617#endif 9618 if (key == NULL) 9619 goto onError; 9620 if (args_owned) { 9621 Py_DECREF(args); 9622 args_owned = 0; 9623 } 9624 args = PyObject_GetItem(dict, key); 9625 Py_DECREF(key); 9626 if (args == NULL) { 9627 goto onError; 9628 } 9629 args_owned = 1; 9630 arglen = -1; 9631 argidx = -2; 9632 } 9633 while (--fmtcnt >= 0) { 9634 switch (c = *fmt++) { 9635 case '-': flags |= F_LJUST; continue; 9636 case '+': flags |= F_SIGN; continue; 9637 case ' ': flags |= F_BLANK; continue; 9638 case '#': flags |= F_ALT; continue; 9639 case '0': flags |= F_ZERO; continue; 9640 } 9641 break; 9642 } 9643 if (c == '*') { 9644 v = getnextarg(args, arglen, &argidx); 9645 if (v == NULL) 9646 goto onError; 9647 if (!PyLong_Check(v)) { 9648 PyErr_SetString(PyExc_TypeError, 9649 "* wants int"); 9650 goto onError; 9651 } 9652 width = PyLong_AsLong(v); 9653 if (width == -1 && PyErr_Occurred()) 9654 goto onError; 9655 if (width < 0) { 9656 flags |= F_LJUST; 9657 width = -width; 9658 } 9659 if (--fmtcnt >= 0) 9660 c = *fmt++; 9661 } 9662 else if (c >= '0' && c <= '9') { 9663 width = c - '0'; 9664 while (--fmtcnt >= 0) { 9665 c = *fmt++; 9666 if (c < '0' || c > '9') 9667 break; 9668 if ((width*10) / 10 != width) { 9669 PyErr_SetString(PyExc_ValueError, 9670 "width too big"); 9671 goto onError; 9672 } 9673 width = width*10 + (c - '0'); 9674 } 9675 } 9676 if (c == '.') { 9677 prec = 0; 9678 if (--fmtcnt >= 0) 9679 c = *fmt++; 9680 if (c == '*') { 9681 v = getnextarg(args, arglen, &argidx); 9682 if (v == NULL) 9683 goto onError; 9684 if (!PyLong_Check(v)) { 9685 PyErr_SetString(PyExc_TypeError, 9686 "* wants int"); 9687 goto onError; 9688 } 9689 prec = PyLong_AsLong(v); 9690 if (prec == -1 && PyErr_Occurred()) 9691 goto onError; 9692 if (prec < 0) 9693 prec = 0; 9694 if (--fmtcnt >= 0) 9695 c = *fmt++; 9696 } 9697 else if (c >= '0' && c <= '9') { 9698 prec = c - 
'0'; 9699 while (--fmtcnt >= 0) { 9700 c = *fmt++; 9701 if (c < '0' || c > '9') 9702 break; 9703 if ((prec*10) / 10 != prec) { 9704 PyErr_SetString(PyExc_ValueError, 9705 "prec too big"); 9706 goto onError; 9707 } 9708 prec = prec*10 + (c - '0'); 9709 } 9710 } 9711 } /* prec */ 9712 if (fmtcnt >= 0) { 9713 if (c == 'h' || c == 'l' || c == 'L') { 9714 if (--fmtcnt >= 0) 9715 c = *fmt++; 9716 } 9717 } 9718 if (fmtcnt < 0) { 9719 PyErr_SetString(PyExc_ValueError, 9720 "incomplete format"); 9721 goto onError; 9722 } 9723 if (c != '%') { 9724 v = getnextarg(args, arglen, &argidx); 9725 if (v == NULL) 9726 goto onError; 9727 } 9728 sign = 0; 9729 fill = ' '; 9730 switch (c) { 9731 9732 case '%': 9733 pbuf = formatbuf; 9734 /* presume that buffer length is at least 1 */ 9735 pbuf[0] = '%'; 9736 len = 1; 9737 break; 9738 9739 case 's': 9740 case 'r': 9741 case 'a': 9742 if (PyUnicode_CheckExact(v) && c == 's') { 9743 temp = v; 9744 Py_INCREF(temp); 9745 } 9746 else { 9747 if (c == 's') 9748 temp = PyObject_Str(v); 9749 else if (c == 'r') 9750 temp = PyObject_Repr(v); 9751 else 9752 temp = PyObject_ASCII(v); 9753 if (temp == NULL) 9754 goto onError; 9755 if (PyUnicode_Check(temp)) 9756 /* nothing to do */; 9757 else { 9758 Py_DECREF(temp); 9759 PyErr_SetString(PyExc_TypeError, 9760 "%s argument has non-string str()"); 9761 goto onError; 9762 } 9763 } 9764 pbuf = PyUnicode_AS_UNICODE(temp); 9765 len = PyUnicode_GET_SIZE(temp); 9766 if (prec >= 0 && len > prec) 9767 len = prec; 9768 break; 9769 9770 case 'i': 9771 case 'd': 9772 case 'u': 9773 case 'o': 9774 case 'x': 9775 case 'X': 9776 isnumok = 0; 9777 if (PyNumber_Check(v)) { 9778 PyObject *iobj=NULL; 9779 9780 if (PyLong_Check(v)) { 9781 iobj = v; 9782 Py_INCREF(iobj); 9783 } 9784 else { 9785 iobj = PyNumber_Long(v); 9786 } 9787 if (iobj!=NULL) { 9788 if (PyLong_Check(iobj)) { 9789 isnumok = 1; 9790 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 9791 Py_DECREF(iobj); 9792 if (!temp) 9793 goto onError; 9794 pbuf = PyUnicode_AS_UNICODE(temp); 9795 len = PyUnicode_GET_SIZE(temp); 9796 sign = 1; 9797 } 9798 else { 9799 Py_DECREF(iobj); 9800 } 9801 } 9802 } 9803 if (!isnumok) { 9804 PyErr_Format(PyExc_TypeError, 9805 "%%%c format: a number is required, " 9806 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9807 goto onError; 9808 } 9809 if (flags & F_ZERO) 9810 fill = '0'; 9811 break; 9812 9813 case 'e': 9814 case 'E': 9815 case 'f': 9816 case 'F': 9817 case 'g': 9818 case 'G': 9819 temp = formatfloat(v, flags, prec, c); 9820 if (!temp) 9821 goto onError; 9822 pbuf = PyUnicode_AS_UNICODE(temp); 9823 len = PyUnicode_GET_SIZE(temp); 9824 sign = 1; 9825 if (flags & F_ZERO) 9826 fill = '0'; 9827 break; 9828 9829 case 'c': 9830 pbuf = formatbuf; 9831 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9832 if (len < 0) 9833 goto onError; 9834 break; 9835 9836 default: 9837 PyErr_Format(PyExc_ValueError, 9838 "unsupported format character '%c' (0x%x) " 9839 "at index %zd", 9840 (31<=c && c<=126) ? 
(char)c : '?', 9841 (int)c, 9842 (Py_ssize_t)(fmt - 1 - 9843 PyUnicode_AS_UNICODE(uformat))); 9844 goto onError; 9845 } 9846 if (sign) { 9847 if (*pbuf == '-' || *pbuf == '+') { 9848 sign = *pbuf++; 9849 len--; 9850 } 9851 else if (flags & F_SIGN) 9852 sign = '+'; 9853 else if (flags & F_BLANK) 9854 sign = ' '; 9855 else 9856 sign = 0; 9857 } 9858 if (width < len) 9859 width = len; 9860 if (rescnt - (sign != 0) < width) { 9861 reslen -= rescnt; 9862 rescnt = width + fmtcnt + 100; 9863 reslen += rescnt; 9864 if (reslen < 0) { 9865 Py_XDECREF(temp); 9866 PyErr_NoMemory(); 9867 goto onError; 9868 } 9869 if (_PyUnicode_Resize(&result, reslen) < 0) { 9870 Py_XDECREF(temp); 9871 goto onError; 9872 } 9873 res = PyUnicode_AS_UNICODE(result) 9874 + reslen - rescnt; 9875 } 9876 if (sign) { 9877 if (fill != ' ') 9878 *res++ = sign; 9879 rescnt--; 9880 if (width > len) 9881 width--; 9882 } 9883 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9884 assert(pbuf[0] == '0'); 9885 assert(pbuf[1] == c); 9886 if (fill != ' ') { 9887 *res++ = *pbuf++; 9888 *res++ = *pbuf++; 9889 } 9890 rescnt -= 2; 9891 width -= 2; 9892 if (width < 0) 9893 width = 0; 9894 len -= 2; 9895 } 9896 if (width > len && !(flags & F_LJUST)) { 9897 do { 9898 --rescnt; 9899 *res++ = fill; 9900 } while (--width > len); 9901 } 9902 if (fill == ' ') { 9903 if (sign) 9904 *res++ = sign; 9905 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9906 assert(pbuf[0] == '0'); 9907 assert(pbuf[1] == c); 9908 *res++ = *pbuf++; 9909 *res++ = *pbuf++; 9910 } 9911 } 9912 Py_UNICODE_COPY(res, pbuf, len); 9913 res += len; 9914 rescnt -= len; 9915 while (--width >= len) { 9916 --rescnt; 9917 *res++ = ' '; 9918 } 9919 if (dict && (argidx < arglen) && c != '%') { 9920 PyErr_SetString(PyExc_TypeError, 9921 "not all arguments converted during string formatting"); 9922 Py_XDECREF(temp); 9923 goto onError; 9924 } 9925 Py_XDECREF(temp); 9926 } /* '%' */ 9927 } /* until end */ 9928 if (argidx < arglen && !dict) { 
9929 PyErr_SetString(PyExc_TypeError, 9930 "not all arguments converted during string formatting"); 9931 goto onError; 9932 } 9933 9934 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9935 goto onError; 9936 if (args_owned) { 9937 Py_DECREF(args); 9938 } 9939 Py_DECREF(uformat); 9940 return (PyObject *)result; 9941 9942 onError: 9943 Py_XDECREF(result); 9944 Py_DECREF(uformat); 9945 if (args_owned) { 9946 Py_DECREF(args); 9947 } 9948 return NULL; 9949} 9950 9951static PyObject * 9952unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9953 9954static PyObject * 9955unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9956{ 9957 PyObject *x = NULL; 9958 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9959 char *encoding = NULL; 9960 char *errors = NULL; 9961 9962 if (type != &PyUnicode_Type) 9963 return unicode_subtype_new(type, args, kwds); 9964 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9965 kwlist, &x, &encoding, &errors)) 9966 return NULL; 9967 if (x == NULL) 9968 return (PyObject *)_PyUnicode_New(0); 9969 if (encoding == NULL && errors == NULL) 9970 return PyObject_Str(x); 9971 else 9972 return PyUnicode_FromEncodedObject(x, encoding, errors); 9973} 9974 9975static PyObject * 9976unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9977{ 9978 PyUnicodeObject *tmp, *pnew; 9979 Py_ssize_t n; 9980 9981 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9982 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9983 if (tmp == NULL) 9984 return NULL; 9985 assert(PyUnicode_Check(tmp)); 9986 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9987 if (pnew == NULL) { 9988 Py_DECREF(tmp); 9989 return NULL; 9990 } 9991 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9992 if (pnew->str == NULL) { 9993 _Py_ForgetReference((PyObject *)pnew); 9994 PyObject_Del(pnew); 9995 Py_DECREF(tmp); 9996 return PyErr_NoMemory(); 9997 } 9998 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9999 pnew->length = n; 10000 pnew->hash = tmp->hash; 10001 Py_DECREF(tmp); 10002 return (PyObject *)pnew; 10003} 10004 10005PyDoc_STRVAR(unicode_doc, 10006 "str(string[, encoding[, errors]]) -> str\n\ 10007\n\ 10008Create a new string object from the given encoded string.\n\ 10009encoding defaults to the current default string encoding.\n\ 10010errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 10011 10012static PyObject *unicode_iter(PyObject *seq); 10013 10014PyTypeObject PyUnicode_Type = { 10015 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10016 "str", /* tp_name */ 10017 sizeof(PyUnicodeObject), /* tp_size */ 10018 0, /* tp_itemsize */ 10019 /* Slots */ 10020 (destructor)unicode_dealloc, /* tp_dealloc */ 10021 0, /* tp_print */ 10022 0, /* tp_getattr */ 10023 0, /* tp_setattr */ 10024 0, /* tp_reserved */ 10025 unicode_repr, /* tp_repr */ 10026 &unicode_as_number, /* tp_as_number */ 10027 &unicode_as_sequence, /* tp_as_sequence */ 10028 &unicode_as_mapping, /* tp_as_mapping */ 10029 (hashfunc) unicode_hash, /* tp_hash*/ 10030 0, /* tp_call*/ 10031 (reprfunc) unicode_str, /* tp_str */ 10032 PyObject_GenericGetAttr, /* tp_getattro */ 10033 0, /* tp_setattro */ 10034 0, /* tp_as_buffer */ 10035 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 10036 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 10037 unicode_doc, /* tp_doc */ 10038 0, /* tp_traverse */ 10039 0, /* tp_clear */ 10040 PyUnicode_RichCompare, /* tp_richcompare */ 10041 0, /* tp_weaklistoffset */ 10042 unicode_iter, /* tp_iter */ 10043 0, /* tp_iternext */ 10044 unicode_methods, /* tp_methods */ 10045 0, /* tp_members */ 10046 0, /* tp_getset */ 10047 &PyBaseObject_Type, /* tp_base */ 10048 0, /* tp_dict */ 10049 0, /* tp_descr_get */ 10050 0, /* tp_descr_set */ 10051 0, /* tp_dictoffset */ 10052 0, /* tp_init */ 10053 0, /* tp_alloc */ 10054 unicode_new, /* tp_new */ 10055 PyObject_Del, /* tp_free */ 10056}; 10057 10058/* Initialize the Unicode 
implementation */ 10059 10060void _PyUnicode_Init(void) 10061{ 10062 int i; 10063 10064 /* XXX - move this array to unicodectype.c ? */ 10065 Py_UNICODE linebreak[] = { 10066 0x000A, /* LINE FEED */ 10067 0x000D, /* CARRIAGE RETURN */ 10068 0x001C, /* FILE SEPARATOR */ 10069 0x001D, /* GROUP SEPARATOR */ 10070 0x001E, /* RECORD SEPARATOR */ 10071 0x0085, /* NEXT LINE */ 10072 0x2028, /* LINE SEPARATOR */ 10073 0x2029, /* PARAGRAPH SEPARATOR */ 10074 }; 10075 10076 /* Init the implementation */ 10077 free_list = NULL; 10078 numfree = 0; 10079 unicode_empty = _PyUnicode_New(0); 10080 if (!unicode_empty) 10081 return; 10082 10083 for (i = 0; i < 256; i++) 10084 unicode_latin1[i] = NULL; 10085 if (PyType_Ready(&PyUnicode_Type) < 0) 10086 Py_FatalError("Can't initialize 'unicode'"); 10087 10088 /* initialize the linebreak bloom filter */ 10089 bloom_linebreak = make_bloom_mask( 10090 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 10091 ); 10092 10093 PyType_Ready(&EncodingMapType); 10094} 10095 10096/* Finalize the Unicode implementation */ 10097 10098int 10099PyUnicode_ClearFreeList(void) 10100{ 10101 int freelist_size = numfree; 10102 PyUnicodeObject *u; 10103 10104 for (u = free_list; u != NULL;) { 10105 PyUnicodeObject *v = u; 10106 u = *(PyUnicodeObject **)u; 10107 if (v->str) 10108 PyObject_DEL(v->str); 10109 Py_XDECREF(v->defenc); 10110 PyObject_Del(v); 10111 numfree--; 10112 } 10113 free_list = NULL; 10114 assert(numfree == 0); 10115 return freelist_size; 10116} 10117 10118void 10119_PyUnicode_Fini(void) 10120{ 10121 int i; 10122 10123 Py_XDECREF(unicode_empty); 10124 unicode_empty = NULL; 10125 10126 for (i = 0; i < 256; i++) { 10127 if (unicode_latin1[i]) { 10128 Py_DECREF(unicode_latin1[i]); 10129 unicode_latin1[i] = NULL; 10130 } 10131 } 10132 (void)PyUnicode_ClearFreeList(); 10133} 10134 10135void 10136PyUnicode_InternInPlace(PyObject **p) 10137{ 10138 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 10139 PyObject *t; 10140 if (s == NULL || 
!PyUnicode_Check(s)) 10141 Py_FatalError( 10142 "PyUnicode_InternInPlace: unicode strings only please!"); 10143 /* If it's a subclass, we don't really know what putting 10144 it in the interned dict might do. */ 10145 if (!PyUnicode_CheckExact(s)) 10146 return; 10147 if (PyUnicode_CHECK_INTERNED(s)) 10148 return; 10149 if (interned == NULL) { 10150 interned = PyDict_New(); 10151 if (interned == NULL) { 10152 PyErr_Clear(); /* Don't leave an exception */ 10153 return; 10154 } 10155 } 10156 /* It might be that the GetItem call fails even 10157 though the key is present in the dictionary, 10158 namely when this happens during a stack overflow. */ 10159 Py_ALLOW_RECURSION 10160 t = PyDict_GetItem(interned, (PyObject *)s); 10161 Py_END_ALLOW_RECURSION 10162 10163 if (t) { 10164 Py_INCREF(t); 10165 Py_DECREF(*p); 10166 *p = t; 10167 return; 10168 } 10169 10170 PyThreadState_GET()->recursion_critical = 1; 10171 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 10172 PyErr_Clear(); 10173 PyThreadState_GET()->recursion_critical = 0; 10174 return; 10175 } 10176 PyThreadState_GET()->recursion_critical = 0; 10177 /* The two references in interned are not counted by refcnt. 
10178 The deallocator will take care of this */ 10179 Py_REFCNT(s) -= 2; 10180 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 10181} 10182 10183void 10184PyUnicode_InternImmortal(PyObject **p) 10185{ 10186 PyUnicode_InternInPlace(p); 10187 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 10188 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 10189 Py_INCREF(*p); 10190 } 10191} 10192 10193PyObject * 10194PyUnicode_InternFromString(const char *cp) 10195{ 10196 PyObject *s = PyUnicode_FromString(cp); 10197 if (s == NULL) 10198 return NULL; 10199 PyUnicode_InternInPlace(&s); 10200 return s; 10201} 10202 10203void 10204_Py_ReleaseInternedUnicodeStrings(void) 10205{ 10206 PyObject *keys; 10207 PyUnicodeObject *s; 10208 Py_ssize_t i, n; 10209 Py_ssize_t immortal_size = 0, mortal_size = 0; 10210 10211 if (interned == NULL || !PyDict_Check(interned)) 10212 return; 10213 keys = PyDict_Keys(interned); 10214 if (keys == NULL || !PyList_Check(keys)) { 10215 PyErr_Clear(); 10216 return; 10217 } 10218 10219 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 10220 detector, interned unicode strings are not forcibly deallocated; 10221 rather, we give them their stolen references back, and then clear 10222 and DECREF the interned dict. 
*/ 10223 10224 n = PyList_GET_SIZE(keys); 10225 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 10226 n); 10227 for (i = 0; i < n; i++) { 10228 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 10229 switch (s->state) { 10230 case SSTATE_NOT_INTERNED: 10231 /* XXX Shouldn't happen */ 10232 break; 10233 case SSTATE_INTERNED_IMMORTAL: 10234 Py_REFCNT(s) += 1; 10235 immortal_size += s->length; 10236 break; 10237 case SSTATE_INTERNED_MORTAL: 10238 Py_REFCNT(s) += 2; 10239 mortal_size += s->length; 10240 break; 10241 default: 10242 Py_FatalError("Inconsistent interned string state."); 10243 } 10244 s->state = SSTATE_NOT_INTERNED; 10245 } 10246 fprintf(stderr, "total size of all interned strings: " 10247 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 10248 "mortal/immortal\n", mortal_size, immortal_size); 10249 Py_DECREF(keys); 10250 PyDict_Clear(interned); 10251 Py_DECREF(interned); 10252 interned = NULL; 10253} 10254 10255 10256/********************* Unicode Iterator **************************/ 10257 10258typedef struct { 10259 PyObject_HEAD 10260 Py_ssize_t it_index; 10261 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 10262} unicodeiterobject; 10263 10264static void 10265unicodeiter_dealloc(unicodeiterobject *it) 10266{ 10267 _PyObject_GC_UNTRACK(it); 10268 Py_XDECREF(it->it_seq); 10269 PyObject_GC_Del(it); 10270} 10271 10272static int 10273unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 10274{ 10275 Py_VISIT(it->it_seq); 10276 return 0; 10277} 10278 10279static PyObject * 10280unicodeiter_next(unicodeiterobject *it) 10281{ 10282 PyUnicodeObject *seq; 10283 PyObject *item; 10284 10285 assert(it != NULL); 10286 seq = it->it_seq; 10287 if (seq == NULL) 10288 return NULL; 10289 assert(PyUnicode_Check(seq)); 10290 10291 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 10292 item = PyUnicode_FromUnicode( 10293 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 10294 if (item != NULL) 10295 ++it->it_index; 10296 
return item; 10297 } 10298 10299 Py_DECREF(seq); 10300 it->it_seq = NULL; 10301 return NULL; 10302} 10303 10304static PyObject * 10305unicodeiter_len(unicodeiterobject *it) 10306{ 10307 Py_ssize_t len = 0; 10308 if (it->it_seq) 10309 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 10310 return PyLong_FromSsize_t(len); 10311} 10312 10313PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 10314 10315static PyMethodDef unicodeiter_methods[] = { 10316 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 10317 length_hint_doc}, 10318 {NULL, NULL} /* sentinel */ 10319}; 10320 10321PyTypeObject PyUnicodeIter_Type = { 10322 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10323 "str_iterator", /* tp_name */ 10324 sizeof(unicodeiterobject), /* tp_basicsize */ 10325 0, /* tp_itemsize */ 10326 /* methods */ 10327 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 10328 0, /* tp_print */ 10329 0, /* tp_getattr */ 10330 0, /* tp_setattr */ 10331 0, /* tp_reserved */ 10332 0, /* tp_repr */ 10333 0, /* tp_as_number */ 10334 0, /* tp_as_sequence */ 10335 0, /* tp_as_mapping */ 10336 0, /* tp_hash */ 10337 0, /* tp_call */ 10338 0, /* tp_str */ 10339 PyObject_GenericGetAttr, /* tp_getattro */ 10340 0, /* tp_setattro */ 10341 0, /* tp_as_buffer */ 10342 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 10343 0, /* tp_doc */ 10344 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 10345 0, /* tp_clear */ 10346 0, /* tp_richcompare */ 10347 0, /* tp_weaklistoffset */ 10348 PyObject_SelfIter, /* tp_iter */ 10349 (iternextfunc)unicodeiter_next, /* tp_iternext */ 10350 unicodeiter_methods, /* tp_methods */ 10351 0, 10352}; 10353 10354static PyObject * 10355unicode_iter(PyObject *seq) 10356{ 10357 unicodeiterobject *it; 10358 10359 if (!PyUnicode_Check(seq)) { 10360 PyErr_BadInternalCall(); 10361 return NULL; 10362 } 10363 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 10364 if (it == NULL) 10365 return NULL; 10366 
it->it_index = 0; 10367 Py_INCREF(seq); 10368 it->it_seq = (PyUnicodeObject *)seq; 10369 _PyObject_GC_TRACK(it); 10370 return (PyObject *)it; 10371} 10372 10373size_t 10374Py_UNICODE_strlen(const Py_UNICODE *u) 10375{ 10376 int res = 0; 10377 while(*u++) 10378 res++; 10379 return res; 10380} 10381 10382Py_UNICODE* 10383Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 10384{ 10385 Py_UNICODE *u = s1; 10386 while ((*u++ = *s2++)); 10387 return s1; 10388} 10389 10390Py_UNICODE* 10391Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10392{ 10393 Py_UNICODE *u = s1; 10394 while ((*u++ = *s2++)) 10395 if (n-- == 0) 10396 break; 10397 return s1; 10398} 10399 10400Py_UNICODE* 10401Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 10402{ 10403 Py_UNICODE *u1 = s1; 10404 u1 += Py_UNICODE_strlen(u1); 10405 Py_UNICODE_strcpy(u1, s2); 10406 return s1; 10407} 10408 10409int 10410Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 10411{ 10412 while (*s1 && *s2 && *s1 == *s2) 10413 s1++, s2++; 10414 if (*s1 && *s2) 10415 return (*s1 < *s2) ? -1 : +1; 10416 if (*s1) 10417 return 1; 10418 if (*s2) 10419 return -1; 10420 return 0; 10421} 10422 10423int 10424Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10425{ 10426 register Py_UNICODE u1, u2; 10427 for (; n != 0; n--) { 10428 u1 = *s1; 10429 u2 = *s2; 10430 if (u1 != u2) 10431 return (u1 < u2) ? 
-1 : +1; 10432 if (u1 == '\0') 10433 return 0; 10434 s1++; 10435 s2++; 10436 } 10437 return 0; 10438} 10439 10440Py_UNICODE* 10441Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 10442{ 10443 const Py_UNICODE *p; 10444 for (p = s; *p; p++) 10445 if (*p == c) 10446 return (Py_UNICODE*)p; 10447 return NULL; 10448} 10449 10450Py_UNICODE* 10451Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10452{ 10453 const Py_UNICODE *p; 10454 p = s + Py_UNICODE_strlen(s); 10455 while (p != s) { 10456 p--; 10457 if (*p == c) 10458 return (Py_UNICODE*)p; 10459 } 10460 return NULL; 10461} 10462 10463Py_UNICODE* 10464PyUnicode_AsUnicodeCopy(PyObject *object) 10465{ 10466 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10467 Py_UNICODE *copy; 10468 Py_ssize_t size; 10469 10470 /* Ensure we won't overflow the size. */ 10471 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10472 PyErr_NoMemory(); 10473 return NULL; 10474 } 10475 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10476 size *= sizeof(Py_UNICODE); 10477 copy = PyMem_Malloc(size); 10478 if (copy == NULL) { 10479 PyErr_NoMemory(); 10480 return NULL; 10481 } 10482 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10483 return copy; 10484} 10485 10486/* A _string module, to export formatter_parser and formatter_field_name_split 10487 to the string.Formatter class implemented in Python. 
*/ 10488 10489static PyMethodDef _string_methods[] = { 10490 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 10491 METH_O, PyDoc_STR("split the argument as a field name")}, 10492 {"formatter_parser", (PyCFunction) formatter_parser, 10493 METH_O, PyDoc_STR("parse the argument as a format string")}, 10494 {NULL, NULL} 10495}; 10496 10497static struct PyModuleDef _string_module = { 10498 PyModuleDef_HEAD_INIT, 10499 "_string", 10500 PyDoc_STR("string helper module"), 10501 0, 10502 _string_methods, 10503 NULL, 10504 NULL, 10505 NULL, 10506 NULL 10507}; 10508 10509PyMODINIT_FUNC 10510PyInit__string(void) 10511{ 10512 return PyModule_Create(&_string_module); 10513} 10514 10515 10516#ifdef __cplusplus 10517} 10518#endif 10519