unicodeobject.c revision f3fd733f928752c9e35f8f5141a54cd21c0993b5
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "ucnhash.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Limit for the Unicode object free list */ 51 52#define PyUnicode_MAXFREELIST 1024 53 54/* Limit for the Unicode object free list stay alive optimization. 55 56 The implementation will keep allocated Unicode memory intact for 57 all objects on the free list having a size less than this 58 limit. This reduces malloc() overhead for small Unicode objects. 59 60 At worst this will result in PyUnicode_MAXFREELIST * 61 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 62 malloc()-overhead) bytes of unused garbage. 63 64 Setting the limit to 0 effectively turns the feature off. 65 66 Note: This is an experimental feature ! If you get core dumps when 67 using Unicode objects, turn this feature off. 68 69*/ 70 71#define KEEPALIVE_SIZE_LIMIT 9 72 73/* Endianness switches; defaults to little endian */ 74 75#ifdef WORDS_BIGENDIAN 76# define BYTEORDER_IS_BIG_ENDIAN 77#else 78# define BYTEORDER_IS_LITTLE_ENDIAN 79#endif 80 81/* --- Globals ------------------------------------------------------------ 82 83 The globals are initialized by the _PyUnicode_Init() API and should 84 not be used before calling that API. 85 86*/ 87 88 89#ifdef __cplusplus 90extern "C" { 91#endif 92 93/* This dictionary holds all interned unicode strings. Note that references 94 to strings in this dictionary are *not* counted in the string's ob_refcnt. 95 When the interned string reaches a refcnt of 0 the string deallocation 96 function will delete the reference from this dictionary. 97 98 Another way to look at this is that to say that the actual reference 99 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 100*/ 101static PyObject *interned; 102 103/* Free list for Unicode objects */ 104static PyUnicodeObject *free_list; 105static int numfree; 106 107/* The empty Unicode object is shared to improve performance. */ 108static PyUnicodeObject *unicode_empty; 109 110/* Single character Unicode strings in the Latin-1 range are being 111 shared as well. */ 112static PyUnicodeObject *unicode_latin1[256]; 113 114/* Fast detection of the most frequent whitespace characters */ 115const unsigned char _Py_ascii_whitespace[] = { 116 0, 0, 0, 0, 0, 0, 0, 0, 117/* case 0x0009: * CHARACTER TABULATION */ 118/* case 0x000A: * LINE FEED */ 119/* case 0x000B: * LINE TABULATION */ 120/* case 0x000C: * FORM FEED */ 121/* case 0x000D: * CARRIAGE RETURN */ 122 0, 1, 1, 1, 1, 1, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 124/* case 0x001C: * FILE SEPARATOR */ 125/* case 0x001D: * GROUP SEPARATOR */ 126/* case 0x001E: * RECORD SEPARATOR */ 127/* case 0x001F: * UNIT SEPARATOR */ 128 0, 0, 0, 0, 1, 1, 1, 1, 129/* case 0x0020: * SPACE */ 130 1, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 0, 0, 0, 0, 0, 0, 0, 134 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0 143}; 144 145static PyObject * 146unicode_encode_call_errorhandler(const char *errors, 147 PyObject **errorHandler,const char *encoding, const char *reason, 148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 150 151static void 152raise_encode_exception(PyObject **exceptionObject, 153 const char *encoding, 154 const Py_UNICODE *unicode, Py_ssize_t size, 155 Py_ssize_t startpos, Py_ssize_t endpos, 156 const char *reason); 157 158/* Same for linebreaks */ 159static unsigned char ascii_linebreak[] = { 160 0, 0, 0, 0, 0, 0, 0, 0, 161/* 0x000A, * 
LINE FEED */ 162/* 0x000B, * LINE TABULATION */ 163/* 0x000C, * FORM FEED */ 164/* 0x000D, * CARRIAGE RETURN */ 165 0, 0, 1, 1, 1, 1, 0, 0, 166 0, 0, 0, 0, 0, 0, 0, 0, 167/* 0x001C, * FILE SEPARATOR */ 168/* 0x001D, * GROUP SEPARATOR */ 169/* 0x001E, * RECORD SEPARATOR */ 170 0, 0, 0, 0, 1, 1, 1, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0 184}; 185 186 187Py_UNICODE 188PyUnicode_GetMax(void) 189{ 190#ifdef Py_UNICODE_WIDE 191 return 0x10FFFF; 192#else 193 /* This is actually an illegal character, so it should 194 not be passed to unichr. */ 195 return 0xFFFF; 196#endif 197} 198 199/* --- Bloom Filters ----------------------------------------------------- */ 200 201/* stuff to implement simple "bloom filters" for Unicode characters. 202 to keep things simple, we use a single bitmask, using the least 5 203 bits from each unicode characters as the bit index. */ 204 205/* the linebreak mask is set up by Unicode_Init below */ 206 207#if LONG_BIT >= 128 208#define BLOOM_WIDTH 128 209#elif LONG_BIT >= 64 210#define BLOOM_WIDTH 64 211#elif LONG_BIT >= 32 212#define BLOOM_WIDTH 32 213#else 214#error "LONG_BIT is smaller than 32" 215#endif 216 217#define BLOOM_MASK unsigned long 218 219static BLOOM_MASK bloom_linebreak; 220 221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223 224#define BLOOM_LINEBREAK(ch) \ 225 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 227 228Py_LOCAL_INLINE(BLOOM_MASK) 229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230{ 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241} 242 243Py_LOCAL_INLINE(int) 244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 245{ 246 Py_ssize_t i; 247 248 for (i = 0; i < setlen; i++) 249 if (set[i] == chr) 250 return 1; 251 252 return 0; 253} 254 255#define BLOOM_MEMBER(mask, chr, set, setlen) \ 256 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 257 258/* --- Unicode Object ----------------------------------------------------- */ 259 260static int 261unicode_resize(register PyUnicodeObject *unicode, 262 Py_ssize_t length) 263{ 264 void *oldstr; 265 266 /* Shortcut if there's nothing much to do. */ 267 if (unicode->length == length) 268 goto reset; 269 270 /* Resizing shared object (unicode_empty or single character 271 objects) in-place is not allowed. Use PyUnicode_Resize() 272 instead ! */ 273 274 if (unicode == unicode_empty || 275 (unicode->length == 1 && 276 unicode->str[0] < 256U && 277 unicode_latin1[unicode->str[0]] == unicode)) { 278 PyErr_SetString(PyExc_SystemError, 279 "can't resize shared str objects"); 280 return -1; 281 } 282 283 /* We allocate one more byte to make sure the string is Ux0000 terminated. 284 The overallocation is also used by fastsearch, which assumes that it's 285 safe to look at str[length] (without making any assumptions about what 286 it contains). 
*/ 287 288 oldstr = unicode->str; 289 unicode->str = PyObject_REALLOC(unicode->str, 290 sizeof(Py_UNICODE) * (length + 1)); 291 if (!unicode->str) { 292 unicode->str = (Py_UNICODE *)oldstr; 293 PyErr_NoMemory(); 294 return -1; 295 } 296 unicode->str[length] = 0; 297 unicode->length = length; 298 299 reset: 300 /* Reset the object caches */ 301 if (unicode->defenc) { 302 Py_CLEAR(unicode->defenc); 303 } 304 unicode->hash = -1; 305 306 return 0; 307} 308 309/* We allocate one more byte to make sure the string is 310 Ux0000 terminated; some code (e.g. new_identifier) 311 relies on that. 312 313 XXX This allocator could further be enhanced by assuring that the 314 free list never reduces its size below 1. 315 316*/ 317 318static PyUnicodeObject * 319_PyUnicode_New(Py_ssize_t length) 320{ 321 register PyUnicodeObject *unicode; 322 323 /* Optimization for empty strings */ 324 if (length == 0 && unicode_empty != NULL) { 325 Py_INCREF(unicode_empty); 326 return unicode_empty; 327 } 328 329 /* Ensure we won't overflow the size. */ 330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 331 return (PyUnicodeObject *)PyErr_NoMemory(); 332 } 333 334 /* Unicode freelist & memory allocation */ 335 if (free_list) { 336 unicode = free_list; 337 free_list = *(PyUnicodeObject **)unicode; 338 numfree--; 339 if (unicode->str) { 340 /* Keep-Alive optimization: we only upsize the buffer, 341 never downsize it. 
*/ 342 if ((unicode->length < length) && 343 unicode_resize(unicode, length) < 0) { 344 PyObject_DEL(unicode->str); 345 unicode->str = NULL; 346 } 347 } 348 else { 349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 351 } 352 PyObject_INIT(unicode, &PyUnicode_Type); 353 } 354 else { 355 size_t new_size; 356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 357 if (unicode == NULL) 358 return NULL; 359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 361 } 362 363 if (!unicode->str) { 364 PyErr_NoMemory(); 365 goto onError; 366 } 367 /* Initialize the first element to guard against cases where 368 * the caller fails before initializing str -- unicode_resize() 369 * reads str[0], and the Keep-Alive optimization can keep memory 370 * allocated for str alive across a call to unicode_dealloc(unicode). 371 * We don't want unicode_resize to read uninitialized memory in 372 * that case. 
373 */ 374 unicode->str[0] = 0; 375 unicode->str[length] = 0; 376 unicode->length = length; 377 unicode->hash = -1; 378 unicode->state = 0; 379 unicode->defenc = NULL; 380 return unicode; 381 382 onError: 383 /* XXX UNREF/NEWREF interface should be more symmetrical */ 384 _Py_DEC_REFTOTAL; 385 _Py_ForgetReference((PyObject *)unicode); 386 PyObject_Del(unicode); 387 return NULL; 388} 389 390static void 391unicode_dealloc(register PyUnicodeObject *unicode) 392{ 393 switch (PyUnicode_CHECK_INTERNED(unicode)) { 394 case SSTATE_NOT_INTERNED: 395 break; 396 397 case SSTATE_INTERNED_MORTAL: 398 /* revive dead object temporarily for DelItem */ 399 Py_REFCNT(unicode) = 3; 400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 401 Py_FatalError( 402 "deletion of interned string failed"); 403 break; 404 405 case SSTATE_INTERNED_IMMORTAL: 406 Py_FatalError("Immortal interned string died."); 407 408 default: 409 Py_FatalError("Inconsistent interned string state."); 410 } 411 412 if (PyUnicode_CheckExact(unicode) && 413 numfree < PyUnicode_MAXFREELIST) { 414 /* Keep-Alive optimization */ 415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 416 PyObject_DEL(unicode->str); 417 unicode->str = NULL; 418 unicode->length = 0; 419 } 420 if (unicode->defenc) { 421 Py_CLEAR(unicode->defenc); 422 } 423 /* Add to free list */ 424 *(PyUnicodeObject **)unicode = free_list; 425 free_list = unicode; 426 numfree++; 427 } 428 else { 429 PyObject_DEL(unicode->str); 430 Py_XDECREF(unicode->defenc); 431 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 432 } 433} 434 435static int 436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 437{ 438 register PyUnicodeObject *v; 439 440 /* Argument checks */ 441 if (unicode == NULL) { 442 PyErr_BadInternalCall(); 443 return -1; 444 } 445 v = *unicode; 446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 447 PyErr_BadInternalCall(); 448 return -1; 449 } 450 451 /* Resizing unicode_empty and single character objects is 
not 452 possible since these are being shared. We simply return a fresh 453 copy with the same Unicode content. */ 454 if (v->length != length && 455 (v == unicode_empty || v->length == 1)) { 456 PyUnicodeObject *w = _PyUnicode_New(length); 457 if (w == NULL) 458 return -1; 459 Py_UNICODE_COPY(w->str, v->str, 460 length < v->length ? length : v->length); 461 Py_DECREF(*unicode); 462 *unicode = w; 463 return 0; 464 } 465 466 /* Note that we don't have to modify *unicode for unshared Unicode 467 objects, since we can modify them in-place. */ 468 return unicode_resize(v, length); 469} 470 471int 472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 473{ 474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 475} 476 477PyObject * 478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 479{ 480 PyUnicodeObject *unicode; 481 482 /* If the Unicode data is known at construction time, we can apply 483 some optimizations which share commonly used objects. */ 484 if (u != NULL) { 485 486 /* Optimization for empty strings */ 487 if (size == 0 && unicode_empty != NULL) { 488 Py_INCREF(unicode_empty); 489 return (PyObject *)unicode_empty; 490 } 491 492 /* Single character Unicode objects in the Latin-1 range are 493 shared when using this constructor */ 494 if (size == 1 && *u < 256) { 495 unicode = unicode_latin1[*u]; 496 if (!unicode) { 497 unicode = _PyUnicode_New(1); 498 if (!unicode) 499 return NULL; 500 unicode->str[0] = *u; 501 unicode_latin1[*u] = unicode; 502 } 503 Py_INCREF(unicode); 504 return (PyObject *)unicode; 505 } 506 } 507 508 unicode = _PyUnicode_New(size); 509 if (!unicode) 510 return NULL; 511 512 /* Copy the Unicode data into the new object */ 513 if (u != NULL) 514 Py_UNICODE_COPY(unicode->str, u, size); 515 516 return (PyObject *)unicode; 517} 518 519PyObject * 520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 521{ 522 PyUnicodeObject *unicode; 523 524 if (size < 0) { 525 PyErr_SetString(PyExc_SystemError, 526 
"Negative size passed to PyUnicode_FromStringAndSize"); 527 return NULL; 528 } 529 530 /* If the Unicode data is known at construction time, we can apply 531 some optimizations which share commonly used objects. 532 Also, this means the input must be UTF-8, so fall back to the 533 UTF-8 decoder at the end. */ 534 if (u != NULL) { 535 536 /* Optimization for empty strings */ 537 if (size == 0 && unicode_empty != NULL) { 538 Py_INCREF(unicode_empty); 539 return (PyObject *)unicode_empty; 540 } 541 542 /* Single characters are shared when using this constructor. 543 Restrict to ASCII, since the input must be UTF-8. */ 544 if (size == 1 && Py_CHARMASK(*u) < 128) { 545 unicode = unicode_latin1[Py_CHARMASK(*u)]; 546 if (!unicode) { 547 unicode = _PyUnicode_New(1); 548 if (!unicode) 549 return NULL; 550 unicode->str[0] = Py_CHARMASK(*u); 551 unicode_latin1[Py_CHARMASK(*u)] = unicode; 552 } 553 Py_INCREF(unicode); 554 return (PyObject *)unicode; 555 } 556 557 return PyUnicode_DecodeUTF8(u, size, NULL); 558 } 559 560 unicode = _PyUnicode_New(size); 561 if (!unicode) 562 return NULL; 563 564 return (PyObject *)unicode; 565} 566 567PyObject * 568PyUnicode_FromString(const char *u) 569{ 570 size_t size = strlen(u); 571 if (size > PY_SSIZE_T_MAX) { 572 PyErr_SetString(PyExc_OverflowError, "input too long"); 573 return NULL; 574 } 575 576 return PyUnicode_FromStringAndSize(u, size); 577} 578 579#ifdef HAVE_WCHAR_H 580 581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 582# define CONVERT_WCHAR_TO_SURROGATES 583#endif 584 585#ifdef CONVERT_WCHAR_TO_SURROGATES 586 587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 588 to convert from UTF32 to UTF16. 
*/ 589 590PyObject * 591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 592{ 593 PyUnicodeObject *unicode; 594 register Py_ssize_t i; 595 Py_ssize_t alloc; 596 const wchar_t *orig_w; 597 598 if (w == NULL) { 599 if (size == 0) 600 return PyUnicode_FromStringAndSize(NULL, 0); 601 PyErr_BadInternalCall(); 602 return NULL; 603 } 604 605 if (size == -1) { 606 size = wcslen(w); 607 } 608 609 alloc = size; 610 orig_w = w; 611 for (i = size; i > 0; i--) { 612 if (*w > 0xFFFF) 613 alloc++; 614 w++; 615 } 616 w = orig_w; 617 unicode = _PyUnicode_New(alloc); 618 if (!unicode) 619 return NULL; 620 621 /* Copy the wchar_t data into the new object */ 622 { 623 register Py_UNICODE *u; 624 u = PyUnicode_AS_UNICODE(unicode); 625 for (i = size; i > 0; i--) { 626 if (*w > 0xFFFF) { 627 wchar_t ordinal = *w++; 628 ordinal -= 0x10000; 629 *u++ = 0xD800 | (ordinal >> 10); 630 *u++ = 0xDC00 | (ordinal & 0x3FF); 631 } 632 else 633 *u++ = *w++; 634 } 635 } 636 return (PyObject *)unicode; 637} 638 639#else 640 641PyObject * 642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 643{ 644 PyUnicodeObject *unicode; 645 646 if (w == NULL) { 647 if (size == 0) 648 return PyUnicode_FromStringAndSize(NULL, 0); 649 PyErr_BadInternalCall(); 650 return NULL; 651 } 652 653 if (size == -1) { 654 size = wcslen(w); 655 } 656 657 unicode = _PyUnicode_New(size); 658 if (!unicode) 659 return NULL; 660 661 /* Copy the wchar_t data into the new object */ 662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 663 memcpy(unicode->str, w, size * sizeof(wchar_t)); 664#else 665 { 666 register Py_UNICODE *u; 667 register Py_ssize_t i; 668 u = PyUnicode_AS_UNICODE(unicode); 669 for (i = size; i > 0; i--) 670 *u++ = *w++; 671 } 672#endif 673 674 return (PyObject *)unicode; 675} 676 677#endif /* CONVERT_WCHAR_TO_SURROGATES */ 678 679#undef CONVERT_WCHAR_TO_SURROGATES 680 681static void 682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 683 int zeropad, int width, int precision, 
char c) 684{ 685 *fmt++ = '%'; 686 if (width) { 687 if (zeropad) 688 *fmt++ = '0'; 689 fmt += sprintf(fmt, "%d", width); 690 } 691 if (precision) 692 fmt += sprintf(fmt, ".%d", precision); 693 if (longflag) 694 *fmt++ = 'l'; 695 else if (longlongflag) { 696 /* longlongflag should only ever be nonzero on machines with 697 HAVE_LONG_LONG defined */ 698#ifdef HAVE_LONG_LONG 699 char *f = PY_FORMAT_LONG_LONG; 700 while (*f) 701 *fmt++ = *f++; 702#else 703 /* we shouldn't ever get here */ 704 assert(0); 705 *fmt++ = 'l'; 706#endif 707 } 708 else if (size_tflag) { 709 char *f = PY_FORMAT_SIZE_T; 710 while (*f) 711 *fmt++ = *f++; 712 } 713 *fmt++ = c; 714 *fmt = '\0'; 715} 716 717/* helper for PyUnicode_FromFormatV() */ 718 719static const char* 720parse_format_flags(const char *f, 721 int *p_width, int *p_precision, 722 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 723{ 724 int width, precision, longflag, longlongflag, size_tflag; 725 726 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 727 f++; 728 width = 0; 729 while (Py_ISDIGIT((unsigned)*f)) 730 width = (width*10) + *f++ - '0'; 731 precision = 0; 732 if (*f == '.') { 733 f++; 734 while (Py_ISDIGIT((unsigned)*f)) 735 precision = (precision*10) + *f++ - '0'; 736 if (*f == '%') { 737 /* "%.3%s" => f points to "3" */ 738 f--; 739 } 740 } 741 if (*f == '\0') { 742 /* bogus format "%.1" => go backward, f points to "1" */ 743 f--; 744 } 745 if (p_width != NULL) 746 *p_width = width; 747 if (p_precision != NULL) 748 *p_precision = precision; 749 750 /* Handle %ld, %lu, %lld and %llu. */ 751 longflag = 0; 752 longlongflag = 0; 753 size_tflag = 0; 754 755 if (*f == 'l') { 756 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 757 longflag = 1; 758 ++f; 759 } 760#ifdef HAVE_LONG_LONG 761 else if (f[1] == 'l' && 762 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 763 longlongflag = 1; 764 f += 2; 765 } 766#endif 767 } 768 /* handle the size_t flag. 
*/ 769 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 770 size_tflag = 1; 771 ++f; 772 } 773 if (p_longflag != NULL) 774 *p_longflag = longflag; 775 if (p_longlongflag != NULL) 776 *p_longlongflag = longlongflag; 777 if (p_size_tflag != NULL) 778 *p_size_tflag = size_tflag; 779 return f; 780} 781 782#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 783 784/* size of fixed-size buffer for formatting single arguments */ 785#define ITEM_BUFFER_LEN 21 786/* maximum number of characters required for output of %ld. 21 characters 787 allows for 64-bit integers (in decimal) and an optional sign. */ 788#define MAX_LONG_CHARS 21 789/* maximum number of characters required for output of %lld. 790 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 791 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 792#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 793 794PyObject * 795PyUnicode_FromFormatV(const char *format, va_list vargs) 796{ 797 va_list count; 798 Py_ssize_t callcount = 0; 799 PyObject **callresults = NULL; 800 PyObject **callresult = NULL; 801 Py_ssize_t n = 0; 802 int width = 0; 803 int precision = 0; 804 int zeropad; 805 const char* f; 806 Py_UNICODE *s; 807 PyObject *string; 808 /* used by sprintf */ 809 char buffer[ITEM_BUFFER_LEN+1]; 810 /* use abuffer instead of buffer, if we need more space 811 * (which can happen if there's a format specifier with width). 
*/ 812 char *abuffer = NULL; 813 char *realbuffer; 814 Py_ssize_t abuffersize = 0; 815 char fmt[61]; /* should be enough for %0width.precisionlld */ 816 const char *copy; 817 818 Py_VA_COPY(count, vargs); 819 /* step 1: count the number of %S/%R/%A/%s format specifications 820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 822 * result in an array) */ 823 for (f = format; *f; f++) { 824 if (*f == '%') { 825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 828 ++callcount; 829 } 830 else if (128 <= (unsigned char)*f) { 831 PyErr_Format(PyExc_ValueError, 832 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 833 "string, got a non-ASCII byte: 0x%02x", 834 (unsigned char)*f); 835 return NULL; 836 } 837 } 838 /* step 2: allocate memory for the results of 839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 840 if (callcount) { 841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 842 if (!callresults) { 843 PyErr_NoMemory(); 844 return NULL; 845 } 846 callresult = callresults; 847 } 848 /* step 3: figure out how large a buffer we need */ 849 for (f = format; *f; f++) { 850 if (*f == '%') { 851#ifdef HAVE_LONG_LONG 852 int longlongflag; 853#endif 854 const char* p; 855 856 p = f; 857 f = parse_format_flags(f, &width, NULL, 858 NULL, &longlongflag, NULL); 859 860 switch (*f) { 861 case 'c': 862 { 863#ifndef Py_UNICODE_WIDE 864 int ordinal = va_arg(count, int); 865 if (ordinal > 0xffff) 866 n += 2; 867 else 868 n++; 869#else 870 (void)va_arg(count, int); 871 n++; 872#endif 873 break; 874 } 875 case '%': 876 n++; 877 break; 878 case 'd': case 'u': case 'i': case 'x': 879 (void) va_arg(count, int); 880#ifdef HAVE_LONG_LONG 881 if (longlongflag) { 882 if (width < MAX_LONG_LONG_CHARS) 883 width = MAX_LONG_LONG_CHARS; 884 
} 885 else 886#endif 887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 888 including sign. Decimal takes the most space. This 889 isn't enough for octal. If a width is specified we 890 need more (which we allocate later). */ 891 if (width < MAX_LONG_CHARS) 892 width = MAX_LONG_CHARS; 893 n += width; 894 /* XXX should allow for large precision here too. */ 895 if (abuffersize < width) 896 abuffersize = width; 897 break; 898 case 's': 899 { 900 /* UTF-8 */ 901 const char *s = va_arg(count, const char*); 902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 903 if (!str) 904 goto fail; 905 n += PyUnicode_GET_SIZE(str); 906 /* Remember the str and switch to the next slot */ 907 *callresult++ = str; 908 break; 909 } 910 case 'U': 911 { 912 PyObject *obj = va_arg(count, PyObject *); 913 assert(obj && PyUnicode_Check(obj)); 914 n += PyUnicode_GET_SIZE(obj); 915 break; 916 } 917 case 'V': 918 { 919 PyObject *obj = va_arg(count, PyObject *); 920 const char *str = va_arg(count, const char *); 921 PyObject *str_obj; 922 assert(obj || str); 923 assert(!obj || PyUnicode_Check(obj)); 924 if (obj) { 925 n += PyUnicode_GET_SIZE(obj); 926 *callresult++ = NULL; 927 } 928 else { 929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 930 if (!str_obj) 931 goto fail; 932 n += PyUnicode_GET_SIZE(str_obj); 933 *callresult++ = str_obj; 934 } 935 break; 936 } 937 case 'S': 938 { 939 PyObject *obj = va_arg(count, PyObject *); 940 PyObject *str; 941 assert(obj); 942 str = PyObject_Str(obj); 943 if (!str) 944 goto fail; 945 n += PyUnicode_GET_SIZE(str); 946 /* Remember the str and switch to the next slot */ 947 *callresult++ = str; 948 break; 949 } 950 case 'R': 951 { 952 PyObject *obj = va_arg(count, PyObject *); 953 PyObject *repr; 954 assert(obj); 955 repr = PyObject_Repr(obj); 956 if (!repr) 957 goto fail; 958 n += PyUnicode_GET_SIZE(repr); 959 /* Remember the repr and switch to the next slot */ 960 *callresult++ = repr; 961 break; 962 } 963 case 'A': 964 { 
965 PyObject *obj = va_arg(count, PyObject *); 966 PyObject *ascii; 967 assert(obj); 968 ascii = PyObject_ASCII(obj); 969 if (!ascii) 970 goto fail; 971 n += PyUnicode_GET_SIZE(ascii); 972 /* Remember the repr and switch to the next slot */ 973 *callresult++ = ascii; 974 break; 975 } 976 case 'p': 977 (void) va_arg(count, int); 978 /* maximum 64-bit pointer representation: 979 * 0xffffffffffffffff 980 * so 19 characters is enough. 981 * XXX I count 18 -- what's the extra for? 982 */ 983 n += 19; 984 break; 985 default: 986 /* if we stumble upon an unknown 987 formatting code, copy the rest of 988 the format string to the output 989 string. (we cannot just skip the 990 code, since there's no way to know 991 what's in the argument list) */ 992 n += strlen(p); 993 goto expand; 994 } 995 } else 996 n++; 997 } 998 expand: 999 if (abuffersize > ITEM_BUFFER_LEN) { 1000 /* add 1 for sprintf's trailing null byte */ 1001 abuffer = PyObject_Malloc(abuffersize + 1); 1002 if (!abuffer) { 1003 PyErr_NoMemory(); 1004 goto fail; 1005 } 1006 realbuffer = abuffer; 1007 } 1008 else 1009 realbuffer = buffer; 1010 /* step 4: fill the buffer */ 1011 /* Since we've analyzed how much space we need for the worst case, 1012 we don't have to resize the string. 1013 There can be no errors beyond this point. 
*/ 1014 string = PyUnicode_FromUnicode(NULL, n); 1015 if (!string) 1016 goto fail; 1017 1018 s = PyUnicode_AS_UNICODE(string); 1019 callresult = callresults; 1020 1021 for (f = format; *f; f++) { 1022 if (*f == '%') { 1023 const char* p; 1024 int longflag; 1025 int longlongflag; 1026 int size_tflag; 1027 1028 p = f; 1029 zeropad = (f[1] == '0'); 1030 f = parse_format_flags(f, &width, &precision, 1031 &longflag, &longlongflag, &size_tflag); 1032 1033 switch (*f) { 1034 case 'c': 1035 { 1036 int ordinal = va_arg(vargs, int); 1037#ifndef Py_UNICODE_WIDE 1038 if (ordinal > 0xffff) { 1039 ordinal -= 0x10000; 1040 *s++ = 0xD800 | (ordinal >> 10); 1041 *s++ = 0xDC00 | (ordinal & 0x3FF); 1042 } else 1043#endif 1044 *s++ = ordinal; 1045 break; 1046 } 1047 case 'i': 1048 case 'd': 1049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1050 width, precision, *f); 1051 if (longflag) 1052 sprintf(realbuffer, fmt, va_arg(vargs, long)); 1053#ifdef HAVE_LONG_LONG 1054 else if (longlongflag) 1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1056#endif 1057 else if (size_tflag) 1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1059 else 1060 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1061 appendstring(realbuffer); 1062 break; 1063 case 'u': 1064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1065 width, precision, 'u'); 1066 if (longflag) 1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1068#ifdef HAVE_LONG_LONG 1069 else if (longlongflag) 1070 sprintf(realbuffer, fmt, va_arg(vargs, 1071 unsigned PY_LONG_LONG)); 1072#endif 1073 else if (size_tflag) 1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1075 else 1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1077 appendstring(realbuffer); 1078 break; 1079 case 'x': 1080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1081 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1082 appendstring(realbuffer); 1083 break; 1084 case 's': 1085 { 1086 /* unused, since we already 
have the result */ 1087 (void) va_arg(vargs, char *); 1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1089 PyUnicode_GET_SIZE(*callresult)); 1090 s += PyUnicode_GET_SIZE(*callresult); 1091 /* We're done with the unicode()/repr() => forget it */ 1092 Py_DECREF(*callresult); 1093 /* switch to next unicode()/repr() result */ 1094 ++callresult; 1095 break; 1096 } 1097 case 'U': 1098 { 1099 PyObject *obj = va_arg(vargs, PyObject *); 1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1102 s += size; 1103 break; 1104 } 1105 case 'V': 1106 { 1107 PyObject *obj = va_arg(vargs, PyObject *); 1108 va_arg(vargs, const char *); 1109 if (obj) { 1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1112 s += size; 1113 } else { 1114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1115 PyUnicode_GET_SIZE(*callresult)); 1116 s += PyUnicode_GET_SIZE(*callresult); 1117 Py_DECREF(*callresult); 1118 } 1119 ++callresult; 1120 break; 1121 } 1122 case 'S': 1123 case 'R': 1124 case 'A': 1125 { 1126 Py_UNICODE *ucopy; 1127 Py_ssize_t usize; 1128 Py_ssize_t upos; 1129 /* unused, since we already have the result */ 1130 (void) va_arg(vargs, PyObject *); 1131 ucopy = PyUnicode_AS_UNICODE(*callresult); 1132 usize = PyUnicode_GET_SIZE(*callresult); 1133 for (upos = 0; upos<usize;) 1134 *s++ = ucopy[upos++]; 1135 /* We're done with the unicode()/repr() => forget it */ 1136 Py_DECREF(*callresult); 1137 /* switch to next unicode()/repr() result */ 1138 ++callresult; 1139 break; 1140 } 1141 case 'p': 1142 sprintf(buffer, "%p", va_arg(vargs, void*)); 1143 /* %p is ill-defined: ensure leading 0x. 
*/ 1144 if (buffer[1] == 'X') 1145 buffer[1] = 'x'; 1146 else if (buffer[1] != 'x') { 1147 memmove(buffer+2, buffer, strlen(buffer)+1); 1148 buffer[0] = '0'; 1149 buffer[1] = 'x'; 1150 } 1151 appendstring(buffer); 1152 break; 1153 case '%': 1154 *s++ = '%'; 1155 break; 1156 default: 1157 appendstring(p); 1158 goto end; 1159 } 1160 } 1161 else 1162 *s++ = *f; 1163 } 1164 1165 end: 1166 if (callresults) 1167 PyObject_Free(callresults); 1168 if (abuffer) 1169 PyObject_Free(abuffer); 1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1171 return string; 1172 fail: 1173 if (callresults) { 1174 PyObject **callresult2 = callresults; 1175 while (callresult2 < callresult) { 1176 Py_XDECREF(*callresult2); 1177 ++callresult2; 1178 } 1179 PyObject_Free(callresults); 1180 } 1181 if (abuffer) 1182 PyObject_Free(abuffer); 1183 return NULL; 1184} 1185 1186#undef appendstring 1187 1188PyObject * 1189PyUnicode_FromFormat(const char *format, ...) 1190{ 1191 PyObject* ret; 1192 va_list vargs; 1193 1194#ifdef HAVE_STDARG_PROTOTYPES 1195 va_start(vargs, format); 1196#else 1197 va_start(vargs); 1198#endif 1199 ret = PyUnicode_FromFormatV(format, vargs); 1200 va_end(vargs); 1201 return ret; 1202} 1203 1204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1205 convert a Unicode object to a wide character string. 1206 1207 - If w is NULL: return the number of wide characters (including the nul 1208 character) required to convert the unicode object. Ignore size argument. 1209 1210 - Otherwise: return the number of wide characters (excluding the nul 1211 character) written into w. Write at most size wide characters (including 1212 the nul character). 
*/ 1213static Py_ssize_t 1214unicode_aswidechar(PyUnicodeObject *unicode, 1215 wchar_t *w, 1216 Py_ssize_t size) 1217{ 1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1219 Py_ssize_t res; 1220 if (w != NULL) { 1221 res = PyUnicode_GET_SIZE(unicode); 1222 if (size > res) 1223 size = res + 1; 1224 else 1225 res = size; 1226 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1227 return res; 1228 } 1229 else 1230 return PyUnicode_GET_SIZE(unicode) + 1; 1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 1232 register const Py_UNICODE *u; 1233 const Py_UNICODE *uend; 1234 const wchar_t *worig, *wend; 1235 Py_ssize_t nchar; 1236 1237 u = PyUnicode_AS_UNICODE(unicode); 1238 uend = u + PyUnicode_GET_SIZE(unicode); 1239 if (w != NULL) { 1240 worig = w; 1241 wend = w + size; 1242 while (u != uend && w != wend) { 1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1245 { 1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; 1247 u += 2; 1248 } 1249 else { 1250 *w = *u; 1251 u++; 1252 } 1253 w++; 1254 } 1255 if (w != wend) 1256 *w = L'\0'; 1257 return w - worig; 1258 } 1259 else { 1260 nchar = 1; /* nul character at the end */ 1261 while (u != uend) { 1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1264 u += 2; 1265 else 1266 u++; 1267 nchar++; 1268 } 1269 } 1270 return nchar; 1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 1272 register Py_UNICODE *u, *uend, ordinal; 1273 register Py_ssize_t i; 1274 wchar_t *worig, *wend; 1275 Py_ssize_t nchar; 1276 1277 u = PyUnicode_AS_UNICODE(unicode); 1278 uend = u + PyUnicode_GET_SIZE(u); 1279 if (w != NULL) { 1280 worig = w; 1281 wend = w + size; 1282 while (u != uend && w != wend) { 1283 ordinal = *u; 1284 if (ordinal > 0xffff) { 1285 ordinal -= 0x10000; 1286 *w++ = 0xD800 | (ordinal >> 10); 1287 *w++ = 0xDC00 | (ordinal & 0x3FF); 1288 } 1289 else 1290 *w++ = ordinal; 1291 u++; 1292 } 1293 if (w != wend) 1294 *w = 0; 1295 return w - worig; 1296 } 1297 else { 
1298 nchar = 1; /* nul character */ 1299 while (u != uend) { 1300 if (*u > 0xffff) 1301 nchar += 2; 1302 else 1303 nchar++; 1304 u++; 1305 } 1306 return nchar; 1307 } 1308#else 1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" 1310#endif 1311} 1312 1313Py_ssize_t 1314PyUnicode_AsWideChar(PyObject *unicode, 1315 wchar_t *w, 1316 Py_ssize_t size) 1317{ 1318 if (unicode == NULL) { 1319 PyErr_BadInternalCall(); 1320 return -1; 1321 } 1322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 1323} 1324 1325wchar_t* 1326PyUnicode_AsWideCharString(PyObject *unicode, 1327 Py_ssize_t *size) 1328{ 1329 wchar_t* buffer; 1330 Py_ssize_t buflen; 1331 1332 if (unicode == NULL) { 1333 PyErr_BadInternalCall(); 1334 return NULL; 1335 } 1336 1337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 1338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 1339 PyErr_NoMemory(); 1340 return NULL; 1341 } 1342 1343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 1344 if (buffer == NULL) { 1345 PyErr_NoMemory(); 1346 return NULL; 1347 } 1348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 1349 if (size != NULL) 1350 *size = buflen; 1351 return buffer; 1352} 1353 1354#endif 1355 1356PyObject * 1357PyUnicode_FromOrdinal(int ordinal) 1358{ 1359 Py_UNICODE s[2]; 1360 1361 if (ordinal < 0 || ordinal > 0x10ffff) { 1362 PyErr_SetString(PyExc_ValueError, 1363 "chr() arg not in range(0x110000)"); 1364 return NULL; 1365 } 1366 1367#ifndef Py_UNICODE_WIDE 1368 if (ordinal > 0xffff) { 1369 ordinal -= 0x10000; 1370 s[0] = 0xD800 | (ordinal >> 10); 1371 s[1] = 0xDC00 | (ordinal & 0x3FF); 1372 return PyUnicode_FromUnicode(s, 2); 1373 } 1374#endif 1375 1376 s[0] = (Py_UNICODE)ordinal; 1377 return PyUnicode_FromUnicode(s, 1); 1378} 1379 1380PyObject * 1381PyUnicode_FromObject(register PyObject *obj) 1382{ 1383 /* XXX Perhaps we should make this API an alias of 1384 PyObject_Str() instead ?! 
*/ 1385 if (PyUnicode_CheckExact(obj)) { 1386 Py_INCREF(obj); 1387 return obj; 1388 } 1389 if (PyUnicode_Check(obj)) { 1390 /* For a Unicode subtype that's not a Unicode object, 1391 return a true Unicode object with the same data. */ 1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1393 PyUnicode_GET_SIZE(obj)); 1394 } 1395 PyErr_Format(PyExc_TypeError, 1396 "Can't convert '%.100s' object to str implicitly", 1397 Py_TYPE(obj)->tp_name); 1398 return NULL; 1399} 1400 1401PyObject * 1402PyUnicode_FromEncodedObject(register PyObject *obj, 1403 const char *encoding, 1404 const char *errors) 1405{ 1406 Py_buffer buffer; 1407 PyObject *v; 1408 1409 if (obj == NULL) { 1410 PyErr_BadInternalCall(); 1411 return NULL; 1412 } 1413 1414 /* Decoding bytes objects is the most common case and should be fast */ 1415 if (PyBytes_Check(obj)) { 1416 if (PyBytes_GET_SIZE(obj) == 0) { 1417 Py_INCREF(unicode_empty); 1418 v = (PyObject *) unicode_empty; 1419 } 1420 else { 1421 v = PyUnicode_Decode( 1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1423 encoding, errors); 1424 } 1425 return v; 1426 } 1427 1428 if (PyUnicode_Check(obj)) { 1429 PyErr_SetString(PyExc_TypeError, 1430 "decoding str is not supported"); 1431 return NULL; 1432 } 1433 1434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1436 PyErr_Format(PyExc_TypeError, 1437 "coercing to str: need bytes, bytearray " 1438 "or buffer-like object, %.80s found", 1439 Py_TYPE(obj)->tp_name); 1440 return NULL; 1441 } 1442 1443 if (buffer.len == 0) { 1444 Py_INCREF(unicode_empty); 1445 v = (PyObject *) unicode_empty; 1446 } 1447 else 1448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1449 1450 PyBuffer_Release(&buffer); 1451 return v; 1452} 1453 1454/* Convert encoding to lower case and replace '_' with '-' in order to 1455 catch e.g. UTF_8. 
Return 0 on error (encoding is longer than lower_len-1), 1456 1 on success. */ 1457static int 1458normalize_encoding(const char *encoding, 1459 char *lower, 1460 size_t lower_len) 1461{ 1462 const char *e; 1463 char *l; 1464 char *l_end; 1465 1466 e = encoding; 1467 l = lower; 1468 l_end = &lower[lower_len - 1]; 1469 while (*e) { 1470 if (l == l_end) 1471 return 0; 1472 if (Py_ISUPPER(*e)) { 1473 *l++ = Py_TOLOWER(*e++); 1474 } 1475 else if (*e == '_') { 1476 *l++ = '-'; 1477 e++; 1478 } 1479 else { 1480 *l++ = *e++; 1481 } 1482 } 1483 *l = '\0'; 1484 return 1; 1485} 1486 1487PyObject * 1488PyUnicode_Decode(const char *s, 1489 Py_ssize_t size, 1490 const char *encoding, 1491 const char *errors) 1492{ 1493 PyObject *buffer = NULL, *unicode; 1494 Py_buffer info; 1495 char lower[11]; /* Enough for any encoding shortcut */ 1496 1497 if (encoding == NULL) 1498 return PyUnicode_DecodeUTF8(s, size, errors); 1499 1500 /* Shortcuts for common default encodings */ 1501 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1502 if ((strcmp(lower, "utf-8") == 0) || 1503 (strcmp(lower, "utf8") == 0)) 1504 return PyUnicode_DecodeUTF8(s, size, errors); 1505 else if ((strcmp(lower, "latin-1") == 0) || 1506 (strcmp(lower, "latin1") == 0) || 1507 (strcmp(lower, "iso-8859-1") == 0)) 1508 return PyUnicode_DecodeLatin1(s, size, errors); 1509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1510 else if (strcmp(lower, "mbcs") == 0) 1511 return PyUnicode_DecodeMBCS(s, size, errors); 1512#endif 1513 else if (strcmp(lower, "ascii") == 0) 1514 return PyUnicode_DecodeASCII(s, size, errors); 1515 else if (strcmp(lower, "utf-16") == 0) 1516 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1517 else if (strcmp(lower, "utf-32") == 0) 1518 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1519 } 1520 1521 /* Decode via the codec registry */ 1522 buffer = NULL; 1523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1524 goto onError; 1525 buffer = 
PyMemoryView_FromBuffer(&info); 1526 if (buffer == NULL) 1527 goto onError; 1528 unicode = PyCodec_Decode(buffer, encoding, errors); 1529 if (unicode == NULL) 1530 goto onError; 1531 if (!PyUnicode_Check(unicode)) { 1532 PyErr_Format(PyExc_TypeError, 1533 "decoder did not return a str object (type=%.400s)", 1534 Py_TYPE(unicode)->tp_name); 1535 Py_DECREF(unicode); 1536 goto onError; 1537 } 1538 Py_DECREF(buffer); 1539 return unicode; 1540 1541 onError: 1542 Py_XDECREF(buffer); 1543 return NULL; 1544} 1545 1546PyObject * 1547PyUnicode_AsDecodedObject(PyObject *unicode, 1548 const char *encoding, 1549 const char *errors) 1550{ 1551 PyObject *v; 1552 1553 if (!PyUnicode_Check(unicode)) { 1554 PyErr_BadArgument(); 1555 goto onError; 1556 } 1557 1558 if (encoding == NULL) 1559 encoding = PyUnicode_GetDefaultEncoding(); 1560 1561 /* Decode via the codec registry */ 1562 v = PyCodec_Decode(unicode, encoding, errors); 1563 if (v == NULL) 1564 goto onError; 1565 return v; 1566 1567 onError: 1568 return NULL; 1569} 1570 1571PyObject * 1572PyUnicode_AsDecodedUnicode(PyObject *unicode, 1573 const char *encoding, 1574 const char *errors) 1575{ 1576 PyObject *v; 1577 1578 if (!PyUnicode_Check(unicode)) { 1579 PyErr_BadArgument(); 1580 goto onError; 1581 } 1582 1583 if (encoding == NULL) 1584 encoding = PyUnicode_GetDefaultEncoding(); 1585 1586 /* Decode via the codec registry */ 1587 v = PyCodec_Decode(unicode, encoding, errors); 1588 if (v == NULL) 1589 goto onError; 1590 if (!PyUnicode_Check(v)) { 1591 PyErr_Format(PyExc_TypeError, 1592 "decoder did not return a str object (type=%.400s)", 1593 Py_TYPE(v)->tp_name); 1594 Py_DECREF(v); 1595 goto onError; 1596 } 1597 return v; 1598 1599 onError: 1600 return NULL; 1601} 1602 1603PyObject * 1604PyUnicode_Encode(const Py_UNICODE *s, 1605 Py_ssize_t size, 1606 const char *encoding, 1607 const char *errors) 1608{ 1609 PyObject *v, *unicode; 1610 1611 unicode = PyUnicode_FromUnicode(s, size); 1612 if (unicode == NULL) 1613 return NULL; 
1614 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1615 Py_DECREF(unicode); 1616 return v; 1617} 1618 1619PyObject * 1620PyUnicode_AsEncodedObject(PyObject *unicode, 1621 const char *encoding, 1622 const char *errors) 1623{ 1624 PyObject *v; 1625 1626 if (!PyUnicode_Check(unicode)) { 1627 PyErr_BadArgument(); 1628 goto onError; 1629 } 1630 1631 if (encoding == NULL) 1632 encoding = PyUnicode_GetDefaultEncoding(); 1633 1634 /* Encode via the codec registry */ 1635 v = PyCodec_Encode(unicode, encoding, errors); 1636 if (v == NULL) 1637 goto onError; 1638 return v; 1639 1640 onError: 1641 return NULL; 1642} 1643 1644PyObject * 1645PyUnicode_EncodeFSDefault(PyObject *unicode) 1646{ 1647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1649 PyUnicode_GET_SIZE(unicode), 1650 NULL); 1651#elif defined(__APPLE__) 1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1653 PyUnicode_GET_SIZE(unicode), 1654 "surrogateescape"); 1655#else 1656 if (Py_FileSystemDefaultEncoding) { 1657 return PyUnicode_AsEncodedString(unicode, 1658 Py_FileSystemDefaultEncoding, 1659 "surrogateescape"); 1660 } 1661 else { 1662 /* locale encoding with surrogateescape */ 1663 wchar_t *wchar; 1664 char *bytes; 1665 PyObject *bytes_obj; 1666 size_t error_pos; 1667 1668 wchar = PyUnicode_AsWideCharString(unicode, NULL); 1669 if (wchar == NULL) 1670 return NULL; 1671 bytes = _Py_wchar2char(wchar, &error_pos); 1672 if (bytes == NULL) { 1673 if (error_pos != (size_t)-1) { 1674 char *errmsg = strerror(errno); 1675 PyObject *exc = NULL; 1676 if (errmsg == NULL) 1677 errmsg = "Py_wchar2char() failed"; 1678 raise_encode_exception(&exc, 1679 "filesystemencoding", 1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 1681 error_pos, error_pos+1, 1682 errmsg); 1683 Py_XDECREF(exc); 1684 } 1685 else 1686 PyErr_NoMemory(); 1687 PyMem_Free(wchar); 1688 return NULL; 1689 } 1690 PyMem_Free(wchar); 1691 1692 bytes_obj 
= PyBytes_FromString(bytes); 1693 PyMem_Free(bytes); 1694 return bytes_obj; 1695 } 1696#endif 1697} 1698 1699PyObject * 1700PyUnicode_AsEncodedString(PyObject *unicode, 1701 const char *encoding, 1702 const char *errors) 1703{ 1704 PyObject *v; 1705 char lower[11]; /* Enough for any encoding shortcut */ 1706 1707 if (!PyUnicode_Check(unicode)) { 1708 PyErr_BadArgument(); 1709 return NULL; 1710 } 1711 1712 if (encoding == NULL) 1713 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1714 PyUnicode_GET_SIZE(unicode), 1715 errors); 1716 1717 /* Shortcuts for common default encodings */ 1718 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1719 if ((strcmp(lower, "utf-8") == 0) || 1720 (strcmp(lower, "utf8") == 0)) 1721 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1722 PyUnicode_GET_SIZE(unicode), 1723 errors); 1724 else if ((strcmp(lower, "latin-1") == 0) || 1725 (strcmp(lower, "latin1") == 0) || 1726 (strcmp(lower, "iso-8859-1") == 0)) 1727 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1728 PyUnicode_GET_SIZE(unicode), 1729 errors); 1730#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1731 else if (strcmp(lower, "mbcs") == 0) 1732 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1733 PyUnicode_GET_SIZE(unicode), 1734 errors); 1735#endif 1736 else if (strcmp(lower, "ascii") == 0) 1737 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1738 PyUnicode_GET_SIZE(unicode), 1739 errors); 1740 } 1741 1742 /* Encode via the codec registry */ 1743 v = PyCodec_Encode(unicode, encoding, errors); 1744 if (v == NULL) 1745 return NULL; 1746 1747 /* The normal path */ 1748 if (PyBytes_Check(v)) 1749 return v; 1750 1751 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1752 if (PyByteArray_Check(v)) { 1753 int error; 1754 PyObject *b; 1755 1756 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1757 "encoder %s returned bytearray instead of bytes", 1758 encoding); 1759 if (error) { 1760 
Py_DECREF(v); 1761 return NULL; 1762 } 1763 1764 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1765 Py_DECREF(v); 1766 return b; 1767 } 1768 1769 PyErr_Format(PyExc_TypeError, 1770 "encoder did not return a bytes object (type=%.400s)", 1771 Py_TYPE(v)->tp_name); 1772 Py_DECREF(v); 1773 return NULL; 1774} 1775 1776PyObject * 1777PyUnicode_AsEncodedUnicode(PyObject *unicode, 1778 const char *encoding, 1779 const char *errors) 1780{ 1781 PyObject *v; 1782 1783 if (!PyUnicode_Check(unicode)) { 1784 PyErr_BadArgument(); 1785 goto onError; 1786 } 1787 1788 if (encoding == NULL) 1789 encoding = PyUnicode_GetDefaultEncoding(); 1790 1791 /* Encode via the codec registry */ 1792 v = PyCodec_Encode(unicode, encoding, errors); 1793 if (v == NULL) 1794 goto onError; 1795 if (!PyUnicode_Check(v)) { 1796 PyErr_Format(PyExc_TypeError, 1797 "encoder did not return an str object (type=%.400s)", 1798 Py_TYPE(v)->tp_name); 1799 Py_DECREF(v); 1800 goto onError; 1801 } 1802 return v; 1803 1804 onError: 1805 return NULL; 1806} 1807 1808PyObject * 1809_PyUnicode_AsDefaultEncodedString(PyObject *unicode) 1810{ 1811 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1812 if (v) 1813 return v; 1814 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1815 PyUnicode_GET_SIZE(unicode), 1816 NULL); 1817 if (!v) 1818 return NULL; 1819 ((PyUnicodeObject *)unicode)->defenc = v; 1820 return v; 1821} 1822 1823PyObject* 1824PyUnicode_DecodeFSDefault(const char *s) { 1825 Py_ssize_t size = (Py_ssize_t)strlen(s); 1826 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1827} 1828 1829PyObject* 1830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1831{ 1832#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1833 return PyUnicode_DecodeMBCS(s, size, NULL); 1834#elif defined(__APPLE__) 1835 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1836#else 1837 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1838 can be undefined. 
If it is case, decode using UTF-8. The following assumes 1839 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1840 bootstrapping process where the codecs aren't ready yet. 1841 */ 1842 if (Py_FileSystemDefaultEncoding) { 1843 return PyUnicode_Decode(s, size, 1844 Py_FileSystemDefaultEncoding, 1845 "surrogateescape"); 1846 } 1847 else { 1848 /* locale encoding with surrogateescape */ 1849 wchar_t *wchar; 1850 PyObject *unicode; 1851 size_t len; 1852 1853 if (s[size] != '\0' || size != strlen(s)) { 1854 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1855 return NULL; 1856 } 1857 1858 wchar = _Py_char2wchar(s, &len); 1859 if (wchar == NULL) 1860 return PyErr_NoMemory(); 1861 1862 unicode = PyUnicode_FromWideChar(wchar, len); 1863 PyMem_Free(wchar); 1864 return unicode; 1865 } 1866#endif 1867} 1868 1869 1870int 1871PyUnicode_FSConverter(PyObject* arg, void* addr) 1872{ 1873 PyObject *output = NULL; 1874 Py_ssize_t size; 1875 void *data; 1876 if (arg == NULL) { 1877 Py_DECREF(*(PyObject**)addr); 1878 return 1; 1879 } 1880 if (PyBytes_Check(arg)) { 1881 output = arg; 1882 Py_INCREF(output); 1883 } 1884 else { 1885 arg = PyUnicode_FromObject(arg); 1886 if (!arg) 1887 return 0; 1888 output = PyUnicode_EncodeFSDefault(arg); 1889 Py_DECREF(arg); 1890 if (!output) 1891 return 0; 1892 if (!PyBytes_Check(output)) { 1893 Py_DECREF(output); 1894 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1895 return 0; 1896 } 1897 } 1898 size = PyBytes_GET_SIZE(output); 1899 data = PyBytes_AS_STRING(output); 1900 if (size != strlen(data)) { 1901 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1902 Py_DECREF(output); 1903 return 0; 1904 } 1905 *(PyObject**)addr = output; 1906 return Py_CLEANUP_SUPPORTED; 1907} 1908 1909 1910int 1911PyUnicode_FSDecoder(PyObject* arg, void* addr) 1912{ 1913 PyObject *output = NULL; 1914 Py_ssize_t size; 1915 void *data; 1916 if (arg == NULL) { 1917 Py_DECREF(*(PyObject**)addr); 1918 return 
1; 1919 } 1920 if (PyUnicode_Check(arg)) { 1921 output = arg; 1922 Py_INCREF(output); 1923 } 1924 else { 1925 arg = PyBytes_FromObject(arg); 1926 if (!arg) 1927 return 0; 1928 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1929 PyBytes_GET_SIZE(arg)); 1930 Py_DECREF(arg); 1931 if (!output) 1932 return 0; 1933 if (!PyUnicode_Check(output)) { 1934 Py_DECREF(output); 1935 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1936 return 0; 1937 } 1938 } 1939 size = PyUnicode_GET_SIZE(output); 1940 data = PyUnicode_AS_UNICODE(output); 1941 if (size != Py_UNICODE_strlen(data)) { 1942 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1943 Py_DECREF(output); 1944 return 0; 1945 } 1946 *(PyObject**)addr = output; 1947 return Py_CLEANUP_SUPPORTED; 1948} 1949 1950 1951char* 1952_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1953{ 1954 PyObject *bytes; 1955 if (!PyUnicode_Check(unicode)) { 1956 PyErr_BadArgument(); 1957 return NULL; 1958 } 1959 bytes = _PyUnicode_AsDefaultEncodedString(unicode); 1960 if (bytes == NULL) 1961 return NULL; 1962 if (psize != NULL) 1963 *psize = PyBytes_GET_SIZE(bytes); 1964 return PyBytes_AS_STRING(bytes); 1965} 1966 1967char* 1968_PyUnicode_AsString(PyObject *unicode) 1969{ 1970 return _PyUnicode_AsStringAndSize(unicode, NULL); 1971} 1972 1973Py_UNICODE * 1974PyUnicode_AsUnicode(PyObject *unicode) 1975{ 1976 if (!PyUnicode_Check(unicode)) { 1977 PyErr_BadArgument(); 1978 goto onError; 1979 } 1980 return PyUnicode_AS_UNICODE(unicode); 1981 1982 onError: 1983 return NULL; 1984} 1985 1986Py_ssize_t 1987PyUnicode_GetSize(PyObject *unicode) 1988{ 1989 if (!PyUnicode_Check(unicode)) { 1990 PyErr_BadArgument(); 1991 goto onError; 1992 } 1993 return PyUnicode_GET_SIZE(unicode); 1994 1995 onError: 1996 return -1; 1997} 1998 1999const char * 2000PyUnicode_GetDefaultEncoding(void) 2001{ 2002 return "utf-8"; 2003} 2004 2005/* create or adjust a UnicodeDecodeError */ 2006static void 
2007make_decode_exception(PyObject **exceptionObject, 2008 const char *encoding, 2009 const char *input, Py_ssize_t length, 2010 Py_ssize_t startpos, Py_ssize_t endpos, 2011 const char *reason) 2012{ 2013 if (*exceptionObject == NULL) { 2014 *exceptionObject = PyUnicodeDecodeError_Create( 2015 encoding, input, length, startpos, endpos, reason); 2016 } 2017 else { 2018 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 2019 goto onError; 2020 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 2021 goto onError; 2022 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 2023 goto onError; 2024 } 2025 return; 2026 2027onError: 2028 Py_DECREF(*exceptionObject); 2029 *exceptionObject = NULL; 2030} 2031 2032/* error handling callback helper: 2033 build arguments, call the callback and check the arguments, 2034 if no exception occurred, copy the replacement to the output 2035 and adjust various state variables. 2036 return 0 on success, -1 on error 2037*/ 2038 2039static int 2040unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 2041 const char *encoding, const char *reason, 2042 const char **input, const char **inend, Py_ssize_t *startinpos, 2043 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 2044 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 2045{ 2046 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 2047 2048 PyObject *restuple = NULL; 2049 PyObject *repunicode = NULL; 2050 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 2051 Py_ssize_t insize; 2052 Py_ssize_t requiredsize; 2053 Py_ssize_t newpos; 2054 Py_UNICODE *repptr; 2055 PyObject *inputobj = NULL; 2056 Py_ssize_t repsize; 2057 int res = -1; 2058 2059 if (*errorHandler == NULL) { 2060 *errorHandler = PyCodec_LookupError(errors); 2061 if (*errorHandler == NULL) 2062 goto onError; 2063 } 2064 2065 make_decode_exception(exceptionObject, 2066 encoding, 2067 *input, *inend - *input, 
2068 *startinpos, *endinpos, 2069 reason); 2070 if (*exceptionObject == NULL) 2071 goto onError; 2072 2073 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 2074 if (restuple == NULL) 2075 goto onError; 2076 if (!PyTuple_Check(restuple)) { 2077 PyErr_SetString(PyExc_TypeError, &argparse[4]); 2078 goto onError; 2079 } 2080 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 2081 goto onError; 2082 2083 /* Copy back the bytes variables, which might have been modified by the 2084 callback */ 2085 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 2086 if (!inputobj) 2087 goto onError; 2088 if (!PyBytes_Check(inputobj)) { 2089 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 2090 } 2091 *input = PyBytes_AS_STRING(inputobj); 2092 insize = PyBytes_GET_SIZE(inputobj); 2093 *inend = *input + insize; 2094 /* we can DECREF safely, as the exception has another reference, 2095 so the object won't go away. */ 2096 Py_DECREF(inputobj); 2097 2098 if (newpos<0) 2099 newpos = insize+newpos; 2100 if (newpos<0 || newpos>insize) { 2101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 2102 goto onError; 2103 } 2104 2105 /* need more space? 
(at least enough for what we 2106 have+the replacement+the rest of the string (starting 2107 at the new input position), so we won't have to check space 2108 when there are no errors in the rest of the string) */ 2109 repptr = PyUnicode_AS_UNICODE(repunicode); 2110 repsize = PyUnicode_GET_SIZE(repunicode); 2111 requiredsize = *outpos + repsize + insize-newpos; 2112 if (requiredsize > outsize) { 2113 if (requiredsize<2*outsize) 2114 requiredsize = 2*outsize; 2115 if (_PyUnicode_Resize(output, requiredsize) < 0) 2116 goto onError; 2117 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 2118 } 2119 *endinpos = newpos; 2120 *inptr = *input + newpos; 2121 Py_UNICODE_COPY(*outptr, repptr, repsize); 2122 *outptr += repsize; 2123 *outpos += repsize; 2124 2125 /* we made it! */ 2126 res = 0; 2127 2128 onError: 2129 Py_XDECREF(restuple); 2130 return res; 2131} 2132 2133/* --- UTF-7 Codec -------------------------------------------------------- */ 2134 2135/* See RFC2152 for details. We encode conservatively and decode liberally. */ 2136 2137/* Three simple macros defining base-64. */ 2138 2139/* Is c a base-64 character? */ 2140 2141#define IS_BASE64(c) \ 2142 (((c) >= 'A' && (c) <= 'Z') || \ 2143 ((c) >= 'a' && (c) <= 'z') || \ 2144 ((c) >= '0' && (c) <= '9') || \ 2145 (c) == '+' || (c) == '/') 2146 2147/* given that c is a base-64 character, what is its base-64 value? */ 2148 2149#define FROM_BASE64(c) \ 2150 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 2151 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 2152 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 2153 (c) == '+' ? 62 : 63) 2154 2155/* What is the base-64 character of the bottom 6 bits of n? */ 2156 2157#define TO_BASE64(n) \ 2158 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 2159 2160/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 2161 * decoded as itself. 
We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 */

/* Note: c is evaluated several times and used as an index into
 * utf7_category, hence the explicit 0 < c < 128 range test first. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decode: forwards to the stateful decoder with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside a '+...' run */
    Py_UNICODE *shiftOutStart;       /* output position where that run began */
    unsigned int base64bits = 0;     /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0;  /* accumulated, not yet emitted bits */
    Py_UNICODE surrogate = 0;        /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output is allocated with one slot per input byte. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                        }
                        else {
                            surrogate = 0;
                            errmsg = "second surrogate missing";
                            goto utf7Error;
                        }
                    }
                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
                        errmsg = "unexpected second surrogate";
                        goto utf7Error;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    errmsg = "second surrogate missing at end of shift sequence";
                    goto utf7Error;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts: used for error reporting
               and as the position at which a shift sequence began. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
startinpos = s-starts; 2358 s++; 2359 errmsg = "unexpected special character"; 2360 goto utf7Error; 2361 } 2362 continue; 2363utf7Error: 2364 outpos = p-PyUnicode_AS_UNICODE(unicode); 2365 endinpos = s-starts; 2366 if (unicode_decode_call_errorhandler( 2367 errors, &errorHandler, 2368 "utf7", errmsg, 2369 &starts, &e, &startinpos, &endinpos, &exc, &s, 2370 &unicode, &outpos, &p)) 2371 goto onError; 2372 } 2373 2374 /* end of string */ 2375 2376 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 2377 /* if we're in an inconsistent state, that's an error */ 2378 if (surrogate || 2379 (base64bits >= 6) || 2380 (base64bits > 0 && base64buffer != 0)) { 2381 outpos = p-PyUnicode_AS_UNICODE(unicode); 2382 endinpos = size; 2383 if (unicode_decode_call_errorhandler( 2384 errors, &errorHandler, 2385 "utf7", "unterminated shift sequence", 2386 &starts, &e, &startinpos, &endinpos, &exc, &s, 2387 &unicode, &outpos, &p)) 2388 goto onError; 2389 if (s < e) 2390 goto restart; 2391 } 2392 } 2393 2394 /* return state */ 2395 if (consumed) { 2396 if (inShift) { 2397 p = shiftOutStart; /* back off output */ 2398 *consumed = startinpos; 2399 } 2400 else { 2401 *consumed = s-starts; 2402 } 2403 } 2404 2405 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 2406 goto onError; 2407 2408 Py_XDECREF(errorHandler); 2409 Py_XDECREF(exc); 2410 return (PyObject *)unicode; 2411 2412 onError: 2413 Py_XDECREF(errorHandler); 2414 Py_XDECREF(exc); 2415 Py_DECREF(unicode); 2416 return NULL; 2417} 2418 2419 2420PyObject * 2421PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2422 Py_ssize_t size, 2423 int base64SetO, 2424 int base64WhiteSpace, 2425 const char *errors) 2426{ 2427 PyObject *v; 2428 /* It might be possible to tighten this worst case */ 2429 Py_ssize_t allocated = 8 * size; 2430 int inShift = 0; 2431 Py_ssize_t i = 0; 2432 unsigned int base64bits = 0; 2433 unsigned long base64buffer = 0; 2434 char * out; 2435 char * start; 2436 2437 if (size == 0) 2438 
return PyBytes_FromStringAndSize(NULL, 0); 2439 2440 if (allocated / 8 != size) 2441 return PyErr_NoMemory(); 2442 2443 v = PyBytes_FromStringAndSize(NULL, allocated); 2444 if (v == NULL) 2445 return NULL; 2446 2447 start = out = PyBytes_AS_STRING(v); 2448 for (;i < size; ++i) { 2449 Py_UNICODE ch = s[i]; 2450 2451 if (inShift) { 2452 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2453 /* shifting out */ 2454 if (base64bits) { /* output remaining bits */ 2455 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2456 base64buffer = 0; 2457 base64bits = 0; 2458 } 2459 inShift = 0; 2460 /* Characters not in the BASE64 set implicitly unshift the sequence 2461 so no '-' is required, except if the character is itself a '-' */ 2462 if (IS_BASE64(ch) || ch == '-') { 2463 *out++ = '-'; 2464 } 2465 *out++ = (char) ch; 2466 } 2467 else { 2468 goto encode_char; 2469 } 2470 } 2471 else { /* not in a shift sequence */ 2472 if (ch == '+') { 2473 *out++ = '+'; 2474 *out++ = '-'; 2475 } 2476 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2477 *out++ = (char) ch; 2478 } 2479 else { 2480 *out++ = '+'; 2481 inShift = 1; 2482 goto encode_char; 2483 } 2484 } 2485 continue; 2486encode_char: 2487#ifdef Py_UNICODE_WIDE 2488 if (ch >= 0x10000) { 2489 /* code first surrogate */ 2490 base64bits += 16; 2491 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2492 while (base64bits >= 6) { 2493 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2494 base64bits -= 6; 2495 } 2496 /* prepare second surrogate */ 2497 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2498 } 2499#endif 2500 base64bits += 16; 2501 base64buffer = (base64buffer << 16) | ch; 2502 while (base64bits >= 6) { 2503 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2504 base64bits -= 6; 2505 } 2506 } 2507 if (base64bits) 2508 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2509 if (inShift) 2510 *out++ = '-'; 2511 if (_PyBytes_Resize(&v, out - start) < 0) 2512 return NULL; 2513 return v; 
2514} 2515 2516#undef IS_BASE64 2517#undef FROM_BASE64 2518#undef TO_BASE64 2519#undef DECODE_DIRECT 2520#undef ENCODE_DIRECT 2521 2522/* --- UTF-8 Codec -------------------------------------------------------- */ 2523 2524static 2525char utf8_code_length[256] = { 2526 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2527 illegal prefix. See RFC 3629 for details */ 2528 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2529 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2530 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2531 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2532 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2533 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2534 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2535 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2536 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2537 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2538 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2540 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2541 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2542 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2543 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2544}; 2545 2546PyObject * 2547PyUnicode_DecodeUTF8(const char *s, 2548 Py_ssize_t size, 2549 const char *errors) 2550{ 2551 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2552} 2553 2554/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2555#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2556 2557/* Mask to quickly check whether a C 'long' contains a 2558 non-ASCII, UTF8-encoded char. */ 2559#if (SIZEOF_LONG == 8) 2560# define ASCII_CHAR_MASK 0x8080808080808080L 2561#elif (SIZEOF_LONG == 4) 2562# define ASCII_CHAR_MASK 0x80808080L 2563#else 2564# error C 'long' size should be either 4 or 8! 
2565#endif 2566 2567PyObject * 2568PyUnicode_DecodeUTF8Stateful(const char *s, 2569 Py_ssize_t size, 2570 const char *errors, 2571 Py_ssize_t *consumed) 2572{ 2573 const char *starts = s; 2574 int n; 2575 int k; 2576 Py_ssize_t startinpos; 2577 Py_ssize_t endinpos; 2578 Py_ssize_t outpos; 2579 const char *e, *aligned_end; 2580 PyUnicodeObject *unicode; 2581 Py_UNICODE *p; 2582 const char *errmsg = ""; 2583 PyObject *errorHandler = NULL; 2584 PyObject *exc = NULL; 2585 2586 /* Note: size will always be longer than the resulting Unicode 2587 character count */ 2588 unicode = _PyUnicode_New(size); 2589 if (!unicode) 2590 return NULL; 2591 if (size == 0) { 2592 if (consumed) 2593 *consumed = 0; 2594 return (PyObject *)unicode; 2595 } 2596 2597 /* Unpack UTF-8 encoded data */ 2598 p = unicode->str; 2599 e = s + size; 2600 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2601 2602 while (s < e) { 2603 Py_UCS4 ch = (unsigned char)*s; 2604 2605 if (ch < 0x80) { 2606 /* Fast path for runs of ASCII characters. Given that common UTF-8 2607 input will consist of an overwhelming majority of ASCII 2608 characters, we try to optimize for this case by checking 2609 as many characters as a C 'long' can contain. 2610 First, check if we can do an aligned read, as most CPUs have 2611 a penalty for unaligned reads. 2612 */ 2613 if (!((size_t) s & LONG_PTR_MASK)) { 2614 /* Help register allocation */ 2615 register const char *_s = s; 2616 register Py_UNICODE *_p = p; 2617 while (_s < aligned_end) { 2618 /* Read a whole long at a time (either 4 or 8 bytes), 2619 and do a fast unrolled copy if it only contains ASCII 2620 characters. 
*/ 2621 unsigned long data = *(unsigned long *) _s; 2622 if (data & ASCII_CHAR_MASK) 2623 break; 2624 _p[0] = (unsigned char) _s[0]; 2625 _p[1] = (unsigned char) _s[1]; 2626 _p[2] = (unsigned char) _s[2]; 2627 _p[3] = (unsigned char) _s[3]; 2628#if (SIZEOF_LONG == 8) 2629 _p[4] = (unsigned char) _s[4]; 2630 _p[5] = (unsigned char) _s[5]; 2631 _p[6] = (unsigned char) _s[6]; 2632 _p[7] = (unsigned char) _s[7]; 2633#endif 2634 _s += SIZEOF_LONG; 2635 _p += SIZEOF_LONG; 2636 } 2637 s = _s; 2638 p = _p; 2639 if (s == e) 2640 break; 2641 ch = (unsigned char)*s; 2642 } 2643 } 2644 2645 if (ch < 0x80) { 2646 *p++ = (Py_UNICODE)ch; 2647 s++; 2648 continue; 2649 } 2650 2651 n = utf8_code_length[ch]; 2652 2653 if (s + n > e) { 2654 if (consumed) 2655 break; 2656 else { 2657 errmsg = "unexpected end of data"; 2658 startinpos = s-starts; 2659 endinpos = startinpos+1; 2660 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2661 endinpos++; 2662 goto utf8Error; 2663 } 2664 } 2665 2666 switch (n) { 2667 2668 case 0: 2669 errmsg = "invalid start byte"; 2670 startinpos = s-starts; 2671 endinpos = startinpos+1; 2672 goto utf8Error; 2673 2674 case 1: 2675 errmsg = "internal error"; 2676 startinpos = s-starts; 2677 endinpos = startinpos+1; 2678 goto utf8Error; 2679 2680 case 2: 2681 if ((s[1] & 0xc0) != 0x80) { 2682 errmsg = "invalid continuation byte"; 2683 startinpos = s-starts; 2684 endinpos = startinpos + 1; 2685 goto utf8Error; 2686 } 2687 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2688 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2689 *p++ = (Py_UNICODE)ch; 2690 break; 2691 2692 case 3: 2693 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2694 will result in surrogates in range d800-dfff. Surrogates are 2695 not valid UTF-8 so they are rejected. 
2696 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2697 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2698 if ((s[1] & 0xc0) != 0x80 || 2699 (s[2] & 0xc0) != 0x80 || 2700 ((unsigned char)s[0] == 0xE0 && 2701 (unsigned char)s[1] < 0xA0) || 2702 ((unsigned char)s[0] == 0xED && 2703 (unsigned char)s[1] > 0x9F)) { 2704 errmsg = "invalid continuation byte"; 2705 startinpos = s-starts; 2706 endinpos = startinpos + 1; 2707 2708 /* if s[1] first two bits are 1 and 0, then the invalid 2709 continuation byte is s[2], so increment endinpos by 1, 2710 if not, s[1] is invalid and endinpos doesn't need to 2711 be incremented. */ 2712 if ((s[1] & 0xC0) == 0x80) 2713 endinpos++; 2714 goto utf8Error; 2715 } 2716 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2717 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2718 *p++ = (Py_UNICODE)ch; 2719 break; 2720 2721 case 4: 2722 if ((s[1] & 0xc0) != 0x80 || 2723 (s[2] & 0xc0) != 0x80 || 2724 (s[3] & 0xc0) != 0x80 || 2725 ((unsigned char)s[0] == 0xF0 && 2726 (unsigned char)s[1] < 0x90) || 2727 ((unsigned char)s[0] == 0xF4 && 2728 (unsigned char)s[1] > 0x8F)) { 2729 errmsg = "invalid continuation byte"; 2730 startinpos = s-starts; 2731 endinpos = startinpos + 1; 2732 if ((s[1] & 0xC0) == 0x80) { 2733 endinpos++; 2734 if ((s[2] & 0xC0) == 0x80) 2735 endinpos++; 2736 } 2737 goto utf8Error; 2738 } 2739 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2740 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2741 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2742 2743#ifdef Py_UNICODE_WIDE 2744 *p++ = (Py_UNICODE)ch; 2745#else 2746 /* compute and append the two surrogates: */ 2747 2748 /* translate from 10000..10FFFF to 0..FFFF */ 2749 ch -= 0x10000; 2750 2751 /* high surrogate = top 10 bits added to D800 */ 2752 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2753 2754 /* low surrogate = bottom 10 bits added to DC00 */ 2755 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2756#endif 2757 break; 2758 } 2759 s += n; 2760 
continue; 2761 2762 utf8Error: 2763 outpos = p-PyUnicode_AS_UNICODE(unicode); 2764 if (unicode_decode_call_errorhandler( 2765 errors, &errorHandler, 2766 "utf8", errmsg, 2767 &starts, &e, &startinpos, &endinpos, &exc, &s, 2768 &unicode, &outpos, &p)) 2769 goto onError; 2770 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2771 } 2772 if (consumed) 2773 *consumed = s-starts; 2774 2775 /* Adjust length */ 2776 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2777 goto onError; 2778 2779 Py_XDECREF(errorHandler); 2780 Py_XDECREF(exc); 2781 return (PyObject *)unicode; 2782 2783 onError: 2784 Py_XDECREF(errorHandler); 2785 Py_XDECREF(exc); 2786 Py_DECREF(unicode); 2787 return NULL; 2788} 2789 2790#undef ASCII_CHAR_MASK 2791 2792#ifdef __APPLE__ 2793 2794/* Simplified UTF-8 decoder using surrogateescape error handler, 2795 used to decode the command line arguments on Mac OS X. */ 2796 2797wchar_t* 2798_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 2799{ 2800 int n; 2801 const char *e; 2802 wchar_t *unicode, *p; 2803 2804 /* Note: size will always be longer than the resulting Unicode 2805 character count */ 2806 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 2807 PyErr_NoMemory(); 2808 return NULL; 2809 } 2810 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 2811 if (!unicode) 2812 return NULL; 2813 2814 /* Unpack UTF-8 encoded data */ 2815 p = unicode; 2816 e = s + size; 2817 while (s < e) { 2818 Py_UCS4 ch = (unsigned char)*s; 2819 2820 if (ch < 0x80) { 2821 *p++ = (wchar_t)ch; 2822 s++; 2823 continue; 2824 } 2825 2826 n = utf8_code_length[ch]; 2827 if (s + n > e) { 2828 goto surrogateescape; 2829 } 2830 2831 switch (n) { 2832 case 0: 2833 case 1: 2834 goto surrogateescape; 2835 2836 case 2: 2837 if ((s[1] & 0xc0) != 0x80) 2838 goto surrogateescape; 2839 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2840 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2841 *p++ = (wchar_t)ch; 2842 break; 2843 2844 case 3: 2845 /* Decoding UTF-8 
sequences in range \xed\xa0\x80-\xed\xbf\xbf 2846 will result in surrogates in range d800-dfff. Surrogates are 2847 not valid UTF-8 so they are rejected. 2848 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2849 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2850 if ((s[1] & 0xc0) != 0x80 || 2851 (s[2] & 0xc0) != 0x80 || 2852 ((unsigned char)s[0] == 0xE0 && 2853 (unsigned char)s[1] < 0xA0) || 2854 ((unsigned char)s[0] == 0xED && 2855 (unsigned char)s[1] > 0x9F)) { 2856 2857 goto surrogateescape; 2858 } 2859 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2860 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2861 *p++ = (Py_UNICODE)ch; 2862 break; 2863 2864 case 4: 2865 if ((s[1] & 0xc0) != 0x80 || 2866 (s[2] & 0xc0) != 0x80 || 2867 (s[3] & 0xc0) != 0x80 || 2868 ((unsigned char)s[0] == 0xF0 && 2869 (unsigned char)s[1] < 0x90) || 2870 ((unsigned char)s[0] == 0xF4 && 2871 (unsigned char)s[1] > 0x8F)) { 2872 goto surrogateescape; 2873 } 2874 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2875 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2876 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2877 2878#if SIZEOF_WCHAR_T == 4 2879 *p++ = (wchar_t)ch; 2880#else 2881 /* compute and append the two surrogates: */ 2882 2883 /* translate from 10000..10FFFF to 0..FFFF */ 2884 ch -= 0x10000; 2885 2886 /* high surrogate = top 10 bits added to D800 */ 2887 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 2888 2889 /* low surrogate = bottom 10 bits added to DC00 */ 2890 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 2891#endif 2892 break; 2893 } 2894 s += n; 2895 continue; 2896 2897 surrogateescape: 2898 *p++ = 0xDC00 + ch; 2899 s++; 2900 } 2901 *p = L'\0'; 2902 return unicode; 2903} 2904 2905#endif /* __APPLE__ */ 2906 2907/* Allocation strategy: if the string is short, convert into a stack buffer 2908 and allocate exactly as much space needed at the end. 
Else allocate the 2909 maximum possible needed (4 result bytes per Unicode character), and return 2910 the excess memory at the end. 2911*/ 2912PyObject * 2913PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2914 Py_ssize_t size, 2915 const char *errors) 2916{ 2917#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2918 2919 Py_ssize_t i; /* index into s of next input byte */ 2920 PyObject *result; /* result string object */ 2921 char *p; /* next free byte in output buffer */ 2922 Py_ssize_t nallocated; /* number of result bytes allocated */ 2923 Py_ssize_t nneeded; /* number of result bytes needed */ 2924 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2925 PyObject *errorHandler = NULL; 2926 PyObject *exc = NULL; 2927 2928 assert(s != NULL); 2929 assert(size >= 0); 2930 2931 if (size <= MAX_SHORT_UNICHARS) { 2932 /* Write into the stack buffer; nallocated can't overflow. 2933 * At the end, we'll allocate exactly as much heap space as it 2934 * turns out we need. 2935 */ 2936 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2937 result = NULL; /* will allocate after we're done */ 2938 p = stackbuf; 2939 } 2940 else { 2941 /* Overallocate on the heap, and give the excess back at the end. */ 2942 nallocated = size * 4; 2943 if (nallocated / 4 != size) /* overflow! 
*/ 2944 return PyErr_NoMemory(); 2945 result = PyBytes_FromStringAndSize(NULL, nallocated); 2946 if (result == NULL) 2947 return NULL; 2948 p = PyBytes_AS_STRING(result); 2949 } 2950 2951 for (i = 0; i < size;) { 2952 Py_UCS4 ch = s[i++]; 2953 2954 if (ch < 0x80) 2955 /* Encode ASCII */ 2956 *p++ = (char) ch; 2957 2958 else if (ch < 0x0800) { 2959 /* Encode Latin-1 */ 2960 *p++ = (char)(0xc0 | (ch >> 6)); 2961 *p++ = (char)(0x80 | (ch & 0x3f)); 2962 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2963#ifndef Py_UNICODE_WIDE 2964 /* Special case: check for high and low surrogate */ 2965 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2966 Py_UCS4 ch2 = s[i]; 2967 /* Combine the two surrogates to form a UCS4 value */ 2968 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2969 i++; 2970 2971 /* Encode UCS4 Unicode ordinals */ 2972 *p++ = (char)(0xf0 | (ch >> 18)); 2973 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2974 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2975 *p++ = (char)(0x80 | (ch & 0x3f)); 2976 } else { 2977#endif 2978 Py_ssize_t newpos; 2979 PyObject *rep; 2980 Py_ssize_t repsize, k; 2981 rep = unicode_encode_call_errorhandler 2982 (errors, &errorHandler, "utf-8", "surrogates not allowed", 2983 s, size, &exc, i-1, i, &newpos); 2984 if (!rep) 2985 goto error; 2986 2987 if (PyBytes_Check(rep)) 2988 repsize = PyBytes_GET_SIZE(rep); 2989 else 2990 repsize = PyUnicode_GET_SIZE(rep); 2991 2992 if (repsize > 4) { 2993 Py_ssize_t offset; 2994 2995 if (result == NULL) 2996 offset = p - stackbuf; 2997 else 2998 offset = p - PyBytes_AS_STRING(result); 2999 3000 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 3001 /* integer overflow */ 3002 PyErr_NoMemory(); 3003 goto error; 3004 } 3005 nallocated += repsize - 4; 3006 if (result != NULL) { 3007 if (_PyBytes_Resize(&result, nallocated) < 0) 3008 goto error; 3009 } else { 3010 result = PyBytes_FromStringAndSize(NULL, nallocated); 3011 if (result == NULL) 3012 goto error; 3013 
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 3014 } 3015 p = PyBytes_AS_STRING(result) + offset; 3016 } 3017 3018 if (PyBytes_Check(rep)) { 3019 char *prep = PyBytes_AS_STRING(rep); 3020 for(k = repsize; k > 0; k--) 3021 *p++ = *prep++; 3022 } else /* rep is unicode */ { 3023 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 3024 Py_UNICODE c; 3025 3026 for(k=0; k<repsize; k++) { 3027 c = prep[k]; 3028 if (0x80 <= c) { 3029 raise_encode_exception(&exc, "utf-8", s, size, 3030 i-1, i, "surrogates not allowed"); 3031 goto error; 3032 } 3033 *p++ = (char)prep[k]; 3034 } 3035 } 3036 Py_DECREF(rep); 3037#ifndef Py_UNICODE_WIDE 3038 } 3039#endif 3040 } else if (ch < 0x10000) { 3041 *p++ = (char)(0xe0 | (ch >> 12)); 3042 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3043 *p++ = (char)(0x80 | (ch & 0x3f)); 3044 } else /* ch >= 0x10000 */ { 3045 /* Encode UCS4 Unicode ordinals */ 3046 *p++ = (char)(0xf0 | (ch >> 18)); 3047 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 3048 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3049 *p++ = (char)(0x80 | (ch & 0x3f)); 3050 } 3051 } 3052 3053 if (result == NULL) { 3054 /* This was stack allocated. */ 3055 nneeded = p - stackbuf; 3056 assert(nneeded <= nallocated); 3057 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 3058 } 3059 else { 3060 /* Cut back to size actually needed. 
*/ 3061 nneeded = p - PyBytes_AS_STRING(result); 3062 assert(nneeded <= nallocated); 3063 _PyBytes_Resize(&result, nneeded); 3064 } 3065 Py_XDECREF(errorHandler); 3066 Py_XDECREF(exc); 3067 return result; 3068 error: 3069 Py_XDECREF(errorHandler); 3070 Py_XDECREF(exc); 3071 Py_XDECREF(result); 3072 return NULL; 3073 3074#undef MAX_SHORT_UNICHARS 3075} 3076 3077PyObject * 3078PyUnicode_AsUTF8String(PyObject *unicode) 3079{ 3080 if (!PyUnicode_Check(unicode)) { 3081 PyErr_BadArgument(); 3082 return NULL; 3083 } 3084 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 3085 PyUnicode_GET_SIZE(unicode), 3086 NULL); 3087} 3088 3089/* --- UTF-32 Codec ------------------------------------------------------- */ 3090 3091PyObject * 3092PyUnicode_DecodeUTF32(const char *s, 3093 Py_ssize_t size, 3094 const char *errors, 3095 int *byteorder) 3096{ 3097 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 3098} 3099 3100PyObject * 3101PyUnicode_DecodeUTF32Stateful(const char *s, 3102 Py_ssize_t size, 3103 const char *errors, 3104 int *byteorder, 3105 Py_ssize_t *consumed) 3106{ 3107 const char *starts = s; 3108 Py_ssize_t startinpos; 3109 Py_ssize_t endinpos; 3110 Py_ssize_t outpos; 3111 PyUnicodeObject *unicode; 3112 Py_UNICODE *p; 3113#ifndef Py_UNICODE_WIDE 3114 int pairs = 0; 3115 const unsigned char *qq; 3116#else 3117 const int pairs = 0; 3118#endif 3119 const unsigned char *q, *e; 3120 int bo = 0; /* assume native ordering by default */ 3121 const char *errmsg = ""; 3122 /* Offsets from q for retrieving bytes in the right order. */ 3123#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3124 int iorder[] = {0, 1, 2, 3}; 3125#else 3126 int iorder[] = {3, 2, 1, 0}; 3127#endif 3128 PyObject *errorHandler = NULL; 3129 PyObject *exc = NULL; 3130 3131 q = (unsigned char *)s; 3132 e = q + size; 3133 3134 if (byteorder) 3135 bo = *byteorder; 3136 3137 /* Check for BOM marks (U+FEFF) in the input and adjust current 3138 byte order setting accordingly. 
In native mode, the leading BOM 3139 mark is skipped, in all other modes, it is copied to the output 3140 stream as-is (giving a ZWNBSP character). */ 3141 if (bo == 0) { 3142 if (size >= 4) { 3143 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3144 (q[iorder[1]] << 8) | q[iorder[0]]; 3145#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3146 if (bom == 0x0000FEFF) { 3147 q += 4; 3148 bo = -1; 3149 } 3150 else if (bom == 0xFFFE0000) { 3151 q += 4; 3152 bo = 1; 3153 } 3154#else 3155 if (bom == 0x0000FEFF) { 3156 q += 4; 3157 bo = 1; 3158 } 3159 else if (bom == 0xFFFE0000) { 3160 q += 4; 3161 bo = -1; 3162 } 3163#endif 3164 } 3165 } 3166 3167 if (bo == -1) { 3168 /* force LE */ 3169 iorder[0] = 0; 3170 iorder[1] = 1; 3171 iorder[2] = 2; 3172 iorder[3] = 3; 3173 } 3174 else if (bo == 1) { 3175 /* force BE */ 3176 iorder[0] = 3; 3177 iorder[1] = 2; 3178 iorder[2] = 1; 3179 iorder[3] = 0; 3180 } 3181 3182 /* On narrow builds we split characters outside the BMP into two 3183 codepoints => count how much extra space we need. */ 3184#ifndef Py_UNICODE_WIDE 3185 for (qq = q; qq < e; qq += 4) 3186 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 3187 pairs++; 3188#endif 3189 3190 /* This might be one to much, because of a BOM */ 3191 unicode = _PyUnicode_New((size+3)/4+pairs); 3192 if (!unicode) 3193 return NULL; 3194 if (size == 0) 3195 return (PyObject *)unicode; 3196 3197 /* Unpack UTF-32 encoded data */ 3198 p = unicode->str; 3199 3200 while (q < e) { 3201 Py_UCS4 ch; 3202 /* remaining bytes at the end? 
(size should be divisible by 4) */ 3203 if (e-q<4) { 3204 if (consumed) 3205 break; 3206 errmsg = "truncated data"; 3207 startinpos = ((const char *)q)-starts; 3208 endinpos = ((const char *)e)-starts; 3209 goto utf32Error; 3210 /* The remaining input chars are ignored if the callback 3211 chooses to skip the input */ 3212 } 3213 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3214 (q[iorder[1]] << 8) | q[iorder[0]]; 3215 3216 if (ch >= 0x110000) 3217 { 3218 errmsg = "codepoint not in range(0x110000)"; 3219 startinpos = ((const char *)q)-starts; 3220 endinpos = startinpos+4; 3221 goto utf32Error; 3222 } 3223#ifndef Py_UNICODE_WIDE 3224 if (ch >= 0x10000) 3225 { 3226 *p++ = 0xD800 | ((ch-0x10000) >> 10); 3227 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 3228 } 3229 else 3230#endif 3231 *p++ = ch; 3232 q += 4; 3233 continue; 3234 utf32Error: 3235 outpos = p-PyUnicode_AS_UNICODE(unicode); 3236 if (unicode_decode_call_errorhandler( 3237 errors, &errorHandler, 3238 "utf32", errmsg, 3239 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 3240 &unicode, &outpos, &p)) 3241 goto onError; 3242 } 3243 3244 if (byteorder) 3245 *byteorder = bo; 3246 3247 if (consumed) 3248 *consumed = (const char *)q-starts; 3249 3250 /* Adjust length */ 3251 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3252 goto onError; 3253 3254 Py_XDECREF(errorHandler); 3255 Py_XDECREF(exc); 3256 return (PyObject *)unicode; 3257 3258 onError: 3259 Py_DECREF(unicode); 3260 Py_XDECREF(errorHandler); 3261 Py_XDECREF(exc); 3262 return NULL; 3263} 3264 3265PyObject * 3266PyUnicode_EncodeUTF32(const Py_UNICODE *s, 3267 Py_ssize_t size, 3268 const char *errors, 3269 int byteorder) 3270{ 3271 PyObject *v; 3272 unsigned char *p; 3273 Py_ssize_t nsize, bytesize; 3274#ifndef Py_UNICODE_WIDE 3275 Py_ssize_t i, pairs; 3276#else 3277 const int pairs = 0; 3278#endif 3279 /* Offsets from p for storing byte pairs in the right order. 
*/ 3280#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3281 int iorder[] = {0, 1, 2, 3}; 3282#else 3283 int iorder[] = {3, 2, 1, 0}; 3284#endif 3285 3286#define STORECHAR(CH) \ 3287 do { \ 3288 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 3289 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 3290 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 3291 p[iorder[0]] = (CH) & 0xff; \ 3292 p += 4; \ 3293 } while(0) 3294 3295 /* In narrow builds we can output surrogate pairs as one codepoint, 3296 so we need less space. */ 3297#ifndef Py_UNICODE_WIDE 3298 for (i = pairs = 0; i < size-1; i++) 3299 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 3300 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 3301 pairs++; 3302#endif 3303 nsize = (size - pairs + (byteorder == 0)); 3304 bytesize = nsize * 4; 3305 if (bytesize / 4 != nsize) 3306 return PyErr_NoMemory(); 3307 v = PyBytes_FromStringAndSize(NULL, bytesize); 3308 if (v == NULL) 3309 return NULL; 3310 3311 p = (unsigned char *)PyBytes_AS_STRING(v); 3312 if (byteorder == 0) 3313 STORECHAR(0xFEFF); 3314 if (size == 0) 3315 goto done; 3316 3317 if (byteorder == -1) { 3318 /* force LE */ 3319 iorder[0] = 0; 3320 iorder[1] = 1; 3321 iorder[2] = 2; 3322 iorder[3] = 3; 3323 } 3324 else if (byteorder == 1) { 3325 /* force BE */ 3326 iorder[0] = 3; 3327 iorder[1] = 2; 3328 iorder[2] = 1; 3329 iorder[3] = 0; 3330 } 3331 3332 while (size-- > 0) { 3333 Py_UCS4 ch = *s++; 3334#ifndef Py_UNICODE_WIDE 3335 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 3336 Py_UCS4 ch2 = *s; 3337 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3338 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3339 s++; 3340 size--; 3341 } 3342 } 3343#endif 3344 STORECHAR(ch); 3345 } 3346 3347 done: 3348 return v; 3349#undef STORECHAR 3350} 3351 3352PyObject * 3353PyUnicode_AsUTF32String(PyObject *unicode) 3354{ 3355 if (!PyUnicode_Check(unicode)) { 3356 PyErr_BadArgument(); 3357 return NULL; 3358 } 3359 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3360 PyUnicode_GET_SIZE(unicode), 3361 NULL, 3362 0); 3363} 3364 3365/* 
--- UTF-16 Codec ------------------------------------------------------- */ 3366 3367PyObject * 3368PyUnicode_DecodeUTF16(const char *s, 3369 Py_ssize_t size, 3370 const char *errors, 3371 int *byteorder) 3372{ 3373 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3374} 3375 3376/* Two masks for fast checking of whether a C 'long' may contain 3377 UTF16-encoded surrogate characters. This is an efficient heuristic, 3378 assuming that non-surrogate characters with a code point >= 0x8000 are 3379 rare in most input. 3380 FAST_CHAR_MASK is used when the input is in native byte ordering, 3381 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3382*/ 3383#if (SIZEOF_LONG == 8) 3384# define FAST_CHAR_MASK 0x8000800080008000L 3385# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3386#elif (SIZEOF_LONG == 4) 3387# define FAST_CHAR_MASK 0x80008000L 3388# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3389#else 3390# error C 'long' size should be either 4 or 8! 3391#endif 3392 3393PyObject * 3394PyUnicode_DecodeUTF16Stateful(const char *s, 3395 Py_ssize_t size, 3396 const char *errors, 3397 int *byteorder, 3398 Py_ssize_t *consumed) 3399{ 3400 const char *starts = s; 3401 Py_ssize_t startinpos; 3402 Py_ssize_t endinpos; 3403 Py_ssize_t outpos; 3404 PyUnicodeObject *unicode; 3405 Py_UNICODE *p; 3406 const unsigned char *q, *e, *aligned_end; 3407 int bo = 0; /* assume native ordering by default */ 3408 int native_ordering = 0; 3409 const char *errmsg = ""; 3410 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 3411#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3412 int ihi = 1, ilo = 0; 3413#else 3414 int ihi = 0, ilo = 1; 3415#endif 3416 PyObject *errorHandler = NULL; 3417 PyObject *exc = NULL; 3418 3419 /* Note: size will always be longer than the resulting Unicode 3420 character count */ 3421 unicode = _PyUnicode_New(size); 3422 if (!unicode) 3423 return NULL; 3424 if (size == 0) 3425 return (PyObject *)unicode; 3426 3427 /* Unpack UTF-16 encoded data */ 3428 p = unicode->str; 3429 q = (unsigned char *)s; 3430 e = q + size - 1; 3431 3432 if (byteorder) 3433 bo = *byteorder; 3434 3435 /* Check for BOM marks (U+FEFF) in the input and adjust current 3436 byte order setting accordingly. In native mode, the leading BOM 3437 mark is skipped, in all other modes, it is copied to the output 3438 stream as-is (giving a ZWNBSP character). */ 3439 if (bo == 0) { 3440 if (size >= 2) { 3441 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3442#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3443 if (bom == 0xFEFF) { 3444 q += 2; 3445 bo = -1; 3446 } 3447 else if (bom == 0xFFFE) { 3448 q += 2; 3449 bo = 1; 3450 } 3451#else 3452 if (bom == 0xFEFF) { 3453 q += 2; 3454 bo = 1; 3455 } 3456 else if (bom == 0xFFFE) { 3457 q += 2; 3458 bo = -1; 3459 } 3460#endif 3461 } 3462 } 3463 3464 if (bo == -1) { 3465 /* force LE */ 3466 ihi = 1; 3467 ilo = 0; 3468 } 3469 else if (bo == 1) { 3470 /* force BE */ 3471 ihi = 0; 3472 ilo = 1; 3473 } 3474#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3475 native_ordering = ilo < ihi; 3476#else 3477 native_ordering = ilo > ihi; 3478#endif 3479 3480 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3481 while (q < e) { 3482 Py_UNICODE ch; 3483 /* First check for possible aligned read of a C 'long'. Unaligned 3484 reads are more expensive, better to defer to another iteration. */ 3485 if (!((size_t) q & LONG_PTR_MASK)) { 3486 /* Fast path for runs of non-surrogate chars. 
*/ 3487 register const unsigned char *_q = q; 3488 Py_UNICODE *_p = p; 3489 if (native_ordering) { 3490 /* Native ordering is simple: as long as the input cannot 3491 possibly contain a surrogate char, do an unrolled copy 3492 of several 16-bit code points to the target object. 3493 The non-surrogate check is done on several input bytes 3494 at a time (as many as a C 'long' can contain). */ 3495 while (_q < aligned_end) { 3496 unsigned long data = * (unsigned long *) _q; 3497 if (data & FAST_CHAR_MASK) 3498 break; 3499 _p[0] = ((unsigned short *) _q)[0]; 3500 _p[1] = ((unsigned short *) _q)[1]; 3501#if (SIZEOF_LONG == 8) 3502 _p[2] = ((unsigned short *) _q)[2]; 3503 _p[3] = ((unsigned short *) _q)[3]; 3504#endif 3505 _q += SIZEOF_LONG; 3506 _p += SIZEOF_LONG / 2; 3507 } 3508 } 3509 else { 3510 /* Byteswapped ordering is similar, but we must decompose 3511 the copy bytewise, and take care of zero'ing out the 3512 upper bytes if the target object is in 32-bit units 3513 (that is, in UCS-4 builds). */ 3514 while (_q < aligned_end) { 3515 unsigned long data = * (unsigned long *) _q; 3516 if (data & SWAPPED_FAST_CHAR_MASK) 3517 break; 3518 /* Zero upper bytes in UCS-4 builds */ 3519#if (Py_UNICODE_SIZE > 2) 3520 _p[0] = 0; 3521 _p[1] = 0; 3522#if (SIZEOF_LONG == 8) 3523 _p[2] = 0; 3524 _p[3] = 0; 3525#endif 3526#endif 3527 /* Issue #4916; UCS-4 builds on big endian machines must 3528 fill the two last bytes of each 4-byte unit. 
*/ 3529#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3530# define OFF 2 3531#else 3532# define OFF 0 3533#endif 3534 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3535 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3536 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3537 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3538#if (SIZEOF_LONG == 8) 3539 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3540 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3541 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3542 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3543#endif 3544#undef OFF 3545 _q += SIZEOF_LONG; 3546 _p += SIZEOF_LONG / 2; 3547 } 3548 } 3549 p = _p; 3550 q = _q; 3551 if (q >= e) 3552 break; 3553 } 3554 ch = (q[ihi] << 8) | q[ilo]; 3555 3556 q += 2; 3557 3558 if (ch < 0xD800 || ch > 0xDFFF) { 3559 *p++ = ch; 3560 continue; 3561 } 3562 3563 /* UTF-16 code pair: */ 3564 if (q > e) { 3565 errmsg = "unexpected end of data"; 3566 startinpos = (((const char *)q) - 2) - starts; 3567 endinpos = ((const char *)e) + 1 - starts; 3568 goto utf16Error; 3569 } 3570 if (0xD800 <= ch && ch <= 0xDBFF) { 3571 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3572 q += 2; 3573 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3574#ifndef Py_UNICODE_WIDE 3575 *p++ = ch; 3576 *p++ = ch2; 3577#else 3578 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3579#endif 3580 continue; 3581 } 3582 else { 3583 errmsg = "illegal UTF-16 surrogate"; 3584 startinpos = (((const char *)q)-4)-starts; 3585 endinpos = startinpos+2; 3586 goto utf16Error; 3587 } 3588 3589 } 3590 errmsg = "illegal encoding"; 3591 startinpos = (((const char *)q)-2)-starts; 3592 endinpos = startinpos+2; 3593 /* Fall through to report the error */ 3594 3595 utf16Error: 3596 outpos = p - PyUnicode_AS_UNICODE(unicode); 3597 if (unicode_decode_call_errorhandler( 3598 errors, 3599 &errorHandler, 3600 "utf16", errmsg, 3601 &starts, 3602 (const char 
**)&e, 3603 &startinpos, 3604 &endinpos, 3605 &exc, 3606 (const char **)&q, 3607 &unicode, 3608 &outpos, 3609 &p)) 3610 goto onError; 3611 } 3612 /* remaining byte at the end? (size should be even) */ 3613 if (e == q) { 3614 if (!consumed) { 3615 errmsg = "truncated data"; 3616 startinpos = ((const char *)q) - starts; 3617 endinpos = ((const char *)e) + 1 - starts; 3618 outpos = p - PyUnicode_AS_UNICODE(unicode); 3619 if (unicode_decode_call_errorhandler( 3620 errors, 3621 &errorHandler, 3622 "utf16", errmsg, 3623 &starts, 3624 (const char **)&e, 3625 &startinpos, 3626 &endinpos, 3627 &exc, 3628 (const char **)&q, 3629 &unicode, 3630 &outpos, 3631 &p)) 3632 goto onError; 3633 /* The remaining input chars are ignored if the callback 3634 chooses to skip the input */ 3635 } 3636 } 3637 3638 if (byteorder) 3639 *byteorder = bo; 3640 3641 if (consumed) 3642 *consumed = (const char *)q-starts; 3643 3644 /* Adjust length */ 3645 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3646 goto onError; 3647 3648 Py_XDECREF(errorHandler); 3649 Py_XDECREF(exc); 3650 return (PyObject *)unicode; 3651 3652 onError: 3653 Py_DECREF(unicode); 3654 Py_XDECREF(errorHandler); 3655 Py_XDECREF(exc); 3656 return NULL; 3657} 3658 3659#undef FAST_CHAR_MASK 3660#undef SWAPPED_FAST_CHAR_MASK 3661 3662PyObject * 3663PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3664 Py_ssize_t size, 3665 const char *errors, 3666 int byteorder) 3667{ 3668 PyObject *v; 3669 unsigned char *p; 3670 Py_ssize_t nsize, bytesize; 3671#ifdef Py_UNICODE_WIDE 3672 Py_ssize_t i, pairs; 3673#else 3674 const int pairs = 0; 3675#endif 3676 /* Offsets from p for storing byte pairs in the right order. 
*/ 3677#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3678 int ihi = 1, ilo = 0; 3679#else 3680 int ihi = 0, ilo = 1; 3681#endif 3682 3683#define STORECHAR(CH) \ 3684 do { \ 3685 p[ihi] = ((CH) >> 8) & 0xff; \ 3686 p[ilo] = (CH) & 0xff; \ 3687 p += 2; \ 3688 } while(0) 3689 3690#ifdef Py_UNICODE_WIDE 3691 for (i = pairs = 0; i < size; i++) 3692 if (s[i] >= 0x10000) 3693 pairs++; 3694#endif 3695 /* 2 * (size + pairs + (byteorder == 0)) */ 3696 if (size > PY_SSIZE_T_MAX || 3697 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3698 return PyErr_NoMemory(); 3699 nsize = size + pairs + (byteorder == 0); 3700 bytesize = nsize * 2; 3701 if (bytesize / 2 != nsize) 3702 return PyErr_NoMemory(); 3703 v = PyBytes_FromStringAndSize(NULL, bytesize); 3704 if (v == NULL) 3705 return NULL; 3706 3707 p = (unsigned char *)PyBytes_AS_STRING(v); 3708 if (byteorder == 0) 3709 STORECHAR(0xFEFF); 3710 if (size == 0) 3711 goto done; 3712 3713 if (byteorder == -1) { 3714 /* force LE */ 3715 ihi = 1; 3716 ilo = 0; 3717 } 3718 else if (byteorder == 1) { 3719 /* force BE */ 3720 ihi = 0; 3721 ilo = 1; 3722 } 3723 3724 while (size-- > 0) { 3725 Py_UNICODE ch = *s++; 3726 Py_UNICODE ch2 = 0; 3727#ifdef Py_UNICODE_WIDE 3728 if (ch >= 0x10000) { 3729 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3730 ch = 0xD800 | ((ch-0x10000) >> 10); 3731 } 3732#endif 3733 STORECHAR(ch); 3734 if (ch2) 3735 STORECHAR(ch2); 3736 } 3737 3738 done: 3739 return v; 3740#undef STORECHAR 3741} 3742 3743PyObject * 3744PyUnicode_AsUTF16String(PyObject *unicode) 3745{ 3746 if (!PyUnicode_Check(unicode)) { 3747 PyErr_BadArgument(); 3748 return NULL; 3749 } 3750 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3751 PyUnicode_GET_SIZE(unicode), 3752 NULL, 3753 0); 3754} 3755 3756/* --- Unicode Escape Codec ----------------------------------------------- */ 3757 3758static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3759 3760PyObject * 3761PyUnicode_DecodeUnicodeEscape(const char *s, 3762 Py_ssize_t size, 3763 const char *errors) 
3764{ 3765 const char *starts = s; 3766 Py_ssize_t startinpos; 3767 Py_ssize_t endinpos; 3768 Py_ssize_t outpos; 3769 int i; 3770 PyUnicodeObject *v; 3771 Py_UNICODE *p; 3772 const char *end; 3773 char* message; 3774 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3775 PyObject *errorHandler = NULL; 3776 PyObject *exc = NULL; 3777 3778 /* Escaped strings will always be longer than the resulting 3779 Unicode string, so we start with size here and then reduce the 3780 length after conversion to the true value. 3781 (but if the error callback returns a long replacement string 3782 we'll have to allocate more space) */ 3783 v = _PyUnicode_New(size); 3784 if (v == NULL) 3785 goto onError; 3786 if (size == 0) 3787 return (PyObject *)v; 3788 3789 p = PyUnicode_AS_UNICODE(v); 3790 end = s + size; 3791 3792 while (s < end) { 3793 unsigned char c; 3794 Py_UNICODE x; 3795 int digits; 3796 3797 /* Non-escape characters are interpreted as Unicode ordinals */ 3798 if (*s != '\\') { 3799 *p++ = (unsigned char) *s++; 3800 continue; 3801 } 3802 3803 startinpos = s-starts; 3804 /* \ - Escapes */ 3805 s++; 3806 c = *s++; 3807 if (s > end) 3808 c = '\0'; /* Invalid after \ */ 3809 switch (c) { 3810 3811 /* \x escapes */ 3812 case '\n': break; 3813 case '\\': *p++ = '\\'; break; 3814 case '\'': *p++ = '\''; break; 3815 case '\"': *p++ = '\"'; break; 3816 case 'b': *p++ = '\b'; break; 3817 case 'f': *p++ = '\014'; break; /* FF */ 3818 case 't': *p++ = '\t'; break; 3819 case 'n': *p++ = '\n'; break; 3820 case 'r': *p++ = '\r'; break; 3821 case 'v': *p++ = '\013'; break; /* VT */ 3822 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3823 3824 /* \OOO (octal) escapes */ 3825 case '0': case '1': case '2': case '3': 3826 case '4': case '5': case '6': case '7': 3827 x = s[-1] - '0'; 3828 if (s < end && '0' <= *s && *s <= '7') { 3829 x = (x<<3) + *s++ - '0'; 3830 if (s < end && '0' <= *s && *s <= '7') 3831 x = (x<<3) + *s++ - '0'; 3832 } 3833 *p++ = x; 3834 break; 3835 3836 
/* hex escapes */ 3837 /* \xXX */ 3838 case 'x': 3839 digits = 2; 3840 message = "truncated \\xXX escape"; 3841 goto hexescape; 3842 3843 /* \uXXXX */ 3844 case 'u': 3845 digits = 4; 3846 message = "truncated \\uXXXX escape"; 3847 goto hexescape; 3848 3849 /* \UXXXXXXXX */ 3850 case 'U': 3851 digits = 8; 3852 message = "truncated \\UXXXXXXXX escape"; 3853 hexescape: 3854 chr = 0; 3855 outpos = p-PyUnicode_AS_UNICODE(v); 3856 if (s+digits>end) { 3857 endinpos = size; 3858 if (unicode_decode_call_errorhandler( 3859 errors, &errorHandler, 3860 "unicodeescape", "end of string in escape sequence", 3861 &starts, &end, &startinpos, &endinpos, &exc, &s, 3862 &v, &outpos, &p)) 3863 goto onError; 3864 goto nextByte; 3865 } 3866 for (i = 0; i < digits; ++i) { 3867 c = (unsigned char) s[i]; 3868 if (!Py_ISXDIGIT(c)) { 3869 endinpos = (s+i+1)-starts; 3870 if (unicode_decode_call_errorhandler( 3871 errors, &errorHandler, 3872 "unicodeescape", message, 3873 &starts, &end, &startinpos, &endinpos, &exc, &s, 3874 &v, &outpos, &p)) 3875 goto onError; 3876 goto nextByte; 3877 } 3878 chr = (chr<<4) & ~0xF; 3879 if (c >= '0' && c <= '9') 3880 chr += c - '0'; 3881 else if (c >= 'a' && c <= 'f') 3882 chr += 10 + c - 'a'; 3883 else 3884 chr += 10 + c - 'A'; 3885 } 3886 s += i; 3887 if (chr == 0xffffffff && PyErr_Occurred()) 3888 /* _decoding_error will have already written into the 3889 target buffer. */ 3890 break; 3891 store: 3892 /* when we get here, chr is a 32-bit unicode character */ 3893 if (chr <= 0xffff) 3894 /* UCS-2 character */ 3895 *p++ = (Py_UNICODE) chr; 3896 else if (chr <= 0x10ffff) { 3897 /* UCS-4 character. Either store directly, or as 3898 surrogate pair. 
*/ 3899#ifdef Py_UNICODE_WIDE 3900 *p++ = chr; 3901#else 3902 chr -= 0x10000L; 3903 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3904 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3905#endif 3906 } else { 3907 endinpos = s-starts; 3908 outpos = p-PyUnicode_AS_UNICODE(v); 3909 if (unicode_decode_call_errorhandler( 3910 errors, &errorHandler, 3911 "unicodeescape", "illegal Unicode character", 3912 &starts, &end, &startinpos, &endinpos, &exc, &s, 3913 &v, &outpos, &p)) 3914 goto onError; 3915 } 3916 break; 3917 3918 /* \N{name} */ 3919 case 'N': 3920 message = "malformed \\N character escape"; 3921 if (ucnhash_CAPI == NULL) { 3922 /* load the unicode data module */ 3923 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3924 if (ucnhash_CAPI == NULL) 3925 goto ucnhashError; 3926 } 3927 if (*s == '{') { 3928 const char *start = s+1; 3929 /* look for the closing brace */ 3930 while (*s != '}' && s < end) 3931 s++; 3932 if (s > start && s < end && *s == '}') { 3933 /* found a name. 
look it up in the unicode database */ 3934 message = "unknown Unicode character name"; 3935 s++; 3936 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3937 goto store; 3938 } 3939 } 3940 endinpos = s-starts; 3941 outpos = p-PyUnicode_AS_UNICODE(v); 3942 if (unicode_decode_call_errorhandler( 3943 errors, &errorHandler, 3944 "unicodeescape", message, 3945 &starts, &end, &startinpos, &endinpos, &exc, &s, 3946 &v, &outpos, &p)) 3947 goto onError; 3948 break; 3949 3950 default: 3951 if (s > end) { 3952 message = "\\ at end of string"; 3953 s--; 3954 endinpos = s-starts; 3955 outpos = p-PyUnicode_AS_UNICODE(v); 3956 if (unicode_decode_call_errorhandler( 3957 errors, &errorHandler, 3958 "unicodeescape", message, 3959 &starts, &end, &startinpos, &endinpos, &exc, &s, 3960 &v, &outpos, &p)) 3961 goto onError; 3962 } 3963 else { 3964 *p++ = '\\'; 3965 *p++ = (unsigned char)s[-1]; 3966 } 3967 break; 3968 } 3969 nextByte: 3970 ; 3971 } 3972 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3973 goto onError; 3974 Py_XDECREF(errorHandler); 3975 Py_XDECREF(exc); 3976 return (PyObject *)v; 3977 3978 ucnhashError: 3979 PyErr_SetString( 3980 PyExc_UnicodeError, 3981 "\\N escapes not supported (can't load unicodedata module)" 3982 ); 3983 Py_XDECREF(v); 3984 Py_XDECREF(errorHandler); 3985 Py_XDECREF(exc); 3986 return NULL; 3987 3988 onError: 3989 Py_XDECREF(v); 3990 Py_XDECREF(errorHandler); 3991 Py_XDECREF(exc); 3992 return NULL; 3993} 3994 3995/* Return a Unicode-Escape string version of the Unicode object. 3996 3997 If quotes is true, the string is enclosed in u"" or u'' quotes as 3998 appropriate. 
3999 4000*/ 4001 4002Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 4003 Py_ssize_t size, 4004 Py_UNICODE ch) 4005{ 4006 /* like wcschr, but doesn't stop at NULL characters */ 4007 4008 while (size-- > 0) { 4009 if (*s == ch) 4010 return s; 4011 s++; 4012 } 4013 4014 return NULL; 4015} 4016 4017static const char *hexdigits = "0123456789abcdef"; 4018 4019PyObject * 4020PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 4021 Py_ssize_t size) 4022{ 4023 PyObject *repr; 4024 char *p; 4025 4026#ifdef Py_UNICODE_WIDE 4027 const Py_ssize_t expandsize = 10; 4028#else 4029 const Py_ssize_t expandsize = 6; 4030#endif 4031 4032 /* XXX(nnorwitz): rather than over-allocating, it would be 4033 better to choose a different scheme. Perhaps scan the 4034 first N-chars of the string and allocate based on that size. 4035 */ 4036 /* Initial allocation is based on the longest-possible unichr 4037 escape. 4038 4039 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 4040 unichr, so in this case it's the longest unichr escape. In 4041 narrow (UTF-16) builds this is five chars per source unichr 4042 since there are two unichrs in the surrogate pair, so in narrow 4043 (UTF-16) builds it's not the longest unichr escape. 4044 4045 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 4046 so in the narrow (UTF-16) build case it's the longest unichr 4047 escape. 
4048 */ 4049 4050 if (size == 0) 4051 return PyBytes_FromStringAndSize(NULL, 0); 4052 4053 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 4054 return PyErr_NoMemory(); 4055 4056 repr = PyBytes_FromStringAndSize(NULL, 4057 2 4058 + expandsize*size 4059 + 1); 4060 if (repr == NULL) 4061 return NULL; 4062 4063 p = PyBytes_AS_STRING(repr); 4064 4065 while (size-- > 0) { 4066 Py_UNICODE ch = *s++; 4067 4068 /* Escape backslashes */ 4069 if (ch == '\\') { 4070 *p++ = '\\'; 4071 *p++ = (char) ch; 4072 continue; 4073 } 4074 4075#ifdef Py_UNICODE_WIDE 4076 /* Map 21-bit characters to '\U00xxxxxx' */ 4077 else if (ch >= 0x10000) { 4078 *p++ = '\\'; 4079 *p++ = 'U'; 4080 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 4081 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 4082 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 4083 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 4084 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 4085 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 4086 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 4087 *p++ = hexdigits[ch & 0x0000000F]; 4088 continue; 4089 } 4090#else 4091 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4092 else if (ch >= 0xD800 && ch < 0xDC00) { 4093 Py_UNICODE ch2; 4094 Py_UCS4 ucs; 4095 4096 ch2 = *s++; 4097 size--; 4098 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4099 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4100 *p++ = '\\'; 4101 *p++ = 'U'; 4102 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 4103 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 4104 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 4105 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 4106 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 4107 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 4108 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 4109 *p++ = hexdigits[ucs & 0x0000000F]; 4110 continue; 4111 } 4112 /* Fall through: isolated surrogates are copied as-is */ 4113 s--; 4114 size++; 4115 } 4116#endif 4117 4118 /* Map 16-bit characters to '\uxxxx' */ 4119 if (ch >= 256) { 4120 *p++ = '\\'; 4121 *p++ = 
'u'; 4122 *p++ = hexdigits[(ch >> 12) & 0x000F]; 4123 *p++ = hexdigits[(ch >> 8) & 0x000F]; 4124 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4125 *p++ = hexdigits[ch & 0x000F]; 4126 } 4127 4128 /* Map special whitespace to '\t', \n', '\r' */ 4129 else if (ch == '\t') { 4130 *p++ = '\\'; 4131 *p++ = 't'; 4132 } 4133 else if (ch == '\n') { 4134 *p++ = '\\'; 4135 *p++ = 'n'; 4136 } 4137 else if (ch == '\r') { 4138 *p++ = '\\'; 4139 *p++ = 'r'; 4140 } 4141 4142 /* Map non-printable US ASCII to '\xhh' */ 4143 else if (ch < ' ' || ch >= 0x7F) { 4144 *p++ = '\\'; 4145 *p++ = 'x'; 4146 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4147 *p++ = hexdigits[ch & 0x000F]; 4148 } 4149 4150 /* Copy everything else as-is */ 4151 else 4152 *p++ = (char) ch; 4153 } 4154 4155 assert(p - PyBytes_AS_STRING(repr) > 0); 4156 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 4157 return NULL; 4158 return repr; 4159} 4160 4161PyObject * 4162PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 4163{ 4164 PyObject *s; 4165 if (!PyUnicode_Check(unicode)) { 4166 PyErr_BadArgument(); 4167 return NULL; 4168 } 4169 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4170 PyUnicode_GET_SIZE(unicode)); 4171 return s; 4172} 4173 4174/* --- Raw Unicode Escape Codec ------------------------------------------- */ 4175 4176PyObject * 4177PyUnicode_DecodeRawUnicodeEscape(const char *s, 4178 Py_ssize_t size, 4179 const char *errors) 4180{ 4181 const char *starts = s; 4182 Py_ssize_t startinpos; 4183 Py_ssize_t endinpos; 4184 Py_ssize_t outpos; 4185 PyUnicodeObject *v; 4186 Py_UNICODE *p; 4187 const char *end; 4188 const char *bs; 4189 PyObject *errorHandler = NULL; 4190 PyObject *exc = NULL; 4191 4192 /* Escaped strings will always be longer than the resulting 4193 Unicode string, so we start with size here and then reduce the 4194 length after conversion to the true value. 
(But decoding error 4195 handler might have to resize the string) */ 4196 v = _PyUnicode_New(size); 4197 if (v == NULL) 4198 goto onError; 4199 if (size == 0) 4200 return (PyObject *)v; 4201 p = PyUnicode_AS_UNICODE(v); 4202 end = s + size; 4203 while (s < end) { 4204 unsigned char c; 4205 Py_UCS4 x; 4206 int i; 4207 int count; 4208 4209 /* Non-escape characters are interpreted as Unicode ordinals */ 4210 if (*s != '\\') { 4211 *p++ = (unsigned char)*s++; 4212 continue; 4213 } 4214 startinpos = s-starts; 4215 4216 /* \u-escapes are only interpreted iff the number of leading 4217 backslashes if odd */ 4218 bs = s; 4219 for (;s < end;) { 4220 if (*s != '\\') 4221 break; 4222 *p++ = (unsigned char)*s++; 4223 } 4224 if (((s - bs) & 1) == 0 || 4225 s >= end || 4226 (*s != 'u' && *s != 'U')) { 4227 continue; 4228 } 4229 p--; 4230 count = *s=='u' ? 4 : 8; 4231 s++; 4232 4233 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 4234 outpos = p-PyUnicode_AS_UNICODE(v); 4235 for (x = 0, i = 0; i < count; ++i, ++s) { 4236 c = (unsigned char)*s; 4237 if (!Py_ISXDIGIT(c)) { 4238 endinpos = s-starts; 4239 if (unicode_decode_call_errorhandler( 4240 errors, &errorHandler, 4241 "rawunicodeescape", "truncated \\uXXXX", 4242 &starts, &end, &startinpos, &endinpos, &exc, &s, 4243 &v, &outpos, &p)) 4244 goto onError; 4245 goto nextByte; 4246 } 4247 x = (x<<4) & ~0xF; 4248 if (c >= '0' && c <= '9') 4249 x += c - '0'; 4250 else if (c >= 'a' && c <= 'f') 4251 x += 10 + c - 'a'; 4252 else 4253 x += 10 + c - 'A'; 4254 } 4255 if (x <= 0xffff) 4256 /* UCS-2 character */ 4257 *p++ = (Py_UNICODE) x; 4258 else if (x <= 0x10ffff) { 4259 /* UCS-4 character. Either store directly, or as 4260 surrogate pair. 
*/ 4261#ifdef Py_UNICODE_WIDE 4262 *p++ = (Py_UNICODE) x; 4263#else 4264 x -= 0x10000L; 4265 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 4266 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 4267#endif 4268 } else { 4269 endinpos = s-starts; 4270 outpos = p-PyUnicode_AS_UNICODE(v); 4271 if (unicode_decode_call_errorhandler( 4272 errors, &errorHandler, 4273 "rawunicodeescape", "\\Uxxxxxxxx out of range", 4274 &starts, &end, &startinpos, &endinpos, &exc, &s, 4275 &v, &outpos, &p)) 4276 goto onError; 4277 } 4278 nextByte: 4279 ; 4280 } 4281 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4282 goto onError; 4283 Py_XDECREF(errorHandler); 4284 Py_XDECREF(exc); 4285 return (PyObject *)v; 4286 4287 onError: 4288 Py_XDECREF(v); 4289 Py_XDECREF(errorHandler); 4290 Py_XDECREF(exc); 4291 return NULL; 4292} 4293 4294PyObject * 4295PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 4296 Py_ssize_t size) 4297{ 4298 PyObject *repr; 4299 char *p; 4300 char *q; 4301 4302#ifdef Py_UNICODE_WIDE 4303 const Py_ssize_t expandsize = 10; 4304#else 4305 const Py_ssize_t expandsize = 6; 4306#endif 4307 4308 if (size > PY_SSIZE_T_MAX / expandsize) 4309 return PyErr_NoMemory(); 4310 4311 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 4312 if (repr == NULL) 4313 return NULL; 4314 if (size == 0) 4315 return repr; 4316 4317 p = q = PyBytes_AS_STRING(repr); 4318 while (size-- > 0) { 4319 Py_UNICODE ch = *s++; 4320#ifdef Py_UNICODE_WIDE 4321 /* Map 32-bit characters to '\Uxxxxxxxx' */ 4322 if (ch >= 0x10000) { 4323 *p++ = '\\'; 4324 *p++ = 'U'; 4325 *p++ = hexdigits[(ch >> 28) & 0xf]; 4326 *p++ = hexdigits[(ch >> 24) & 0xf]; 4327 *p++ = hexdigits[(ch >> 20) & 0xf]; 4328 *p++ = hexdigits[(ch >> 16) & 0xf]; 4329 *p++ = hexdigits[(ch >> 12) & 0xf]; 4330 *p++ = hexdigits[(ch >> 8) & 0xf]; 4331 *p++ = hexdigits[(ch >> 4) & 0xf]; 4332 *p++ = hexdigits[ch & 15]; 4333 } 4334 else 4335#else 4336 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4337 if (ch >= 0xD800 && ch < 0xDC00) { 4338 
Py_UNICODE ch2; 4339 Py_UCS4 ucs; 4340 4341 ch2 = *s++; 4342 size--; 4343 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4344 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4345 *p++ = '\\'; 4346 *p++ = 'U'; 4347 *p++ = hexdigits[(ucs >> 28) & 0xf]; 4348 *p++ = hexdigits[(ucs >> 24) & 0xf]; 4349 *p++ = hexdigits[(ucs >> 20) & 0xf]; 4350 *p++ = hexdigits[(ucs >> 16) & 0xf]; 4351 *p++ = hexdigits[(ucs >> 12) & 0xf]; 4352 *p++ = hexdigits[(ucs >> 8) & 0xf]; 4353 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4354 *p++ = hexdigits[ucs & 0xf]; 4355 continue; 4356 } 4357 /* Fall through: isolated surrogates are copied as-is */ 4358 s--; 4359 size++; 4360 } 4361#endif 4362 /* Map 16-bit characters to '\uxxxx' */ 4363 if (ch >= 256) { 4364 *p++ = '\\'; 4365 *p++ = 'u'; 4366 *p++ = hexdigits[(ch >> 12) & 0xf]; 4367 *p++ = hexdigits[(ch >> 8) & 0xf]; 4368 *p++ = hexdigits[(ch >> 4) & 0xf]; 4369 *p++ = hexdigits[ch & 15]; 4370 } 4371 /* Copy everything else as-is */ 4372 else 4373 *p++ = (char) ch; 4374 } 4375 size = p - q; 4376 4377 assert(size > 0); 4378 if (_PyBytes_Resize(&repr, size) < 0) 4379 return NULL; 4380 return repr; 4381} 4382 4383PyObject * 4384PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4385{ 4386 PyObject *s; 4387 if (!PyUnicode_Check(unicode)) { 4388 PyErr_BadArgument(); 4389 return NULL; 4390 } 4391 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4392 PyUnicode_GET_SIZE(unicode)); 4393 4394 return s; 4395} 4396 4397/* --- Unicode Internal Codec ------------------------------------------- */ 4398 4399PyObject * 4400_PyUnicode_DecodeUnicodeInternal(const char *s, 4401 Py_ssize_t size, 4402 const char *errors) 4403{ 4404 const char *starts = s; 4405 Py_ssize_t startinpos; 4406 Py_ssize_t endinpos; 4407 Py_ssize_t outpos; 4408 PyUnicodeObject *v; 4409 Py_UNICODE *p; 4410 const char *end; 4411 const char *reason; 4412 PyObject *errorHandler = NULL; 4413 PyObject *exc = NULL; 4414 4415#ifdef Py_UNICODE_WIDE 4416 Py_UNICODE unimax = 
PyUnicode_GetMax(); 4417#endif 4418 4419 /* XXX overflow detection missing */ 4420 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4421 if (v == NULL) 4422 goto onError; 4423 if (PyUnicode_GetSize((PyObject *)v) == 0) 4424 return (PyObject *)v; 4425 p = PyUnicode_AS_UNICODE(v); 4426 end = s + size; 4427 4428 while (s < end) { 4429 memcpy(p, s, sizeof(Py_UNICODE)); 4430 /* We have to sanity check the raw data, otherwise doom looms for 4431 some malformed UCS-4 data. */ 4432 if ( 4433#ifdef Py_UNICODE_WIDE 4434 *p > unimax || *p < 0 || 4435#endif 4436 end-s < Py_UNICODE_SIZE 4437 ) 4438 { 4439 startinpos = s - starts; 4440 if (end-s < Py_UNICODE_SIZE) { 4441 endinpos = end-starts; 4442 reason = "truncated input"; 4443 } 4444 else { 4445 endinpos = s - starts + Py_UNICODE_SIZE; 4446 reason = "illegal code point (> 0x10FFFF)"; 4447 } 4448 outpos = p - PyUnicode_AS_UNICODE(v); 4449 if (unicode_decode_call_errorhandler( 4450 errors, &errorHandler, 4451 "unicode_internal", reason, 4452 &starts, &end, &startinpos, &endinpos, &exc, &s, 4453 &v, &outpos, &p)) { 4454 goto onError; 4455 } 4456 } 4457 else { 4458 p++; 4459 s += Py_UNICODE_SIZE; 4460 } 4461 } 4462 4463 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4464 goto onError; 4465 Py_XDECREF(errorHandler); 4466 Py_XDECREF(exc); 4467 return (PyObject *)v; 4468 4469 onError: 4470 Py_XDECREF(v); 4471 Py_XDECREF(errorHandler); 4472 Py_XDECREF(exc); 4473 return NULL; 4474} 4475 4476/* --- Latin-1 Codec ------------------------------------------------------ */ 4477 4478PyObject * 4479PyUnicode_DecodeLatin1(const char *s, 4480 Py_ssize_t size, 4481 const char *errors) 4482{ 4483 PyUnicodeObject *v; 4484 Py_UNICODE *p; 4485 const char *e, *unrolled_end; 4486 4487 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 4488 if (size == 1) { 4489 Py_UNICODE r = *(unsigned char*)s; 4490 return PyUnicode_FromUnicode(&r, 1); 4491 } 4492 4493 v = _PyUnicode_New(size); 4494 if (v == NULL) 4495 goto onError; 4496 if (size == 0) 4497 return (PyObject *)v; 4498 p = PyUnicode_AS_UNICODE(v); 4499 e = s + size; 4500 /* Unrolling the copy makes it much faster by reducing the looping 4501 overhead. This is similar to what many memcpy() implementations do. */ 4502 unrolled_end = e - 4; 4503 while (s < unrolled_end) { 4504 p[0] = (unsigned char) s[0]; 4505 p[1] = (unsigned char) s[1]; 4506 p[2] = (unsigned char) s[2]; 4507 p[3] = (unsigned char) s[3]; 4508 s += 4; 4509 p += 4; 4510 } 4511 while (s < e) 4512 *p++ = (unsigned char) *s++; 4513 return (PyObject *)v; 4514 4515 onError: 4516 Py_XDECREF(v); 4517 return NULL; 4518} 4519 4520/* create or adjust a UnicodeEncodeError */ 4521static void 4522make_encode_exception(PyObject **exceptionObject, 4523 const char *encoding, 4524 const Py_UNICODE *unicode, Py_ssize_t size, 4525 Py_ssize_t startpos, Py_ssize_t endpos, 4526 const char *reason) 4527{ 4528 if (*exceptionObject == NULL) { 4529 *exceptionObject = PyUnicodeEncodeError_Create( 4530 encoding, unicode, size, startpos, endpos, reason); 4531 } 4532 else { 4533 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4534 goto onError; 4535 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4536 goto onError; 4537 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4538 goto onError; 4539 return; 4540 onError: 4541 Py_DECREF(*exceptionObject); 4542 *exceptionObject = NULL; 4543 } 4544} 4545 4546/* raises a UnicodeEncodeError */ 4547static void 4548raise_encode_exception(PyObject **exceptionObject, 4549 const char *encoding, 4550 const Py_UNICODE *unicode, Py_ssize_t size, 4551 Py_ssize_t startpos, Py_ssize_t endpos, 4552 const char *reason) 4553{ 4554 make_encode_exception(exceptionObject, 4555 encoding, unicode, size, startpos, endpos, reason); 4556 if 
(*exceptionObject != NULL) 4557 PyCodec_StrictErrors(*exceptionObject); 4558} 4559 4560/* error handling callback helper: 4561 build arguments, call the callback and check the arguments, 4562 put the result into newpos and return the replacement string, which 4563 has to be freed by the caller */ 4564static PyObject * 4565unicode_encode_call_errorhandler(const char *errors, 4566 PyObject **errorHandler, 4567 const char *encoding, const char *reason, 4568 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4569 Py_ssize_t startpos, Py_ssize_t endpos, 4570 Py_ssize_t *newpos) 4571{ 4572 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4573 4574 PyObject *restuple; 4575 PyObject *resunicode; 4576 4577 if (*errorHandler == NULL) { 4578 *errorHandler = PyCodec_LookupError(errors); 4579 if (*errorHandler == NULL) 4580 return NULL; 4581 } 4582 4583 make_encode_exception(exceptionObject, 4584 encoding, unicode, size, startpos, endpos, reason); 4585 if (*exceptionObject == NULL) 4586 return NULL; 4587 4588 restuple = PyObject_CallFunctionObjArgs( 4589 *errorHandler, *exceptionObject, NULL); 4590 if (restuple == NULL) 4591 return NULL; 4592 if (!PyTuple_Check(restuple)) { 4593 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4594 Py_DECREF(restuple); 4595 return NULL; 4596 } 4597 if (!PyArg_ParseTuple(restuple, argparse, 4598 &resunicode, newpos)) { 4599 Py_DECREF(restuple); 4600 return NULL; 4601 } 4602 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4603 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4604 Py_DECREF(restuple); 4605 return NULL; 4606 } 4607 if (*newpos<0) 4608 *newpos = size+*newpos; 4609 if (*newpos<0 || *newpos>size) { 4610 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4611 Py_DECREF(restuple); 4612 return NULL; 4613 } 4614 Py_INCREF(resunicode); 4615 Py_DECREF(restuple); 4616 return resunicode; 4617} 4618 4619static PyObject * 
4620unicode_encode_ucs1(const Py_UNICODE *p, 4621 Py_ssize_t size, 4622 const char *errors, 4623 int limit) 4624{ 4625 /* output object */ 4626 PyObject *res; 4627 /* pointers to the beginning and end+1 of input */ 4628 const Py_UNICODE *startp = p; 4629 const Py_UNICODE *endp = p + size; 4630 /* pointer to the beginning of the unencodable characters */ 4631 /* const Py_UNICODE *badp = NULL; */ 4632 /* pointer into the output */ 4633 char *str; 4634 /* current output position */ 4635 Py_ssize_t ressize; 4636 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4637 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4638 PyObject *errorHandler = NULL; 4639 PyObject *exc = NULL; 4640 /* the following variable is used for caching string comparisons 4641 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4642 int known_errorHandler = -1; 4643 4644 /* allocate enough for a simple encoding without 4645 replacements, if we need more, we'll resize */ 4646 if (size == 0) 4647 return PyBytes_FromStringAndSize(NULL, 0); 4648 res = PyBytes_FromStringAndSize(NULL, size); 4649 if (res == NULL) 4650 return NULL; 4651 str = PyBytes_AS_STRING(res); 4652 ressize = size; 4653 4654 while (p<endp) { 4655 Py_UNICODE c = *p; 4656 4657 /* can we encode this? */ 4658 if (c<limit) { 4659 /* no overflow check, because we know that the space is enough */ 4660 *str++ = (char)c; 4661 ++p; 4662 } 4663 else { 4664 Py_ssize_t unicodepos = p-startp; 4665 Py_ssize_t requiredsize; 4666 PyObject *repunicode; 4667 Py_ssize_t repsize; 4668 Py_ssize_t newpos; 4669 Py_ssize_t respos; 4670 Py_UNICODE *uni2; 4671 /* startpos for collecting unencodable chars */ 4672 const Py_UNICODE *collstart = p; 4673 const Py_UNICODE *collend = p; 4674 /* find all unecodable characters */ 4675 while ((collend < endp) && ((*collend)>=limit)) 4676 ++collend; 4677 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 4678 if (known_errorHandler==-1) { 4679 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4680 known_errorHandler = 1; 4681 else if (!strcmp(errors, "replace")) 4682 known_errorHandler = 2; 4683 else if (!strcmp(errors, "ignore")) 4684 known_errorHandler = 3; 4685 else if (!strcmp(errors, "xmlcharrefreplace")) 4686 known_errorHandler = 4; 4687 else 4688 known_errorHandler = 0; 4689 } 4690 switch (known_errorHandler) { 4691 case 1: /* strict */ 4692 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4693 goto onError; 4694 case 2: /* replace */ 4695 while (collstart++<collend) 4696 *str++ = '?'; /* fall through */ 4697 case 3: /* ignore */ 4698 p = collend; 4699 break; 4700 case 4: /* xmlcharrefreplace */ 4701 respos = str - PyBytes_AS_STRING(res); 4702 /* determine replacement size (temporarily (mis)uses p) */ 4703 for (p = collstart, repsize = 0; p < collend; ++p) { 4704 if (*p<10) 4705 repsize += 2+1+1; 4706 else if (*p<100) 4707 repsize += 2+2+1; 4708 else if (*p<1000) 4709 repsize += 2+3+1; 4710 else if (*p<10000) 4711 repsize += 2+4+1; 4712#ifndef Py_UNICODE_WIDE 4713 else 4714 repsize += 2+5+1; 4715#else 4716 else if (*p<100000) 4717 repsize += 2+5+1; 4718 else if (*p<1000000) 4719 repsize += 2+6+1; 4720 else 4721 repsize += 2+7+1; 4722#endif 4723 } 4724 requiredsize = respos+repsize+(endp-collend); 4725 if (requiredsize > ressize) { 4726 if (requiredsize<2*ressize) 4727 requiredsize = 2*ressize; 4728 if (_PyBytes_Resize(&res, requiredsize)) 4729 goto onError; 4730 str = PyBytes_AS_STRING(res) + respos; 4731 ressize = requiredsize; 4732 } 4733 /* generate replacement (temporarily (mis)uses p) */ 4734 for (p = collstart; p < collend; ++p) { 4735 str += sprintf(str, "&#%d;", (int)*p); 4736 } 4737 p = collend; 4738 break; 4739 default: 4740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4741 encoding, reason, startp, size, &exc, 4742 collstart-startp, collend-startp, 
&newpos); 4743 if (repunicode == NULL) 4744 goto onError; 4745 if (PyBytes_Check(repunicode)) { 4746 /* Directly copy bytes result to output. */ 4747 repsize = PyBytes_Size(repunicode); 4748 if (repsize > 1) { 4749 /* Make room for all additional bytes. */ 4750 respos = str - PyBytes_AS_STRING(res); 4751 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 4752 Py_DECREF(repunicode); 4753 goto onError; 4754 } 4755 str = PyBytes_AS_STRING(res) + respos; 4756 ressize += repsize-1; 4757 } 4758 memcpy(str, PyBytes_AsString(repunicode), repsize); 4759 str += repsize; 4760 p = startp + newpos; 4761 Py_DECREF(repunicode); 4762 break; 4763 } 4764 /* need more space? (at least enough for what we 4765 have+the replacement+the rest of the string, so 4766 we won't have to check space for encodable characters) */ 4767 respos = str - PyBytes_AS_STRING(res); 4768 repsize = PyUnicode_GET_SIZE(repunicode); 4769 requiredsize = respos+repsize+(endp-collend); 4770 if (requiredsize > ressize) { 4771 if (requiredsize<2*ressize) 4772 requiredsize = 2*ressize; 4773 if (_PyBytes_Resize(&res, requiredsize)) { 4774 Py_DECREF(repunicode); 4775 goto onError; 4776 } 4777 str = PyBytes_AS_STRING(res) + respos; 4778 ressize = requiredsize; 4779 } 4780 /* check if there is anything unencodable in the replacement 4781 and copy it to the output */ 4782 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4783 c = *uni2; 4784 if (c >= limit) { 4785 raise_encode_exception(&exc, encoding, startp, size, 4786 unicodepos, unicodepos+1, reason); 4787 Py_DECREF(repunicode); 4788 goto onError; 4789 } 4790 *str = (char)c; 4791 } 4792 p = startp + newpos; 4793 Py_DECREF(repunicode); 4794 } 4795 } 4796 } 4797 /* Resize if we allocated to much */ 4798 size = str - PyBytes_AS_STRING(res); 4799 if (size < ressize) { /* If this falls res will be NULL */ 4800 assert(size >= 0); 4801 if (_PyBytes_Resize(&res, size) < 0) 4802 goto onError; 4803 } 4804 4805 Py_XDECREF(errorHandler); 4806 
Py_XDECREF(exc); 4807 return res; 4808 4809 onError: 4810 Py_XDECREF(res); 4811 Py_XDECREF(errorHandler); 4812 Py_XDECREF(exc); 4813 return NULL; 4814} 4815 4816PyObject * 4817PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4818 Py_ssize_t size, 4819 const char *errors) 4820{ 4821 return unicode_encode_ucs1(p, size, errors, 256); 4822} 4823 4824PyObject * 4825PyUnicode_AsLatin1String(PyObject *unicode) 4826{ 4827 if (!PyUnicode_Check(unicode)) { 4828 PyErr_BadArgument(); 4829 return NULL; 4830 } 4831 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4832 PyUnicode_GET_SIZE(unicode), 4833 NULL); 4834} 4835 4836/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4837 4838PyObject * 4839PyUnicode_DecodeASCII(const char *s, 4840 Py_ssize_t size, 4841 const char *errors) 4842{ 4843 const char *starts = s; 4844 PyUnicodeObject *v; 4845 Py_UNICODE *p; 4846 Py_ssize_t startinpos; 4847 Py_ssize_t endinpos; 4848 Py_ssize_t outpos; 4849 const char *e; 4850 PyObject *errorHandler = NULL; 4851 PyObject *exc = NULL; 4852 4853 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4854 if (size == 1 && *(unsigned char*)s < 128) { 4855 Py_UNICODE r = *(unsigned char*)s; 4856 return PyUnicode_FromUnicode(&r, 1); 4857 } 4858 4859 v = _PyUnicode_New(size); 4860 if (v == NULL) 4861 goto onError; 4862 if (size == 0) 4863 return (PyObject *)v; 4864 p = PyUnicode_AS_UNICODE(v); 4865 e = s + size; 4866 while (s < e) { 4867 register unsigned char c = (unsigned char)*s; 4868 if (c < 128) { 4869 *p++ = c; 4870 ++s; 4871 } 4872 else { 4873 startinpos = s-starts; 4874 endinpos = startinpos + 1; 4875 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4876 if (unicode_decode_call_errorhandler( 4877 errors, &errorHandler, 4878 "ascii", "ordinal not in range(128)", 4879 &starts, &e, &startinpos, &endinpos, &exc, &s, 4880 &v, &outpos, &p)) 4881 goto onError; 4882 } 4883 } 4884 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4885 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4886 goto onError; 4887 Py_XDECREF(errorHandler); 4888 Py_XDECREF(exc); 4889 return (PyObject *)v; 4890 4891 onError: 4892 Py_XDECREF(v); 4893 Py_XDECREF(errorHandler); 4894 Py_XDECREF(exc); 4895 return NULL; 4896} 4897 4898PyObject * 4899PyUnicode_EncodeASCII(const Py_UNICODE *p, 4900 Py_ssize_t size, 4901 const char *errors) 4902{ 4903 return unicode_encode_ucs1(p, size, errors, 128); 4904} 4905 4906PyObject * 4907PyUnicode_AsASCIIString(PyObject *unicode) 4908{ 4909 if (!PyUnicode_Check(unicode)) { 4910 PyErr_BadArgument(); 4911 return NULL; 4912 } 4913 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4914 PyUnicode_GET_SIZE(unicode), 4915 NULL); 4916} 4917 4918#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4919 4920/* --- MBCS codecs for Windows -------------------------------------------- */ 4921 4922#if SIZEOF_INT < SIZEOF_SIZE_T 4923#define NEED_RETRY 4924#endif 4925 4926/* XXX This code is limited to "true" double-byte encodings, as 4927 a) it assumes an incomplete character consists of a single byte, and 4928 b) IsDBCSLeadByte 
(probably) does not work for non-DBCS multi-byte 4929 encodings, see IsDBCSLeadByteEx documentation. */ 4930 4931static int 4932is_dbcs_lead_byte(const char *s, int offset) 4933{ 4934 const char *curr = s + offset; 4935 4936 if (IsDBCSLeadByte(*curr)) { 4937 const char *prev = CharPrev(s, curr); 4938 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4939 } 4940 return 0; 4941} 4942 4943/* 4944 * Decode MBCS string into unicode object. If 'final' is set, converts 4945 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4946 */ 4947static int 4948decode_mbcs(PyUnicodeObject **v, 4949 const char *s, /* MBCS string */ 4950 int size, /* sizeof MBCS string */ 4951 int final, 4952 const char *errors) 4953{ 4954 Py_UNICODE *p; 4955 Py_ssize_t n; 4956 DWORD usize; 4957 DWORD flags; 4958 4959 assert(size >= 0); 4960 4961 /* check and handle 'errors' arg */ 4962 if (errors==NULL || strcmp(errors, "strict")==0) 4963 flags = MB_ERR_INVALID_CHARS; 4964 else if (strcmp(errors, "ignore")==0) 4965 flags = 0; 4966 else { 4967 PyErr_Format(PyExc_ValueError, 4968 "mbcs encoding does not support errors='%s'", 4969 errors); 4970 return -1; 4971 } 4972 4973 /* Skip trailing lead-byte unless 'final' is set */ 4974 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4975 --size; 4976 4977 /* First get the size of the result */ 4978 if (size > 0) { 4979 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 4980 if (usize==0) 4981 goto mbcs_decode_error; 4982 } else 4983 usize = 0; 4984 4985 if (*v == NULL) { 4986 /* Create unicode object */ 4987 *v = _PyUnicode_New(usize); 4988 if (*v == NULL) 4989 return -1; 4990 n = 0; 4991 } 4992 else { 4993 /* Extend unicode object */ 4994 n = PyUnicode_GET_SIZE(*v); 4995 if (_PyUnicode_Resize(v, n + usize) < 0) 4996 return -1; 4997 } 4998 4999 /* Do the conversion */ 5000 if (usize > 0) { 5001 p = PyUnicode_AS_UNICODE(*v) + n; 5002 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) 
{ 5003 goto mbcs_decode_error; 5004 } 5005 } 5006 return size; 5007 5008mbcs_decode_error: 5009 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 5010 we raise a UnicodeDecodeError - else it is a 'generic' 5011 windows error 5012 */ 5013 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 5014 /* Ideally, we should get reason from FormatMessage - this 5015 is the Windows 2000 English version of the message 5016 */ 5017 PyObject *exc = NULL; 5018 const char *reason = "No mapping for the Unicode character exists " 5019 "in the target multi-byte code page."; 5020 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 5021 if (exc != NULL) { 5022 PyCodec_StrictErrors(exc); 5023 Py_DECREF(exc); 5024 } 5025 } else { 5026 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5027 } 5028 return -1; 5029} 5030 5031PyObject * 5032PyUnicode_DecodeMBCSStateful(const char *s, 5033 Py_ssize_t size, 5034 const char *errors, 5035 Py_ssize_t *consumed) 5036{ 5037 PyUnicodeObject *v = NULL; 5038 int done; 5039 5040 if (consumed) 5041 *consumed = 0; 5042 5043#ifdef NEED_RETRY 5044 retry: 5045 if (size > INT_MAX) 5046 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 5047 else 5048#endif 5049 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 5050 5051 if (done < 0) { 5052 Py_XDECREF(v); 5053 return NULL; 5054 } 5055 5056 if (consumed) 5057 *consumed += done; 5058 5059#ifdef NEED_RETRY 5060 if (size > INT_MAX) { 5061 s += done; 5062 size -= done; 5063 goto retry; 5064 } 5065#endif 5066 5067 return (PyObject *)v; 5068} 5069 5070PyObject * 5071PyUnicode_DecodeMBCS(const char *s, 5072 Py_ssize_t size, 5073 const char *errors) 5074{ 5075 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 5076} 5077 5078/* 5079 * Convert unicode into string object (MBCS). 5080 * Returns 0 if succeed, -1 otherwise. 
5081 */ 5082static int 5083encode_mbcs(PyObject **repr, 5084 const Py_UNICODE *p, /* unicode */ 5085 int size, /* size of unicode */ 5086 const char* errors) 5087{ 5088 BOOL usedDefaultChar = FALSE; 5089 BOOL *pusedDefaultChar; 5090 int mbcssize; 5091 Py_ssize_t n; 5092 PyObject *exc = NULL; 5093 DWORD flags; 5094 5095 assert(size >= 0); 5096 5097 /* check and handle 'errors' arg */ 5098 if (errors==NULL || strcmp(errors, "strict")==0) { 5099 flags = WC_NO_BEST_FIT_CHARS; 5100 pusedDefaultChar = &usedDefaultChar; 5101 } else if (strcmp(errors, "replace")==0) { 5102 flags = 0; 5103 pusedDefaultChar = NULL; 5104 } else { 5105 PyErr_Format(PyExc_ValueError, 5106 "mbcs encoding does not support errors='%s'", 5107 errors); 5108 return -1; 5109 } 5110 5111 /* First get the size of the result */ 5112 if (size > 0) { 5113 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 5114 NULL, pusedDefaultChar); 5115 if (mbcssize == 0) { 5116 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5117 return -1; 5118 } 5119 /* If we used a default char, then we failed! 
*/ 5120 if (pusedDefaultChar && *pusedDefaultChar) 5121 goto mbcs_encode_error; 5122 } else { 5123 mbcssize = 0; 5124 } 5125 5126 if (*repr == NULL) { 5127 /* Create string object */ 5128 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 5129 if (*repr == NULL) 5130 return -1; 5131 n = 0; 5132 } 5133 else { 5134 /* Extend string object */ 5135 n = PyBytes_Size(*repr); 5136 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 5137 return -1; 5138 } 5139 5140 /* Do the conversion */ 5141 if (size > 0) { 5142 char *s = PyBytes_AS_STRING(*repr) + n; 5143 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 5144 NULL, pusedDefaultChar)) { 5145 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5146 return -1; 5147 } 5148 if (pusedDefaultChar && *pusedDefaultChar) 5149 goto mbcs_encode_error; 5150 } 5151 return 0; 5152 5153mbcs_encode_error: 5154 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 5155 Py_XDECREF(exc); 5156 return -1; 5157} 5158 5159PyObject * 5160PyUnicode_EncodeMBCS(const Py_UNICODE *p, 5161 Py_ssize_t size, 5162 const char *errors) 5163{ 5164 PyObject *repr = NULL; 5165 int ret; 5166 5167#ifdef NEED_RETRY 5168 retry: 5169 if (size > INT_MAX) 5170 ret = encode_mbcs(&repr, p, INT_MAX, errors); 5171 else 5172#endif 5173 ret = encode_mbcs(&repr, p, (int)size, errors); 5174 5175 if (ret < 0) { 5176 Py_XDECREF(repr); 5177 return NULL; 5178 } 5179 5180#ifdef NEED_RETRY 5181 if (size > INT_MAX) { 5182 p += INT_MAX; 5183 size -= INT_MAX; 5184 goto retry; 5185 } 5186#endif 5187 5188 return repr; 5189} 5190 5191PyObject * 5192PyUnicode_AsMBCSString(PyObject *unicode) 5193{ 5194 if (!PyUnicode_Check(unicode)) { 5195 PyErr_BadArgument(); 5196 return NULL; 5197 } 5198 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 5199 PyUnicode_GET_SIZE(unicode), 5200 NULL); 5201} 5202 5203#undef NEED_RETRY 5204 5205#endif /* MS_WINDOWS */ 5206 5207/* --- Character Mapping Codec -------------------------------------------- */ 5208 5209PyObject * 
5210PyUnicode_DecodeCharmap(const char *s, 5211 Py_ssize_t size, 5212 PyObject *mapping, 5213 const char *errors) 5214{ 5215 const char *starts = s; 5216 Py_ssize_t startinpos; 5217 Py_ssize_t endinpos; 5218 Py_ssize_t outpos; 5219 const char *e; 5220 PyUnicodeObject *v; 5221 Py_UNICODE *p; 5222 Py_ssize_t extrachars = 0; 5223 PyObject *errorHandler = NULL; 5224 PyObject *exc = NULL; 5225 Py_UNICODE *mapstring = NULL; 5226 Py_ssize_t maplen = 0; 5227 5228 /* Default to Latin-1 */ 5229 if (mapping == NULL) 5230 return PyUnicode_DecodeLatin1(s, size, errors); 5231 5232 v = _PyUnicode_New(size); 5233 if (v == NULL) 5234 goto onError; 5235 if (size == 0) 5236 return (PyObject *)v; 5237 p = PyUnicode_AS_UNICODE(v); 5238 e = s + size; 5239 if (PyUnicode_CheckExact(mapping)) { 5240 mapstring = PyUnicode_AS_UNICODE(mapping); 5241 maplen = PyUnicode_GET_SIZE(mapping); 5242 while (s < e) { 5243 unsigned char ch = *s; 5244 Py_UNICODE x = 0xfffe; /* illegal value */ 5245 5246 if (ch < maplen) 5247 x = mapstring[ch]; 5248 5249 if (x == 0xfffe) { 5250 /* undefined mapping */ 5251 outpos = p-PyUnicode_AS_UNICODE(v); 5252 startinpos = s-starts; 5253 endinpos = startinpos+1; 5254 if (unicode_decode_call_errorhandler( 5255 errors, &errorHandler, 5256 "charmap", "character maps to <undefined>", 5257 &starts, &e, &startinpos, &endinpos, &exc, &s, 5258 &v, &outpos, &p)) { 5259 goto onError; 5260 } 5261 continue; 5262 } 5263 *p++ = x; 5264 ++s; 5265 } 5266 } 5267 else { 5268 while (s < e) { 5269 unsigned char ch = *s; 5270 PyObject *w, *x; 5271 5272 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 5273 w = PyLong_FromLong((long)ch); 5274 if (w == NULL) 5275 goto onError; 5276 x = PyObject_GetItem(mapping, w); 5277 Py_DECREF(w); 5278 if (x == NULL) { 5279 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5280 /* No mapping found means: mapping is undefined. 
*/ 5281 PyErr_Clear(); 5282 x = Py_None; 5283 Py_INCREF(x); 5284 } else 5285 goto onError; 5286 } 5287 5288 /* Apply mapping */ 5289 if (PyLong_Check(x)) { 5290 long value = PyLong_AS_LONG(x); 5291 if (value < 0 || value > 65535) { 5292 PyErr_SetString(PyExc_TypeError, 5293 "character mapping must be in range(65536)"); 5294 Py_DECREF(x); 5295 goto onError; 5296 } 5297 *p++ = (Py_UNICODE)value; 5298 } 5299 else if (x == Py_None) { 5300 /* undefined mapping */ 5301 outpos = p-PyUnicode_AS_UNICODE(v); 5302 startinpos = s-starts; 5303 endinpos = startinpos+1; 5304 if (unicode_decode_call_errorhandler( 5305 errors, &errorHandler, 5306 "charmap", "character maps to <undefined>", 5307 &starts, &e, &startinpos, &endinpos, &exc, &s, 5308 &v, &outpos, &p)) { 5309 Py_DECREF(x); 5310 goto onError; 5311 } 5312 Py_DECREF(x); 5313 continue; 5314 } 5315 else if (PyUnicode_Check(x)) { 5316 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 5317 5318 if (targetsize == 1) 5319 /* 1-1 mapping */ 5320 *p++ = *PyUnicode_AS_UNICODE(x); 5321 5322 else if (targetsize > 1) { 5323 /* 1-n mapping */ 5324 if (targetsize > extrachars) { 5325 /* resize first */ 5326 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 5327 Py_ssize_t needed = (targetsize - extrachars) + \ 5328 (targetsize << 2); 5329 extrachars += needed; 5330 /* XXX overflow detection missing */ 5331 if (_PyUnicode_Resize(&v, 5332 PyUnicode_GET_SIZE(v) + needed) < 0) { 5333 Py_DECREF(x); 5334 goto onError; 5335 } 5336 p = PyUnicode_AS_UNICODE(v) + oldpos; 5337 } 5338 Py_UNICODE_COPY(p, 5339 PyUnicode_AS_UNICODE(x), 5340 targetsize); 5341 p += targetsize; 5342 extrachars -= targetsize; 5343 } 5344 /* 1-0 mapping: skip the character */ 5345 } 5346 else { 5347 /* wrong return value */ 5348 PyErr_SetString(PyExc_TypeError, 5349 "character mapping must return integer, None or str"); 5350 Py_DECREF(x); 5351 goto onError; 5352 } 5353 Py_DECREF(x); 5354 ++s; 5355 } 5356 } 5357 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 5358 if 
(_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5359 goto onError; 5360 Py_XDECREF(errorHandler); 5361 Py_XDECREF(exc); 5362 return (PyObject *)v; 5363 5364 onError: 5365 Py_XDECREF(errorHandler); 5366 Py_XDECREF(exc); 5367 Py_XDECREF(v); 5368 return NULL; 5369} 5370 5371/* Charmap encoding: the lookup table */ 5372 5373struct encoding_map { 5374 PyObject_HEAD 5375 unsigned char level1[32]; 5376 int count2, count3; 5377 unsigned char level23[1]; 5378}; 5379 5380static PyObject* 5381encoding_map_size(PyObject *obj, PyObject* args) 5382{ 5383 struct encoding_map *map = (struct encoding_map*)obj; 5384 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5385 128*map->count3); 5386} 5387 5388static PyMethodDef encoding_map_methods[] = { 5389 {"size", encoding_map_size, METH_NOARGS, 5390 PyDoc_STR("Return the size (in bytes) of this object") }, 5391 { 0 } 5392}; 5393 5394static void 5395encoding_map_dealloc(PyObject* o) 5396{ 5397 PyObject_FREE(o); 5398} 5399 5400static PyTypeObject EncodingMapType = { 5401 PyVarObject_HEAD_INIT(NULL, 0) 5402 "EncodingMap", /*tp_name*/ 5403 sizeof(struct encoding_map), /*tp_basicsize*/ 5404 0, /*tp_itemsize*/ 5405 /* methods */ 5406 encoding_map_dealloc, /*tp_dealloc*/ 5407 0, /*tp_print*/ 5408 0, /*tp_getattr*/ 5409 0, /*tp_setattr*/ 5410 0, /*tp_reserved*/ 5411 0, /*tp_repr*/ 5412 0, /*tp_as_number*/ 5413 0, /*tp_as_sequence*/ 5414 0, /*tp_as_mapping*/ 5415 0, /*tp_hash*/ 5416 0, /*tp_call*/ 5417 0, /*tp_str*/ 5418 0, /*tp_getattro*/ 5419 0, /*tp_setattro*/ 5420 0, /*tp_as_buffer*/ 5421 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5422 0, /*tp_doc*/ 5423 0, /*tp_traverse*/ 5424 0, /*tp_clear*/ 5425 0, /*tp_richcompare*/ 5426 0, /*tp_weaklistoffset*/ 5427 0, /*tp_iter*/ 5428 0, /*tp_iternext*/ 5429 encoding_map_methods, /*tp_methods*/ 5430 0, /*tp_members*/ 5431 0, /*tp_getset*/ 5432 0, /*tp_base*/ 5433 0, /*tp_dict*/ 5434 0, /*tp_descr_get*/ 5435 0, /*tp_descr_set*/ 5436 0, /*tp_dictoffset*/ 5437 0, /*tp_init*/ 5438 0, /*tp_alloc*/ 
5439 0, /*tp_new*/ 5440 0, /*tp_free*/ 5441 0, /*tp_is_gc*/ 5442}; 5443 5444PyObject* 5445PyUnicode_BuildEncodingMap(PyObject* string) 5446{ 5447 Py_UNICODE *decode; 5448 PyObject *result; 5449 struct encoding_map *mresult; 5450 int i; 5451 int need_dict = 0; 5452 unsigned char level1[32]; 5453 unsigned char level2[512]; 5454 unsigned char *mlevel1, *mlevel2, *mlevel3; 5455 int count2 = 0, count3 = 0; 5456 5457 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5458 PyErr_BadArgument(); 5459 return NULL; 5460 } 5461 decode = PyUnicode_AS_UNICODE(string); 5462 memset(level1, 0xFF, sizeof level1); 5463 memset(level2, 0xFF, sizeof level2); 5464 5465 /* If there isn't a one-to-one mapping of NULL to \0, 5466 or if there are non-BMP characters, we need to use 5467 a mapping dictionary. */ 5468 if (decode[0] != 0) 5469 need_dict = 1; 5470 for (i = 1; i < 256; i++) { 5471 int l1, l2; 5472 if (decode[i] == 0 5473#ifdef Py_UNICODE_WIDE 5474 || decode[i] > 0xFFFF 5475#endif 5476 ) { 5477 need_dict = 1; 5478 break; 5479 } 5480 if (decode[i] == 0xFFFE) 5481 /* unmapped character */ 5482 continue; 5483 l1 = decode[i] >> 11; 5484 l2 = decode[i] >> 7; 5485 if (level1[l1] == 0xFF) 5486 level1[l1] = count2++; 5487 if (level2[l2] == 0xFF) 5488 level2[l2] = count3++; 5489 } 5490 5491 if (count2 >= 0xFF || count3 >= 0xFF) 5492 need_dict = 1; 5493 5494 if (need_dict) { 5495 PyObject *result = PyDict_New(); 5496 PyObject *key, *value; 5497 if (!result) 5498 return NULL; 5499 for (i = 0; i < 256; i++) { 5500 key = PyLong_FromLong(decode[i]); 5501 value = PyLong_FromLong(i); 5502 if (!key || !value) 5503 goto failed1; 5504 if (PyDict_SetItem(result, key, value) == -1) 5505 goto failed1; 5506 Py_DECREF(key); 5507 Py_DECREF(value); 5508 } 5509 return result; 5510 failed1: 5511 Py_XDECREF(key); 5512 Py_XDECREF(value); 5513 Py_DECREF(result); 5514 return NULL; 5515 } 5516 5517 /* Create a three-level trie */ 5518 result = PyObject_MALLOC(sizeof(struct encoding_map) + 5519 
16*count2 + 128*count3 - 1); 5520 if (!result) 5521 return PyErr_NoMemory(); 5522 PyObject_Init(result, &EncodingMapType); 5523 mresult = (struct encoding_map*)result; 5524 mresult->count2 = count2; 5525 mresult->count3 = count3; 5526 mlevel1 = mresult->level1; 5527 mlevel2 = mresult->level23; 5528 mlevel3 = mresult->level23 + 16*count2; 5529 memcpy(mlevel1, level1, 32); 5530 memset(mlevel2, 0xFF, 16*count2); 5531 memset(mlevel3, 0, 128*count3); 5532 count3 = 0; 5533 for (i = 1; i < 256; i++) { 5534 int o1, o2, o3, i2, i3; 5535 if (decode[i] == 0xFFFE) 5536 /* unmapped character */ 5537 continue; 5538 o1 = decode[i]>>11; 5539 o2 = (decode[i]>>7) & 0xF; 5540 i2 = 16*mlevel1[o1] + o2; 5541 if (mlevel2[i2] == 0xFF) 5542 mlevel2[i2] = count3++; 5543 o3 = decode[i] & 0x7F; 5544 i3 = 128*mlevel2[i2] + o3; 5545 mlevel3[i3] = i; 5546 } 5547 return result; 5548} 5549 5550static int 5551encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5552{ 5553 struct encoding_map *map = (struct encoding_map*)mapping; 5554 int l1 = c>>11; 5555 int l2 = (c>>7) & 0xF; 5556 int l3 = c & 0x7F; 5557 int i; 5558 5559#ifdef Py_UNICODE_WIDE 5560 if (c > 0xFFFF) { 5561 return -1; 5562 } 5563#endif 5564 if (c == 0) 5565 return 0; 5566 /* level 1*/ 5567 i = map->level1[l1]; 5568 if (i == 0xFF) { 5569 return -1; 5570 } 5571 /* level 2*/ 5572 i = map->level23[16*i+l2]; 5573 if (i == 0xFF) { 5574 return -1; 5575 } 5576 /* level 3 */ 5577 i = map->level23[16*map->count2 + 128*i + l3]; 5578 if (i == 0) { 5579 return -1; 5580 } 5581 return i; 5582} 5583 5584/* Lookup the character ch in the mapping. If the character 5585 can't be found, Py_None is returned (or NULL, if another 5586 error occurred). 
*/ 5587static PyObject * 5588charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5589{ 5590 PyObject *w = PyLong_FromLong((long)c); 5591 PyObject *x; 5592 5593 if (w == NULL) 5594 return NULL; 5595 x = PyObject_GetItem(mapping, w); 5596 Py_DECREF(w); 5597 if (x == NULL) { 5598 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5599 /* No mapping found means: mapping is undefined. */ 5600 PyErr_Clear(); 5601 x = Py_None; 5602 Py_INCREF(x); 5603 return x; 5604 } else 5605 return NULL; 5606 } 5607 else if (x == Py_None) 5608 return x; 5609 else if (PyLong_Check(x)) { 5610 long value = PyLong_AS_LONG(x); 5611 if (value < 0 || value > 255) { 5612 PyErr_SetString(PyExc_TypeError, 5613 "character mapping must be in range(256)"); 5614 Py_DECREF(x); 5615 return NULL; 5616 } 5617 return x; 5618 } 5619 else if (PyBytes_Check(x)) 5620 return x; 5621 else { 5622 /* wrong return value */ 5623 PyErr_Format(PyExc_TypeError, 5624 "character mapping must return integer, bytes or None, not %.400s", 5625 x->ob_type->tp_name); 5626 Py_DECREF(x); 5627 return NULL; 5628 } 5629} 5630 5631static int 5632charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5633{ 5634 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5635 /* exponentially overallocate to minimize reallocations */ 5636 if (requiredsize < 2*outsize) 5637 requiredsize = 2*outsize; 5638 if (_PyBytes_Resize(outobj, requiredsize)) 5639 return -1; 5640 return 0; 5641} 5642 5643typedef enum charmapencode_result { 5644 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5645} charmapencode_result; 5646/* lookup the character, put the result in the output string and adjust 5647 various state variables. Resize the output bytes object if not enough 5648 space is available. Return a new reference to the object that 5649 was put in the output buffer, or Py_None, if the mapping was undefined 5650 (in which case no character was written) or NULL, if a 5651 reallocation error occurred. 
The caller must decref the result */ 5652static charmapencode_result 5653charmapencode_output(Py_UNICODE c, PyObject *mapping, 5654 PyObject **outobj, Py_ssize_t *outpos) 5655{ 5656 PyObject *rep; 5657 char *outstart; 5658 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5659 5660 if (Py_TYPE(mapping) == &EncodingMapType) { 5661 int res = encoding_map_lookup(c, mapping); 5662 Py_ssize_t requiredsize = *outpos+1; 5663 if (res == -1) 5664 return enc_FAILED; 5665 if (outsize<requiredsize) 5666 if (charmapencode_resize(outobj, outpos, requiredsize)) 5667 return enc_EXCEPTION; 5668 outstart = PyBytes_AS_STRING(*outobj); 5669 outstart[(*outpos)++] = (char)res; 5670 return enc_SUCCESS; 5671 } 5672 5673 rep = charmapencode_lookup(c, mapping); 5674 if (rep==NULL) 5675 return enc_EXCEPTION; 5676 else if (rep==Py_None) { 5677 Py_DECREF(rep); 5678 return enc_FAILED; 5679 } else { 5680 if (PyLong_Check(rep)) { 5681 Py_ssize_t requiredsize = *outpos+1; 5682 if (outsize<requiredsize) 5683 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5684 Py_DECREF(rep); 5685 return enc_EXCEPTION; 5686 } 5687 outstart = PyBytes_AS_STRING(*outobj); 5688 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5689 } 5690 else { 5691 const char *repchars = PyBytes_AS_STRING(rep); 5692 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5693 Py_ssize_t requiredsize = *outpos+repsize; 5694 if (outsize<requiredsize) 5695 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5696 Py_DECREF(rep); 5697 return enc_EXCEPTION; 5698 } 5699 outstart = PyBytes_AS_STRING(*outobj); 5700 memcpy(outstart + *outpos, repchars, repsize); 5701 *outpos += repsize; 5702 } 5703 } 5704 Py_DECREF(rep); 5705 return enc_SUCCESS; 5706} 5707 5708/* handle an error in PyUnicode_EncodeCharmap 5709 Return 0 on success, -1 on error */ 5710static int 5711charmap_encoding_error( 5712 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5713 PyObject **exceptionObject, 5714 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 5715 PyObject **res, Py_ssize_t *respos) 5716{ 5717 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5718 Py_ssize_t repsize; 5719 Py_ssize_t newpos; 5720 Py_UNICODE *uni2; 5721 /* startpos for collecting unencodable chars */ 5722 Py_ssize_t collstartpos = *inpos; 5723 Py_ssize_t collendpos = *inpos+1; 5724 Py_ssize_t collpos; 5725 char *encoding = "charmap"; 5726 char *reason = "character maps to <undefined>"; 5727 charmapencode_result x; 5728 5729 /* find all unencodable characters */ 5730 while (collendpos < size) { 5731 PyObject *rep; 5732 if (Py_TYPE(mapping) == &EncodingMapType) { 5733 int res = encoding_map_lookup(p[collendpos], mapping); 5734 if (res != -1) 5735 break; 5736 ++collendpos; 5737 continue; 5738 } 5739 5740 rep = charmapencode_lookup(p[collendpos], mapping); 5741 if (rep==NULL) 5742 return -1; 5743 else if (rep!=Py_None) { 5744 Py_DECREF(rep); 5745 break; 5746 } 5747 Py_DECREF(rep); 5748 ++collendpos; 5749 } 5750 /* cache callback name lookup 5751 * (if not done yet, i.e. 
it's the first error) */ 5752 if (*known_errorHandler==-1) { 5753 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5754 *known_errorHandler = 1; 5755 else if (!strcmp(errors, "replace")) 5756 *known_errorHandler = 2; 5757 else if (!strcmp(errors, "ignore")) 5758 *known_errorHandler = 3; 5759 else if (!strcmp(errors, "xmlcharrefreplace")) 5760 *known_errorHandler = 4; 5761 else 5762 *known_errorHandler = 0; 5763 } 5764 switch (*known_errorHandler) { 5765 case 1: /* strict */ 5766 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5767 return -1; 5768 case 2: /* replace */ 5769 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5770 x = charmapencode_output('?', mapping, res, respos); 5771 if (x==enc_EXCEPTION) { 5772 return -1; 5773 } 5774 else if (x==enc_FAILED) { 5775 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5776 return -1; 5777 } 5778 } 5779 /* fall through */ 5780 case 3: /* ignore */ 5781 *inpos = collendpos; 5782 break; 5783 case 4: /* xmlcharrefreplace */ 5784 /* generate replacement (temporarily (mis)uses p) */ 5785 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5786 char buffer[2+29+1+1]; 5787 char *cp; 5788 sprintf(buffer, "&#%d;", (int)p[collpos]); 5789 for (cp = buffer; *cp; ++cp) { 5790 x = charmapencode_output(*cp, mapping, res, respos); 5791 if (x==enc_EXCEPTION) 5792 return -1; 5793 else if (x==enc_FAILED) { 5794 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5795 return -1; 5796 } 5797 } 5798 } 5799 *inpos = collendpos; 5800 break; 5801 default: 5802 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5803 encoding, reason, p, size, exceptionObject, 5804 collstartpos, collendpos, &newpos); 5805 if (repunicode == NULL) 5806 return -1; 5807 if (PyBytes_Check(repunicode)) { 5808 /* Directly copy bytes result to output. 
*/ 5809 Py_ssize_t outsize = PyBytes_Size(*res); 5810 Py_ssize_t requiredsize; 5811 repsize = PyBytes_Size(repunicode); 5812 requiredsize = *respos + repsize; 5813 if (requiredsize > outsize) 5814 /* Make room for all additional bytes. */ 5815 if (charmapencode_resize(res, respos, requiredsize)) { 5816 Py_DECREF(repunicode); 5817 return -1; 5818 } 5819 memcpy(PyBytes_AsString(*res) + *respos, 5820 PyBytes_AsString(repunicode), repsize); 5821 *respos += repsize; 5822 *inpos = newpos; 5823 Py_DECREF(repunicode); 5824 break; 5825 } 5826 /* generate replacement */ 5827 repsize = PyUnicode_GET_SIZE(repunicode); 5828 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5829 x = charmapencode_output(*uni2, mapping, res, respos); 5830 if (x==enc_EXCEPTION) { 5831 return -1; 5832 } 5833 else if (x==enc_FAILED) { 5834 Py_DECREF(repunicode); 5835 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5836 return -1; 5837 } 5838 } 5839 *inpos = newpos; 5840 Py_DECREF(repunicode); 5841 } 5842 return 0; 5843} 5844 5845PyObject * 5846PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5847 Py_ssize_t size, 5848 PyObject *mapping, 5849 const char *errors) 5850{ 5851 /* output object */ 5852 PyObject *res = NULL; 5853 /* current input position */ 5854 Py_ssize_t inpos = 0; 5855 /* current output position */ 5856 Py_ssize_t respos = 0; 5857 PyObject *errorHandler = NULL; 5858 PyObject *exc = NULL; 5859 /* the following variable is used for caching string comparisons 5860 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5861 * 3=ignore, 4=xmlcharrefreplace */ 5862 int known_errorHandler = -1; 5863 5864 /* Default to Latin-1 */ 5865 if (mapping == NULL) 5866 return PyUnicode_EncodeLatin1(p, size, errors); 5867 5868 /* allocate enough for a simple encoding without 5869 replacements, if we need more, we'll resize */ 5870 res = PyBytes_FromStringAndSize(NULL, size); 5871 if (res == NULL) 5872 goto onError; 5873 if (size == 0) 5874 
return res; 5875 5876 while (inpos<size) { 5877 /* try to encode it */ 5878 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5879 if (x==enc_EXCEPTION) /* error */ 5880 goto onError; 5881 if (x==enc_FAILED) { /* unencodable character */ 5882 if (charmap_encoding_error(p, size, &inpos, mapping, 5883 &exc, 5884 &known_errorHandler, &errorHandler, errors, 5885 &res, &respos)) { 5886 goto onError; 5887 } 5888 } 5889 else 5890 /* done with this character => adjust input position */ 5891 ++inpos; 5892 } 5893 5894 /* Resize if we allocated to much */ 5895 if (respos<PyBytes_GET_SIZE(res)) 5896 if (_PyBytes_Resize(&res, respos) < 0) 5897 goto onError; 5898 5899 Py_XDECREF(exc); 5900 Py_XDECREF(errorHandler); 5901 return res; 5902 5903 onError: 5904 Py_XDECREF(res); 5905 Py_XDECREF(exc); 5906 Py_XDECREF(errorHandler); 5907 return NULL; 5908} 5909 5910PyObject * 5911PyUnicode_AsCharmapString(PyObject *unicode, 5912 PyObject *mapping) 5913{ 5914 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5915 PyErr_BadArgument(); 5916 return NULL; 5917 } 5918 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5919 PyUnicode_GET_SIZE(unicode), 5920 mapping, 5921 NULL); 5922} 5923 5924/* create or adjust a UnicodeTranslateError */ 5925static void 5926make_translate_exception(PyObject **exceptionObject, 5927 const Py_UNICODE *unicode, Py_ssize_t size, 5928 Py_ssize_t startpos, Py_ssize_t endpos, 5929 const char *reason) 5930{ 5931 if (*exceptionObject == NULL) { 5932 *exceptionObject = PyUnicodeTranslateError_Create( 5933 unicode, size, startpos, endpos, reason); 5934 } 5935 else { 5936 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5937 goto onError; 5938 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5939 goto onError; 5940 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5941 goto onError; 5942 return; 5943 onError: 5944 Py_DECREF(*exceptionObject); 5945 *exceptionObject = NULL; 5946 } 5947} 
5948 5949/* raises a UnicodeTranslateError */ 5950static void 5951raise_translate_exception(PyObject **exceptionObject, 5952 const Py_UNICODE *unicode, Py_ssize_t size, 5953 Py_ssize_t startpos, Py_ssize_t endpos, 5954 const char *reason) 5955{ 5956 make_translate_exception(exceptionObject, 5957 unicode, size, startpos, endpos, reason); 5958 if (*exceptionObject != NULL) 5959 PyCodec_StrictErrors(*exceptionObject); 5960} 5961 5962/* error handling callback helper: 5963 build arguments, call the callback and check the arguments, 5964 put the result into newpos and return the replacement string, which 5965 has to be freed by the caller */ 5966static PyObject * 5967unicode_translate_call_errorhandler(const char *errors, 5968 PyObject **errorHandler, 5969 const char *reason, 5970 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5971 Py_ssize_t startpos, Py_ssize_t endpos, 5972 Py_ssize_t *newpos) 5973{ 5974 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5975 5976 Py_ssize_t i_newpos; 5977 PyObject *restuple; 5978 PyObject *resunicode; 5979 5980 if (*errorHandler == NULL) { 5981 *errorHandler = PyCodec_LookupError(errors); 5982 if (*errorHandler == NULL) 5983 return NULL; 5984 } 5985 5986 make_translate_exception(exceptionObject, 5987 unicode, size, startpos, endpos, reason); 5988 if (*exceptionObject == NULL) 5989 return NULL; 5990 5991 restuple = PyObject_CallFunctionObjArgs( 5992 *errorHandler, *exceptionObject, NULL); 5993 if (restuple == NULL) 5994 return NULL; 5995 if (!PyTuple_Check(restuple)) { 5996 PyErr_SetString(PyExc_TypeError, &argparse[4]); 5997 Py_DECREF(restuple); 5998 return NULL; 5999 } 6000 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 6001 &resunicode, &i_newpos)) { 6002 Py_DECREF(restuple); 6003 return NULL; 6004 } 6005 if (i_newpos<0) 6006 *newpos = size+i_newpos; 6007 else 6008 *newpos = i_newpos; 6009 if (*newpos<0 || *newpos>size) { 6010 PyErr_Format(PyExc_IndexError, 
"position %zd from error handler out of bounds", *newpos); 6011 Py_DECREF(restuple); 6012 return NULL; 6013 } 6014 Py_INCREF(resunicode); 6015 Py_DECREF(restuple); 6016 return resunicode; 6017} 6018 6019/* Lookup the character ch in the mapping and put the result in result, 6020 which must be decrefed by the caller. 6021 Return 0 on success, -1 on error */ 6022static int 6023charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 6024{ 6025 PyObject *w = PyLong_FromLong((long)c); 6026 PyObject *x; 6027 6028 if (w == NULL) 6029 return -1; 6030 x = PyObject_GetItem(mapping, w); 6031 Py_DECREF(w); 6032 if (x == NULL) { 6033 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 6034 /* No mapping found means: use 1:1 mapping. */ 6035 PyErr_Clear(); 6036 *result = NULL; 6037 return 0; 6038 } else 6039 return -1; 6040 } 6041 else if (x == Py_None) { 6042 *result = x; 6043 return 0; 6044 } 6045 else if (PyLong_Check(x)) { 6046 long value = PyLong_AS_LONG(x); 6047 long max = PyUnicode_GetMax(); 6048 if (value < 0 || value > max) { 6049 PyErr_Format(PyExc_TypeError, 6050 "character mapping must be in range(0x%x)", max+1); 6051 Py_DECREF(x); 6052 return -1; 6053 } 6054 *result = x; 6055 return 0; 6056 } 6057 else if (PyUnicode_Check(x)) { 6058 *result = x; 6059 return 0; 6060 } 6061 else { 6062 /* wrong return value */ 6063 PyErr_SetString(PyExc_TypeError, 6064 "character mapping must return integer, None or str"); 6065 Py_DECREF(x); 6066 return -1; 6067 } 6068} 6069/* ensure that *outobj is at least requiredsize characters long, 6070 if not reallocate and adjust various state variables. 
6071 Return 0 on success, -1 on error */ 6072static int 6073charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 6074 Py_ssize_t requiredsize) 6075{ 6076 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 6077 if (requiredsize > oldsize) { 6078 /* remember old output position */ 6079 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 6080 /* exponentially overallocate to minimize reallocations */ 6081 if (requiredsize < 2 * oldsize) 6082 requiredsize = 2 * oldsize; 6083 if (PyUnicode_Resize(outobj, requiredsize) < 0) 6084 return -1; 6085 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 6086 } 6087 return 0; 6088} 6089/* lookup the character, put the result in the output string and adjust 6090 various state variables. Return a new reference to the object that 6091 was put in the output buffer in *result, or Py_None, if the mapping was 6092 undefined (in which case no character was written). 6093 The called must decref result. 6094 Return 0 on success, -1 on error. */ 6095static int 6096charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 6097 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 6098 PyObject **res) 6099{ 6100 if (charmaptranslate_lookup(*curinp, mapping, res)) 6101 return -1; 6102 if (*res==NULL) { 6103 /* not found => default to 1:1 mapping */ 6104 *(*outp)++ = *curinp; 6105 } 6106 else if (*res==Py_None) 6107 ; 6108 else if (PyLong_Check(*res)) { 6109 /* no overflow check, because we know that the space is enough */ 6110 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 6111 } 6112 else if (PyUnicode_Check(*res)) { 6113 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 6114 if (repsize==1) { 6115 /* no overflow check, because we know that the space is enough */ 6116 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 6117 } 6118 else if (repsize!=0) { 6119 /* more than one character */ 6120 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 6121 (insize - (curinp-startinp)) + 6122 
repsize - 1; 6123 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 6124 return -1; 6125 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 6126 *outp += repsize; 6127 } 6128 } 6129 else 6130 return -1; 6131 return 0; 6132} 6133 6134PyObject * 6135PyUnicode_TranslateCharmap(const Py_UNICODE *p, 6136 Py_ssize_t size, 6137 PyObject *mapping, 6138 const char *errors) 6139{ 6140 /* output object */ 6141 PyObject *res = NULL; 6142 /* pointers to the beginning and end+1 of input */ 6143 const Py_UNICODE *startp = p; 6144 const Py_UNICODE *endp = p + size; 6145 /* pointer into the output */ 6146 Py_UNICODE *str; 6147 /* current output position */ 6148 Py_ssize_t respos = 0; 6149 char *reason = "character maps to <undefined>"; 6150 PyObject *errorHandler = NULL; 6151 PyObject *exc = NULL; 6152 /* the following variable is used for caching string comparisons 6153 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 6154 * 3=ignore, 4=xmlcharrefreplace */ 6155 int known_errorHandler = -1; 6156 6157 if (mapping == NULL) { 6158 PyErr_BadArgument(); 6159 return NULL; 6160 } 6161 6162 /* allocate enough for a simple 1:1 translation without 6163 replacements, if we need more, we'll resize */ 6164 res = PyUnicode_FromUnicode(NULL, size); 6165 if (res == NULL) 6166 goto onError; 6167 if (size == 0) 6168 return res; 6169 str = PyUnicode_AS_UNICODE(res); 6170 6171 while (p<endp) { 6172 /* try to encode it */ 6173 PyObject *x = NULL; 6174 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 6175 Py_XDECREF(x); 6176 goto onError; 6177 } 6178 Py_XDECREF(x); 6179 if (x!=Py_None) /* it worked => adjust input pointer */ 6180 ++p; 6181 else { /* untranslatable character */ 6182 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 6183 Py_ssize_t repsize; 6184 Py_ssize_t newpos; 6185 Py_UNICODE *uni2; 6186 /* startpos for collecting untranslatable chars */ 6187 const Py_UNICODE *collstart = p; 6188 const Py_UNICODE *collend = 
p+1; 6189 const Py_UNICODE *coll; 6190 6191 /* find all untranslatable characters */ 6192 while (collend < endp) { 6193 if (charmaptranslate_lookup(*collend, mapping, &x)) 6194 goto onError; 6195 Py_XDECREF(x); 6196 if (x!=Py_None) 6197 break; 6198 ++collend; 6199 } 6200 /* cache callback name lookup 6201 * (if not done yet, i.e. it's the first error) */ 6202 if (known_errorHandler==-1) { 6203 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6204 known_errorHandler = 1; 6205 else if (!strcmp(errors, "replace")) 6206 known_errorHandler = 2; 6207 else if (!strcmp(errors, "ignore")) 6208 known_errorHandler = 3; 6209 else if (!strcmp(errors, "xmlcharrefreplace")) 6210 known_errorHandler = 4; 6211 else 6212 known_errorHandler = 0; 6213 } 6214 switch (known_errorHandler) { 6215 case 1: /* strict */ 6216 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 6217 goto onError; 6218 case 2: /* replace */ 6219 /* No need to check for space, this is a 1:1 replacement */ 6220 for (coll = collstart; coll<collend; ++coll) 6221 *str++ = '?'; 6222 /* fall through */ 6223 case 3: /* ignore */ 6224 p = collend; 6225 break; 6226 case 4: /* xmlcharrefreplace */ 6227 /* generate replacement (temporarily (mis)uses p) */ 6228 for (p = collstart; p < collend; ++p) { 6229 char buffer[2+29+1+1]; 6230 char *cp; 6231 sprintf(buffer, "&#%d;", (int)*p); 6232 if (charmaptranslate_makespace(&res, &str, 6233 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 6234 goto onError; 6235 for (cp = buffer; *cp; ++cp) 6236 *str++ = *cp; 6237 } 6238 p = collend; 6239 break; 6240 default: 6241 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 6242 reason, startp, size, &exc, 6243 collstart-startp, collend-startp, &newpos); 6244 if (repunicode == NULL) 6245 goto onError; 6246 /* generate replacement */ 6247 repsize = PyUnicode_GET_SIZE(repunicode); 6248 if (charmaptranslate_makespace(&res, &str, 6249 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 6250 Py_DECREF(repunicode); 6251 goto onError; 6252 } 6253 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 6254 *str++ = *uni2; 6255 p = startp + newpos; 6256 Py_DECREF(repunicode); 6257 } 6258 } 6259 } 6260 /* Resize if we allocated to much */ 6261 respos = str-PyUnicode_AS_UNICODE(res); 6262 if (respos<PyUnicode_GET_SIZE(res)) { 6263 if (PyUnicode_Resize(&res, respos) < 0) 6264 goto onError; 6265 } 6266 Py_XDECREF(exc); 6267 Py_XDECREF(errorHandler); 6268 return res; 6269 6270 onError: 6271 Py_XDECREF(res); 6272 Py_XDECREF(exc); 6273 Py_XDECREF(errorHandler); 6274 return NULL; 6275} 6276 6277PyObject * 6278PyUnicode_Translate(PyObject *str, 6279 PyObject *mapping, 6280 const char *errors) 6281{ 6282 PyObject *result; 6283 6284 str = PyUnicode_FromObject(str); 6285 if (str == NULL) 6286 goto onError; 6287 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 6288 PyUnicode_GET_SIZE(str), 6289 mapping, 6290 errors); 6291 Py_DECREF(str); 6292 return result; 6293 6294 onError: 6295 Py_XDECREF(str); 6296 return NULL; 6297} 6298 6299PyObject * 6300PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 6301 Py_ssize_t length) 6302{ 6303 PyObject *result; 6304 Py_UNICODE *p; /* write pointer into result */ 6305 Py_ssize_t i; 6306 /* Copy to a new string */ 6307 result = (PyObject *)_PyUnicode_New(length); 6308 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 6309 if (result == NULL) 6310 return result; 6311 p = PyUnicode_AS_UNICODE(result); 6312 /* Iterate over code points */ 6313 for (i = 0; i < length; i++) { 6314 Py_UNICODE ch =s[i]; 6315 if (ch > 127) { 6316 int decimal = Py_UNICODE_TODECIMAL(ch); 6317 if (decimal >= 0) 6318 p[i] = '0' + decimal; 6319 } 6320 } 6321 return result; 6322} 6323/* --- Decimal Encoder ---------------------------------------------------- */ 6324 6325int 6326PyUnicode_EncodeDecimal(Py_UNICODE *s, 6327 Py_ssize_t length, 6328 char *output, 6329 const 
char *errors) 6330{ 6331 Py_UNICODE *p, *end; 6332 PyObject *errorHandler = NULL; 6333 PyObject *exc = NULL; 6334 const char *encoding = "decimal"; 6335 const char *reason = "invalid decimal Unicode string"; 6336 /* the following variable is used for caching string comparisons 6337 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6338 int known_errorHandler = -1; 6339 6340 if (output == NULL) { 6341 PyErr_BadArgument(); 6342 return -1; 6343 } 6344 6345 p = s; 6346 end = s + length; 6347 while (p < end) { 6348 register Py_UNICODE ch = *p; 6349 int decimal; 6350 PyObject *repunicode; 6351 Py_ssize_t repsize; 6352 Py_ssize_t newpos; 6353 Py_UNICODE *uni2; 6354 Py_UNICODE *collstart; 6355 Py_UNICODE *collend; 6356 6357 if (Py_UNICODE_ISSPACE(ch)) { 6358 *output++ = ' '; 6359 ++p; 6360 continue; 6361 } 6362 decimal = Py_UNICODE_TODECIMAL(ch); 6363 if (decimal >= 0) { 6364 *output++ = '0' + decimal; 6365 ++p; 6366 continue; 6367 } 6368 if (0 < ch && ch < 256) { 6369 *output++ = (char)ch; 6370 ++p; 6371 continue; 6372 } 6373 /* All other characters are considered unencodable */ 6374 collstart = p; 6375 collend = p+1; 6376 while (collend < end) { 6377 if ((0 < *collend && *collend < 256) || 6378 !Py_UNICODE_ISSPACE(*collend) || 6379 Py_UNICODE_TODECIMAL(*collend)) 6380 break; 6381 } 6382 /* cache callback name lookup 6383 * (if not done yet, i.e. 
it's the first error) */ 6384 if (known_errorHandler==-1) { 6385 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6386 known_errorHandler = 1; 6387 else if (!strcmp(errors, "replace")) 6388 known_errorHandler = 2; 6389 else if (!strcmp(errors, "ignore")) 6390 known_errorHandler = 3; 6391 else if (!strcmp(errors, "xmlcharrefreplace")) 6392 known_errorHandler = 4; 6393 else 6394 known_errorHandler = 0; 6395 } 6396 switch (known_errorHandler) { 6397 case 1: /* strict */ 6398 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 6399 goto onError; 6400 case 2: /* replace */ 6401 for (p = collstart; p < collend; ++p) 6402 *output++ = '?'; 6403 /* fall through */ 6404 case 3: /* ignore */ 6405 p = collend; 6406 break; 6407 case 4: /* xmlcharrefreplace */ 6408 /* generate replacement (temporarily (mis)uses p) */ 6409 for (p = collstart; p < collend; ++p) 6410 output += sprintf(output, "&#%d;", (int)*p); 6411 p = collend; 6412 break; 6413 default: 6414 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6415 encoding, reason, s, length, &exc, 6416 collstart-s, collend-s, &newpos); 6417 if (repunicode == NULL) 6418 goto onError; 6419 if (!PyUnicode_Check(repunicode)) { 6420 /* Byte results not supported, since they have no decimal property. 
*/ 6421 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 6422 Py_DECREF(repunicode); 6423 goto onError; 6424 } 6425 /* generate replacement */ 6426 repsize = PyUnicode_GET_SIZE(repunicode); 6427 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 6428 Py_UNICODE ch = *uni2; 6429 if (Py_UNICODE_ISSPACE(ch)) 6430 *output++ = ' '; 6431 else { 6432 decimal = Py_UNICODE_TODECIMAL(ch); 6433 if (decimal >= 0) 6434 *output++ = '0' + decimal; 6435 else if (0 < ch && ch < 256) 6436 *output++ = (char)ch; 6437 else { 6438 Py_DECREF(repunicode); 6439 raise_encode_exception(&exc, encoding, 6440 s, length, collstart-s, collend-s, reason); 6441 goto onError; 6442 } 6443 } 6444 } 6445 p = s + newpos; 6446 Py_DECREF(repunicode); 6447 } 6448 } 6449 /* 0-terminate the output string */ 6450 *output++ = '\0'; 6451 Py_XDECREF(exc); 6452 Py_XDECREF(errorHandler); 6453 return 0; 6454 6455 onError: 6456 Py_XDECREF(exc); 6457 Py_XDECREF(errorHandler); 6458 return -1; 6459} 6460 6461/* --- Helpers ------------------------------------------------------------ */ 6462 6463#include "stringlib/unicodedefs.h" 6464#include "stringlib/fastsearch.h" 6465 6466#include "stringlib/count.h" 6467#include "stringlib/find.h" 6468#include "stringlib/partition.h" 6469#include "stringlib/split.h" 6470 6471#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 6472#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale 6473#include "stringlib/localeutil.h" 6474 6475/* helper macro to fixup start/end slice values */ 6476#define ADJUST_INDICES(start, end, len) \ 6477 if (end > len) \ 6478 end = len; \ 6479 else if (end < 0) { \ 6480 end += len; \ 6481 if (end < 0) \ 6482 end = 0; \ 6483 } \ 6484 if (start < 0) { \ 6485 start += len; \ 6486 if (start < 0) \ 6487 start = 0; \ 6488 } 6489 6490Py_ssize_t 6491PyUnicode_Count(PyObject *str, 6492 PyObject *substr, 6493 Py_ssize_t start, 6494 Py_ssize_t end) 6495{ 6496 Py_ssize_t result; 
6497 PyUnicodeObject* str_obj; 6498 PyUnicodeObject* sub_obj; 6499 6500 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6501 if (!str_obj) 6502 return -1; 6503 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6504 if (!sub_obj) { 6505 Py_DECREF(str_obj); 6506 return -1; 6507 } 6508 6509 ADJUST_INDICES(start, end, str_obj->length); 6510 result = stringlib_count( 6511 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6512 PY_SSIZE_T_MAX 6513 ); 6514 6515 Py_DECREF(sub_obj); 6516 Py_DECREF(str_obj); 6517 6518 return result; 6519} 6520 6521Py_ssize_t 6522PyUnicode_Find(PyObject *str, 6523 PyObject *sub, 6524 Py_ssize_t start, 6525 Py_ssize_t end, 6526 int direction) 6527{ 6528 Py_ssize_t result; 6529 6530 str = PyUnicode_FromObject(str); 6531 if (!str) 6532 return -2; 6533 sub = PyUnicode_FromObject(sub); 6534 if (!sub) { 6535 Py_DECREF(str); 6536 return -2; 6537 } 6538 6539 if (direction > 0) 6540 result = stringlib_find_slice( 6541 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6542 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6543 start, end 6544 ); 6545 else 6546 result = stringlib_rfind_slice( 6547 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6548 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6549 start, end 6550 ); 6551 6552 Py_DECREF(str); 6553 Py_DECREF(sub); 6554 6555 return result; 6556} 6557 6558static int 6559tailmatch(PyUnicodeObject *self, 6560 PyUnicodeObject *substring, 6561 Py_ssize_t start, 6562 Py_ssize_t end, 6563 int direction) 6564{ 6565 if (substring->length == 0) 6566 return 1; 6567 6568 ADJUST_INDICES(start, end, self->length); 6569 end -= substring->length; 6570 if (end < start) 6571 return 0; 6572 6573 if (direction > 0) { 6574 if (Py_UNICODE_MATCH(self, end, substring)) 6575 return 1; 6576 } else { 6577 if (Py_UNICODE_MATCH(self, start, substring)) 6578 return 1; 6579 } 6580 6581 return 0; 6582} 6583 6584Py_ssize_t 6585PyUnicode_Tailmatch(PyObject *str, 6586 PyObject *substr, 6587 
Py_ssize_t start, 6588 Py_ssize_t end, 6589 int direction) 6590{ 6591 Py_ssize_t result; 6592 6593 str = PyUnicode_FromObject(str); 6594 if (str == NULL) 6595 return -1; 6596 substr = PyUnicode_FromObject(substr); 6597 if (substr == NULL) { 6598 Py_DECREF(str); 6599 return -1; 6600 } 6601 6602 result = tailmatch((PyUnicodeObject *)str, 6603 (PyUnicodeObject *)substr, 6604 start, end, direction); 6605 Py_DECREF(str); 6606 Py_DECREF(substr); 6607 return result; 6608} 6609 6610/* Apply fixfct filter to the Unicode object self and return a 6611 reference to the modified object */ 6612 6613static PyObject * 6614fixup(PyUnicodeObject *self, 6615 int (*fixfct)(PyUnicodeObject *s)) 6616{ 6617 6618 PyUnicodeObject *u; 6619 6620 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6621 if (u == NULL) 6622 return NULL; 6623 6624 Py_UNICODE_COPY(u->str, self->str, self->length); 6625 6626 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6627 /* fixfct should return TRUE if it modified the buffer. 
If 6628 FALSE, return a reference to the original buffer instead 6629 (to save space, not time) */ 6630 Py_INCREF(self); 6631 Py_DECREF(u); 6632 return (PyObject*) self; 6633 } 6634 return (PyObject*) u; 6635} 6636 6637static int 6638fixupper(PyUnicodeObject *self) 6639{ 6640 Py_ssize_t len = self->length; 6641 Py_UNICODE *s = self->str; 6642 int status = 0; 6643 6644 while (len-- > 0) { 6645 register Py_UNICODE ch; 6646 6647 ch = Py_UNICODE_TOUPPER(*s); 6648 if (ch != *s) { 6649 status = 1; 6650 *s = ch; 6651 } 6652 s++; 6653 } 6654 6655 return status; 6656} 6657 6658static int 6659fixlower(PyUnicodeObject *self) 6660{ 6661 Py_ssize_t len = self->length; 6662 Py_UNICODE *s = self->str; 6663 int status = 0; 6664 6665 while (len-- > 0) { 6666 register Py_UNICODE ch; 6667 6668 ch = Py_UNICODE_TOLOWER(*s); 6669 if (ch != *s) { 6670 status = 1; 6671 *s = ch; 6672 } 6673 s++; 6674 } 6675 6676 return status; 6677} 6678 6679static int 6680fixswapcase(PyUnicodeObject *self) 6681{ 6682 Py_ssize_t len = self->length; 6683 Py_UNICODE *s = self->str; 6684 int status = 0; 6685 6686 while (len-- > 0) { 6687 if (Py_UNICODE_ISUPPER(*s)) { 6688 *s = Py_UNICODE_TOLOWER(*s); 6689 status = 1; 6690 } else if (Py_UNICODE_ISLOWER(*s)) { 6691 *s = Py_UNICODE_TOUPPER(*s); 6692 status = 1; 6693 } 6694 s++; 6695 } 6696 6697 return status; 6698} 6699 6700static int 6701fixcapitalize(PyUnicodeObject *self) 6702{ 6703 Py_ssize_t len = self->length; 6704 Py_UNICODE *s = self->str; 6705 int status = 0; 6706 6707 if (len == 0) 6708 return 0; 6709 if (Py_UNICODE_ISLOWER(*s)) { 6710 *s = Py_UNICODE_TOUPPER(*s); 6711 status = 1; 6712 } 6713 s++; 6714 while (--len > 0) { 6715 if (Py_UNICODE_ISUPPER(*s)) { 6716 *s = Py_UNICODE_TOLOWER(*s); 6717 status = 1; 6718 } 6719 s++; 6720 } 6721 return status; 6722} 6723 6724static int 6725fixtitle(PyUnicodeObject *self) 6726{ 6727 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6728 register Py_UNICODE *e; 6729 int previous_is_cased; 6730 6731 /* Shortcut 
for single character strings */ 6732 if (PyUnicode_GET_SIZE(self) == 1) { 6733 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6734 if (*p != ch) { 6735 *p = ch; 6736 return 1; 6737 } 6738 else 6739 return 0; 6740 } 6741 6742 e = p + PyUnicode_GET_SIZE(self); 6743 previous_is_cased = 0; 6744 for (; p < e; p++) { 6745 register const Py_UNICODE ch = *p; 6746 6747 if (previous_is_cased) 6748 *p = Py_UNICODE_TOLOWER(ch); 6749 else 6750 *p = Py_UNICODE_TOTITLE(ch); 6751 6752 if (Py_UNICODE_ISLOWER(ch) || 6753 Py_UNICODE_ISUPPER(ch) || 6754 Py_UNICODE_ISTITLE(ch)) 6755 previous_is_cased = 1; 6756 else 6757 previous_is_cased = 0; 6758 } 6759 return 1; 6760} 6761 6762PyObject * 6763PyUnicode_Join(PyObject *separator, PyObject *seq) 6764{ 6765 const Py_UNICODE blank = ' '; 6766 const Py_UNICODE *sep = ␣ 6767 Py_ssize_t seplen = 1; 6768 PyUnicodeObject *res = NULL; /* the result */ 6769 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6770 PyObject *fseq; /* PySequence_Fast(seq) */ 6771 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6772 PyObject **items; 6773 PyObject *item; 6774 Py_ssize_t sz, i; 6775 6776 fseq = PySequence_Fast(seq, ""); 6777 if (fseq == NULL) { 6778 return NULL; 6779 } 6780 6781 /* NOTE: the following code can't call back into Python code, 6782 * so we are sure that fseq won't be mutated. 6783 */ 6784 6785 seqlen = PySequence_Fast_GET_SIZE(fseq); 6786 /* If empty sequence, return u"". */ 6787 if (seqlen == 0) { 6788 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6789 goto Done; 6790 } 6791 items = PySequence_Fast_ITEMS(fseq); 6792 /* If singleton sequence with an exact Unicode, return that. 
*/ 6793 if (seqlen == 1) { 6794 item = items[0]; 6795 if (PyUnicode_CheckExact(item)) { 6796 Py_INCREF(item); 6797 res = (PyUnicodeObject *)item; 6798 goto Done; 6799 } 6800 } 6801 else { 6802 /* Set up sep and seplen */ 6803 if (separator == NULL) { 6804 sep = ␣ 6805 seplen = 1; 6806 } 6807 else { 6808 if (!PyUnicode_Check(separator)) { 6809 PyErr_Format(PyExc_TypeError, 6810 "separator: expected str instance," 6811 " %.80s found", 6812 Py_TYPE(separator)->tp_name); 6813 goto onError; 6814 } 6815 sep = PyUnicode_AS_UNICODE(separator); 6816 seplen = PyUnicode_GET_SIZE(separator); 6817 } 6818 } 6819 6820 /* There are at least two things to join, or else we have a subclass 6821 * of str in the sequence. 6822 * Do a pre-pass to figure out the total amount of space we'll 6823 * need (sz), and see whether all argument are strings. 6824 */ 6825 sz = 0; 6826 for (i = 0; i < seqlen; i++) { 6827 const Py_ssize_t old_sz = sz; 6828 item = items[i]; 6829 if (!PyUnicode_Check(item)) { 6830 PyErr_Format(PyExc_TypeError, 6831 "sequence item %zd: expected str instance," 6832 " %.80s found", 6833 i, Py_TYPE(item)->tp_name); 6834 goto onError; 6835 } 6836 sz += PyUnicode_GET_SIZE(item); 6837 if (i != 0) 6838 sz += seplen; 6839 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6840 PyErr_SetString(PyExc_OverflowError, 6841 "join() result is too long for a Python string"); 6842 goto onError; 6843 } 6844 } 6845 6846 res = _PyUnicode_New(sz); 6847 if (res == NULL) 6848 goto onError; 6849 6850 /* Catenate everything. */ 6851 res_p = PyUnicode_AS_UNICODE(res); 6852 for (i = 0; i < seqlen; ++i) { 6853 Py_ssize_t itemlen; 6854 item = items[i]; 6855 itemlen = PyUnicode_GET_SIZE(item); 6856 /* Copy item, and maybe the separator. 
*/ 6857 if (i) { 6858 Py_UNICODE_COPY(res_p, sep, seplen); 6859 res_p += seplen; 6860 } 6861 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6862 res_p += itemlen; 6863 } 6864 6865 Done: 6866 Py_DECREF(fseq); 6867 return (PyObject *)res; 6868 6869 onError: 6870 Py_DECREF(fseq); 6871 Py_XDECREF(res); 6872 return NULL; 6873} 6874 6875static PyUnicodeObject * 6876pad(PyUnicodeObject *self, 6877 Py_ssize_t left, 6878 Py_ssize_t right, 6879 Py_UNICODE fill) 6880{ 6881 PyUnicodeObject *u; 6882 6883 if (left < 0) 6884 left = 0; 6885 if (right < 0) 6886 right = 0; 6887 6888 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6889 Py_INCREF(self); 6890 return self; 6891 } 6892 6893 if (left > PY_SSIZE_T_MAX - self->length || 6894 right > PY_SSIZE_T_MAX - (left + self->length)) { 6895 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6896 return NULL; 6897 } 6898 u = _PyUnicode_New(left + self->length + right); 6899 if (u) { 6900 if (left) 6901 Py_UNICODE_FILL(u->str, fill, left); 6902 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6903 if (right) 6904 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6905 } 6906 6907 return u; 6908} 6909 6910PyObject * 6911PyUnicode_Splitlines(PyObject *string, int keepends) 6912{ 6913 PyObject *list; 6914 6915 string = PyUnicode_FromObject(string); 6916 if (string == NULL) 6917 return NULL; 6918 6919 list = stringlib_splitlines( 6920 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6921 PyUnicode_GET_SIZE(string), keepends); 6922 6923 Py_DECREF(string); 6924 return list; 6925} 6926 6927static PyObject * 6928split(PyUnicodeObject *self, 6929 PyUnicodeObject *substring, 6930 Py_ssize_t maxcount) 6931{ 6932 if (maxcount < 0) 6933 maxcount = PY_SSIZE_T_MAX; 6934 6935 if (substring == NULL) 6936 return stringlib_split_whitespace( 6937 (PyObject*) self, self->str, self->length, maxcount 6938 ); 6939 6940 return stringlib_split( 6941 (PyObject*) self, self->str, self->length, 6942 
substring->str, substring->length, 6943 maxcount 6944 ); 6945} 6946 6947static PyObject * 6948rsplit(PyUnicodeObject *self, 6949 PyUnicodeObject *substring, 6950 Py_ssize_t maxcount) 6951{ 6952 if (maxcount < 0) 6953 maxcount = PY_SSIZE_T_MAX; 6954 6955 if (substring == NULL) 6956 return stringlib_rsplit_whitespace( 6957 (PyObject*) self, self->str, self->length, maxcount 6958 ); 6959 6960 return stringlib_rsplit( 6961 (PyObject*) self, self->str, self->length, 6962 substring->str, substring->length, 6963 maxcount 6964 ); 6965} 6966 6967static PyObject * 6968replace(PyUnicodeObject *self, 6969 PyUnicodeObject *str1, 6970 PyUnicodeObject *str2, 6971 Py_ssize_t maxcount) 6972{ 6973 PyUnicodeObject *u; 6974 6975 if (maxcount < 0) 6976 maxcount = PY_SSIZE_T_MAX; 6977 else if (maxcount == 0 || self->length == 0) 6978 goto nothing; 6979 6980 if (str1->length == str2->length) { 6981 Py_ssize_t i; 6982 /* same length */ 6983 if (str1->length == 0) 6984 goto nothing; 6985 if (str1->length == 1) { 6986 /* replace characters */ 6987 Py_UNICODE u1, u2; 6988 if (!findchar(self->str, self->length, str1->str[0])) 6989 goto nothing; 6990 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6991 if (!u) 6992 return NULL; 6993 Py_UNICODE_COPY(u->str, self->str, self->length); 6994 u1 = str1->str[0]; 6995 u2 = str2->str[0]; 6996 for (i = 0; i < u->length; i++) 6997 if (u->str[i] == u1) { 6998 if (--maxcount < 0) 6999 break; 7000 u->str[i] = u2; 7001 } 7002 } else { 7003 i = stringlib_find( 7004 self->str, self->length, str1->str, str1->length, 0 7005 ); 7006 if (i < 0) 7007 goto nothing; 7008 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 7009 if (!u) 7010 return NULL; 7011 Py_UNICODE_COPY(u->str, self->str, self->length); 7012 7013 /* change everything in-place, starting with this one */ 7014 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 7015 i += str1->length; 7016 7017 while ( --maxcount > 0) { 7018 i = stringlib_find(self->str+i, 
self->length-i, 7019 str1->str, str1->length, 7020 i); 7021 if (i == -1) 7022 break; 7023 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 7024 i += str1->length; 7025 } 7026 } 7027 } else { 7028 7029 Py_ssize_t n, i, j; 7030 Py_ssize_t product, new_size, delta; 7031 Py_UNICODE *p; 7032 7033 /* replace strings */ 7034 n = stringlib_count(self->str, self->length, str1->str, str1->length, 7035 maxcount); 7036 if (n == 0) 7037 goto nothing; 7038 /* new_size = self->length + n * (str2->length - str1->length)); */ 7039 delta = (str2->length - str1->length); 7040 if (delta == 0) { 7041 new_size = self->length; 7042 } else { 7043 product = n * (str2->length - str1->length); 7044 if ((product / (str2->length - str1->length)) != n) { 7045 PyErr_SetString(PyExc_OverflowError, 7046 "replace string is too long"); 7047 return NULL; 7048 } 7049 new_size = self->length + product; 7050 if (new_size < 0) { 7051 PyErr_SetString(PyExc_OverflowError, 7052 "replace string is too long"); 7053 return NULL; 7054 } 7055 } 7056 u = _PyUnicode_New(new_size); 7057 if (!u) 7058 return NULL; 7059 i = 0; 7060 p = u->str; 7061 if (str1->length > 0) { 7062 while (n-- > 0) { 7063 /* look for next match */ 7064 j = stringlib_find(self->str+i, self->length-i, 7065 str1->str, str1->length, 7066 i); 7067 if (j == -1) 7068 break; 7069 else if (j > i) { 7070 /* copy unchanged part [i:j] */ 7071 Py_UNICODE_COPY(p, self->str+i, j-i); 7072 p += j - i; 7073 } 7074 /* copy substitution string */ 7075 if (str2->length > 0) { 7076 Py_UNICODE_COPY(p, str2->str, str2->length); 7077 p += str2->length; 7078 } 7079 i = j + str1->length; 7080 } 7081 if (i < self->length) 7082 /* copy tail [i:] */ 7083 Py_UNICODE_COPY(p, self->str+i, self->length-i); 7084 } else { 7085 /* interleave */ 7086 while (n > 0) { 7087 Py_UNICODE_COPY(p, str2->str, str2->length); 7088 p += str2->length; 7089 if (--n <= 0) 7090 break; 7091 *p++ = self->str[i++]; 7092 } 7093 Py_UNICODE_COPY(p, self->str+i, self->length-i); 7094 } 7095 } 7096 
return (PyObject *) u; 7097 7098 nothing: 7099 /* nothing to replace; return original string (when possible) */ 7100 if (PyUnicode_CheckExact(self)) { 7101 Py_INCREF(self); 7102 return (PyObject *) self; 7103 } 7104 return PyUnicode_FromUnicode(self->str, self->length); 7105} 7106 7107/* --- Unicode Object Methods --------------------------------------------- */ 7108 7109PyDoc_STRVAR(title__doc__, 7110 "S.title() -> str\n\ 7111\n\ 7112Return a titlecased version of S, i.e. words start with title case\n\ 7113characters, all remaining cased characters have lower case."); 7114 7115static PyObject* 7116unicode_title(PyUnicodeObject *self) 7117{ 7118 return fixup(self, fixtitle); 7119} 7120 7121PyDoc_STRVAR(capitalize__doc__, 7122 "S.capitalize() -> str\n\ 7123\n\ 7124Return a capitalized version of S, i.e. make the first character\n\ 7125have upper case and the rest lower case."); 7126 7127static PyObject* 7128unicode_capitalize(PyUnicodeObject *self) 7129{ 7130 return fixup(self, fixcapitalize); 7131} 7132 7133#if 0 7134PyDoc_STRVAR(capwords__doc__, 7135 "S.capwords() -> str\n\ 7136\n\ 7137Apply .capitalize() to all words in S and return the result with\n\ 7138normalized whitespace (all whitespace strings are replaced by ' ')."); 7139 7140static PyObject* 7141unicode_capwords(PyUnicodeObject *self) 7142{ 7143 PyObject *list; 7144 PyObject *item; 7145 Py_ssize_t i; 7146 7147 /* Split into words */ 7148 list = split(self, NULL, -1); 7149 if (!list) 7150 return NULL; 7151 7152 /* Capitalize each word */ 7153 for (i = 0; i < PyList_GET_SIZE(list); i++) { 7154 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 7155 fixcapitalize); 7156 if (item == NULL) 7157 goto onError; 7158 Py_DECREF(PyList_GET_ITEM(list, i)); 7159 PyList_SET_ITEM(list, i, item); 7160 } 7161 7162 /* Join the words to form a new string */ 7163 item = PyUnicode_Join(NULL, list); 7164 7165 onError: 7166 Py_DECREF(list); 7167 return (PyObject *)item; 7168} 7169#endif 7170 7171/* Argument converter. 
Coerces to a single unicode character */ 7172 7173static int 7174convert_uc(PyObject *obj, void *addr) 7175{ 7176 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 7177 PyObject *uniobj; 7178 Py_UNICODE *unistr; 7179 7180 uniobj = PyUnicode_FromObject(obj); 7181 if (uniobj == NULL) { 7182 PyErr_SetString(PyExc_TypeError, 7183 "The fill character cannot be converted to Unicode"); 7184 return 0; 7185 } 7186 if (PyUnicode_GET_SIZE(uniobj) != 1) { 7187 PyErr_SetString(PyExc_TypeError, 7188 "The fill character must be exactly one character long"); 7189 Py_DECREF(uniobj); 7190 return 0; 7191 } 7192 unistr = PyUnicode_AS_UNICODE(uniobj); 7193 *fillcharloc = unistr[0]; 7194 Py_DECREF(uniobj); 7195 return 1; 7196} 7197 7198PyDoc_STRVAR(center__doc__, 7199 "S.center(width[, fillchar]) -> str\n\ 7200\n\ 7201Return S centered in a string of length width. Padding is\n\ 7202done using the specified fill character (default is a space)"); 7203 7204static PyObject * 7205unicode_center(PyUnicodeObject *self, PyObject *args) 7206{ 7207 Py_ssize_t marg, left; 7208 Py_ssize_t width; 7209 Py_UNICODE fillchar = ' '; 7210 7211 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 7212 return NULL; 7213 7214 if (self->length >= width && PyUnicode_CheckExact(self)) { 7215 Py_INCREF(self); 7216 return (PyObject*) self; 7217 } 7218 7219 marg = width - self->length; 7220 left = marg / 2 + (marg & width & 1); 7221 7222 return (PyObject*) pad(self, left, marg - left, fillchar); 7223} 7224 7225#if 0 7226 7227/* This code should go into some future Unicode collation support 7228 module. The basic comparison should compare ordinals on a naive 7229 basis (this is what Java does and thus Jython too). 
*/ 7230 7231/* speedy UTF-16 code point order comparison */ 7232/* gleaned from: */ 7233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 7234 7235static short utf16Fixup[32] = 7236{ 7237 0, 0, 0, 0, 0, 0, 0, 0, 7238 0, 0, 0, 0, 0, 0, 0, 0, 7239 0, 0, 0, 0, 0, 0, 0, 0, 7240 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 7241}; 7242 7243static int 7244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7245{ 7246 Py_ssize_t len1, len2; 7247 7248 Py_UNICODE *s1 = str1->str; 7249 Py_UNICODE *s2 = str2->str; 7250 7251 len1 = str1->length; 7252 len2 = str2->length; 7253 7254 while (len1 > 0 && len2 > 0) { 7255 Py_UNICODE c1, c2; 7256 7257 c1 = *s1++; 7258 c2 = *s2++; 7259 7260 if (c1 > (1<<11) * 26) 7261 c1 += utf16Fixup[c1>>11]; 7262 if (c2 > (1<<11) * 26) 7263 c2 += utf16Fixup[c2>>11]; 7264 /* now c1 and c2 are in UTF-32-compatible order */ 7265 7266 if (c1 != c2) 7267 return (c1 < c2) ? -1 : 1; 7268 7269 len1--; len2--; 7270 } 7271 7272 return (len1 < len2) ? -1 : (len1 != len2); 7273} 7274 7275#else 7276 7277static int 7278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7279{ 7280 register Py_ssize_t len1, len2; 7281 7282 Py_UNICODE *s1 = str1->str; 7283 Py_UNICODE *s2 = str2->str; 7284 7285 len1 = str1->length; 7286 len2 = str2->length; 7287 7288 while (len1 > 0 && len2 > 0) { 7289 Py_UNICODE c1, c2; 7290 7291 c1 = *s1++; 7292 c2 = *s2++; 7293 7294 if (c1 != c2) 7295 return (c1 < c2) ? -1 : 1; 7296 7297 len1--; len2--; 7298 } 7299 7300 return (len1 < len2) ? 
-1 : (len1 != len2); 7301} 7302 7303#endif 7304 7305int 7306PyUnicode_Compare(PyObject *left, PyObject *right) 7307{ 7308 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 7309 return unicode_compare((PyUnicodeObject *)left, 7310 (PyUnicodeObject *)right); 7311 PyErr_Format(PyExc_TypeError, 7312 "Can't compare %.100s and %.100s", 7313 left->ob_type->tp_name, 7314 right->ob_type->tp_name); 7315 return -1; 7316} 7317 7318int 7319PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 7320{ 7321 int i; 7322 Py_UNICODE *id; 7323 assert(PyUnicode_Check(uni)); 7324 id = PyUnicode_AS_UNICODE(uni); 7325 /* Compare Unicode string and source character set string */ 7326 for (i = 0; id[i] && str[i]; i++) 7327 if (id[i] != str[i]) 7328 return ((int)id[i] < (int)str[i]) ? -1 : 1; 7329 /* This check keeps Python strings that end in '\0' from comparing equal 7330 to C strings identical up to that point. */ 7331 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 7332 return 1; /* uni is longer */ 7333 if (str[i]) 7334 return -1; /* str is longer */ 7335 return 0; 7336} 7337 7338 7339#define TEST_COND(cond) \ 7340 ((cond) ? 
Py_True : Py_False) 7341 7342PyObject * 7343PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 7344{ 7345 int result; 7346 7347 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 7348 PyObject *v; 7349 if (((PyUnicodeObject *) left)->length != 7350 ((PyUnicodeObject *) right)->length) { 7351 if (op == Py_EQ) { 7352 Py_INCREF(Py_False); 7353 return Py_False; 7354 } 7355 if (op == Py_NE) { 7356 Py_INCREF(Py_True); 7357 return Py_True; 7358 } 7359 } 7360 if (left == right) 7361 result = 0; 7362 else 7363 result = unicode_compare((PyUnicodeObject *)left, 7364 (PyUnicodeObject *)right); 7365 7366 /* Convert the return value to a Boolean */ 7367 switch (op) { 7368 case Py_EQ: 7369 v = TEST_COND(result == 0); 7370 break; 7371 case Py_NE: 7372 v = TEST_COND(result != 0); 7373 break; 7374 case Py_LE: 7375 v = TEST_COND(result <= 0); 7376 break; 7377 case Py_GE: 7378 v = TEST_COND(result >= 0); 7379 break; 7380 case Py_LT: 7381 v = TEST_COND(result == -1); 7382 break; 7383 case Py_GT: 7384 v = TEST_COND(result == 1); 7385 break; 7386 default: 7387 PyErr_BadArgument(); 7388 return NULL; 7389 } 7390 Py_INCREF(v); 7391 return v; 7392 } 7393 7394 Py_INCREF(Py_NotImplemented); 7395 return Py_NotImplemented; 7396} 7397 7398int 7399PyUnicode_Contains(PyObject *container, PyObject *element) 7400{ 7401 PyObject *str, *sub; 7402 int result; 7403 7404 /* Coerce the two arguments */ 7405 sub = PyUnicode_FromObject(element); 7406 if (!sub) { 7407 PyErr_Format(PyExc_TypeError, 7408 "'in <string>' requires string as left operand, not %s", 7409 element->ob_type->tp_name); 7410 return -1; 7411 } 7412 7413 str = PyUnicode_FromObject(container); 7414 if (!str) { 7415 Py_DECREF(sub); 7416 return -1; 7417 } 7418 7419 result = stringlib_contains_obj(str, sub); 7420 7421 Py_DECREF(str); 7422 Py_DECREF(sub); 7423 7424 return result; 7425} 7426 7427/* Concat to string or Unicode object giving a new Unicode object. 
*/ 7428 7429PyObject * 7430PyUnicode_Concat(PyObject *left, PyObject *right) 7431{ 7432 PyUnicodeObject *u = NULL, *v = NULL, *w; 7433 7434 /* Coerce the two arguments */ 7435 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7436 if (u == NULL) 7437 goto onError; 7438 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7439 if (v == NULL) 7440 goto onError; 7441 7442 /* Shortcuts */ 7443 if (v == unicode_empty) { 7444 Py_DECREF(v); 7445 return (PyObject *)u; 7446 } 7447 if (u == unicode_empty) { 7448 Py_DECREF(u); 7449 return (PyObject *)v; 7450 } 7451 7452 /* Concat the two Unicode strings */ 7453 w = _PyUnicode_New(u->length + v->length); 7454 if (w == NULL) 7455 goto onError; 7456 Py_UNICODE_COPY(w->str, u->str, u->length); 7457 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7458 7459 Py_DECREF(u); 7460 Py_DECREF(v); 7461 return (PyObject *)w; 7462 7463 onError: 7464 Py_XDECREF(u); 7465 Py_XDECREF(v); 7466 return NULL; 7467} 7468 7469void 7470PyUnicode_Append(PyObject **pleft, PyObject *right) 7471{ 7472 PyObject *new; 7473 if (*pleft == NULL) 7474 return; 7475 if (right == NULL || !PyUnicode_Check(*pleft)) { 7476 Py_DECREF(*pleft); 7477 *pleft = NULL; 7478 return; 7479 } 7480 new = PyUnicode_Concat(*pleft, right); 7481 Py_DECREF(*pleft); 7482 *pleft = new; 7483} 7484 7485void 7486PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7487{ 7488 PyUnicode_Append(pleft, right); 7489 Py_XDECREF(right); 7490} 7491 7492PyDoc_STRVAR(count__doc__, 7493 "S.count(sub[, start[, end]]) -> int\n\ 7494\n\ 7495Return the number of non-overlapping occurrences of substring sub in\n\ 7496string S[start:end]. 
Optional arguments start and end are\n\ 7497interpreted as in slice notation."); 7498 7499static PyObject * 7500unicode_count(PyUnicodeObject *self, PyObject *args) 7501{ 7502 PyUnicodeObject *substring; 7503 Py_ssize_t start = 0; 7504 Py_ssize_t end = PY_SSIZE_T_MAX; 7505 PyObject *result; 7506 7507 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 7508 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7509 return NULL; 7510 7511 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7512 (PyObject *)substring); 7513 if (substring == NULL) 7514 return NULL; 7515 7516 ADJUST_INDICES(start, end, self->length); 7517 result = PyLong_FromSsize_t( 7518 stringlib_count(self->str + start, end - start, 7519 substring->str, substring->length, 7520 PY_SSIZE_T_MAX) 7521 ); 7522 7523 Py_DECREF(substring); 7524 7525 return result; 7526} 7527 7528PyDoc_STRVAR(encode__doc__, 7529 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 7530\n\ 7531Encode S using the codec registered for encoding. Default encoding\n\ 7532is 'utf-8'. errors may be given to set a different error\n\ 7533handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7534a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 7535'xmlcharrefreplace' as well as any other name registered with\n\ 7536codecs.register_error that can handle UnicodeEncodeErrors."); 7537 7538static PyObject * 7539unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7540{ 7541 static char *kwlist[] = {"encoding", "errors", 0}; 7542 char *encoding = NULL; 7543 char *errors = NULL; 7544 7545 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7546 kwlist, &encoding, &errors)) 7547 return NULL; 7548 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7549} 7550 7551PyDoc_STRVAR(expandtabs__doc__, 7552 "S.expandtabs([tabsize]) -> str\n\ 7553\n\ 7554Return a copy of S where all tab characters are expanded using spaces.\n\ 7555If tabsize is not given, a tab size of 8 characters is assumed."); 7556 7557static PyObject* 7558unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7559{ 7560 Py_UNICODE *e; 7561 Py_UNICODE *p; 7562 Py_UNICODE *q; 7563 Py_UNICODE *qe; 7564 Py_ssize_t i, j, incr; 7565 PyUnicodeObject *u; 7566 int tabsize = 8; 7567 7568 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7569 return NULL; 7570 7571 /* First pass: determine size of output string */ 7572 i = 0; /* chars up to and including most recent \n or \r */ 7573 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7574 e = self->str + self->length; /* end of input */ 7575 for (p = self->str; p < e; p++) 7576 if (*p == '\t') { 7577 if (tabsize > 0) { 7578 incr = tabsize - (j % tabsize); /* cannot overflow */ 7579 if (j > PY_SSIZE_T_MAX - incr) 7580 goto overflow1; 7581 j += incr; 7582 } 7583 } 7584 else { 7585 if (j > PY_SSIZE_T_MAX - 1) 7586 goto overflow1; 7587 j++; 7588 if (*p == '\n' || *p == '\r') { 7589 if (i > PY_SSIZE_T_MAX - j) 7590 goto overflow1; 7591 i += j; 7592 j = 0; 7593 } 7594 } 7595 7596 if (i > PY_SSIZE_T_MAX - j) 7597 goto overflow1; 7598 7599 /* Second pass: create output string and fill it */ 
7600 u = _PyUnicode_New(i + j); 7601 if (!u) 7602 return NULL; 7603 7604 j = 0; /* same as in first pass */ 7605 q = u->str; /* next output char */ 7606 qe = u->str + u->length; /* end of output */ 7607 7608 for (p = self->str; p < e; p++) 7609 if (*p == '\t') { 7610 if (tabsize > 0) { 7611 i = tabsize - (j % tabsize); 7612 j += i; 7613 while (i--) { 7614 if (q >= qe) 7615 goto overflow2; 7616 *q++ = ' '; 7617 } 7618 } 7619 } 7620 else { 7621 if (q >= qe) 7622 goto overflow2; 7623 *q++ = *p; 7624 j++; 7625 if (*p == '\n' || *p == '\r') 7626 j = 0; 7627 } 7628 7629 return (PyObject*) u; 7630 7631 overflow2: 7632 Py_DECREF(u); 7633 overflow1: 7634 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7635 return NULL; 7636} 7637 7638PyDoc_STRVAR(find__doc__, 7639 "S.find(sub[, start[, end]]) -> int\n\ 7640\n\ 7641Return the lowest index in S where substring sub is found,\n\ 7642such that sub is contained within s[start:end]. Optional\n\ 7643arguments start and end are interpreted as in slice notation.\n\ 7644\n\ 7645Return -1 on failure."); 7646 7647static PyObject * 7648unicode_find(PyUnicodeObject *self, PyObject *args) 7649{ 7650 PyObject *substring; 7651 Py_ssize_t start; 7652 Py_ssize_t end; 7653 Py_ssize_t result; 7654 7655 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7656 return NULL; 7657 7658 result = stringlib_find_slice( 7659 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7660 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7661 start, end 7662 ); 7663 7664 Py_DECREF(substring); 7665 7666 return PyLong_FromSsize_t(result); 7667} 7668 7669static PyObject * 7670unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7671{ 7672 if (index < 0 || index >= self->length) { 7673 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7674 return NULL; 7675 } 7676 7677 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7678} 7679 7680/* Believe it or not, this produces the same value for ASCII strings 
7681 as string_hash(). */ 7682static Py_hash_t 7683unicode_hash(PyUnicodeObject *self) 7684{ 7685 Py_ssize_t len; 7686 Py_UNICODE *p; 7687 Py_hash_t x; 7688 7689 if (self->hash != -1) 7690 return self->hash; 7691 len = Py_SIZE(self); 7692 p = self->str; 7693 x = *p << 7; 7694 while (--len >= 0) 7695 x = (1000003*x) ^ *p++; 7696 x ^= Py_SIZE(self); 7697 if (x == -1) 7698 x = -2; 7699 self->hash = x; 7700 return x; 7701} 7702 7703PyDoc_STRVAR(index__doc__, 7704 "S.index(sub[, start[, end]]) -> int\n\ 7705\n\ 7706Like S.find() but raise ValueError when the substring is not found."); 7707 7708static PyObject * 7709unicode_index(PyUnicodeObject *self, PyObject *args) 7710{ 7711 Py_ssize_t result; 7712 PyObject *substring; 7713 Py_ssize_t start; 7714 Py_ssize_t end; 7715 7716 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7717 return NULL; 7718 7719 result = stringlib_find_slice( 7720 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7721 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7722 start, end 7723 ); 7724 7725 Py_DECREF(substring); 7726 7727 if (result < 0) { 7728 PyErr_SetString(PyExc_ValueError, "substring not found"); 7729 return NULL; 7730 } 7731 7732 return PyLong_FromSsize_t(result); 7733} 7734 7735PyDoc_STRVAR(islower__doc__, 7736 "S.islower() -> bool\n\ 7737\n\ 7738Return True if all cased characters in S are lowercase and there is\n\ 7739at least one cased character in S, False otherwise."); 7740 7741static PyObject* 7742unicode_islower(PyUnicodeObject *self) 7743{ 7744 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7745 register const Py_UNICODE *e; 7746 int cased; 7747 7748 /* Shortcut for single character strings */ 7749 if (PyUnicode_GET_SIZE(self) == 1) 7750 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7751 7752 /* Special case for empty strings */ 7753 if (PyUnicode_GET_SIZE(self) == 0) 7754 return PyBool_FromLong(0); 7755 7756 e = p + PyUnicode_GET_SIZE(self); 7757 cased = 0; 7758 for (; p < e; p++) { 
7759 register const Py_UNICODE ch = *p; 7760 7761 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7762 return PyBool_FromLong(0); 7763 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7764 cased = 1; 7765 } 7766 return PyBool_FromLong(cased); 7767} 7768 7769PyDoc_STRVAR(isupper__doc__, 7770 "S.isupper() -> bool\n\ 7771\n\ 7772Return True if all cased characters in S are uppercase and there is\n\ 7773at least one cased character in S, False otherwise."); 7774 7775static PyObject* 7776unicode_isupper(PyUnicodeObject *self) 7777{ 7778 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7779 register const Py_UNICODE *e; 7780 int cased; 7781 7782 /* Shortcut for single character strings */ 7783 if (PyUnicode_GET_SIZE(self) == 1) 7784 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7785 7786 /* Special case for empty strings */ 7787 if (PyUnicode_GET_SIZE(self) == 0) 7788 return PyBool_FromLong(0); 7789 7790 e = p + PyUnicode_GET_SIZE(self); 7791 cased = 0; 7792 for (; p < e; p++) { 7793 register const Py_UNICODE ch = *p; 7794 7795 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7796 return PyBool_FromLong(0); 7797 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7798 cased = 1; 7799 } 7800 return PyBool_FromLong(cased); 7801} 7802 7803PyDoc_STRVAR(istitle__doc__, 7804 "S.istitle() -> bool\n\ 7805\n\ 7806Return True if S is a titlecased string and there is at least one\n\ 7807character in S, i.e. 
upper- and titlecase characters may only\n\ 7808follow uncased characters and lowercase characters only cased ones.\n\ 7809Return False otherwise."); 7810 7811static PyObject* 7812unicode_istitle(PyUnicodeObject *self) 7813{ 7814 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7815 register const Py_UNICODE *e; 7816 int cased, previous_is_cased; 7817 7818 /* Shortcut for single character strings */ 7819 if (PyUnicode_GET_SIZE(self) == 1) 7820 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7821 (Py_UNICODE_ISUPPER(*p) != 0)); 7822 7823 /* Special case for empty strings */ 7824 if (PyUnicode_GET_SIZE(self) == 0) 7825 return PyBool_FromLong(0); 7826 7827 e = p + PyUnicode_GET_SIZE(self); 7828 cased = 0; 7829 previous_is_cased = 0; 7830 for (; p < e; p++) { 7831 register const Py_UNICODE ch = *p; 7832 7833 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7834 if (previous_is_cased) 7835 return PyBool_FromLong(0); 7836 previous_is_cased = 1; 7837 cased = 1; 7838 } 7839 else if (Py_UNICODE_ISLOWER(ch)) { 7840 if (!previous_is_cased) 7841 return PyBool_FromLong(0); 7842 previous_is_cased = 1; 7843 cased = 1; 7844 } 7845 else 7846 previous_is_cased = 0; 7847 } 7848 return PyBool_FromLong(cased); 7849} 7850 7851PyDoc_STRVAR(isspace__doc__, 7852 "S.isspace() -> bool\n\ 7853\n\ 7854Return True if all characters in S are whitespace\n\ 7855and there is at least one character in S, False otherwise."); 7856 7857static PyObject* 7858unicode_isspace(PyUnicodeObject *self) 7859{ 7860 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7861 register const Py_UNICODE *e; 7862 7863 /* Shortcut for single character strings */ 7864 if (PyUnicode_GET_SIZE(self) == 1 && 7865 Py_UNICODE_ISSPACE(*p)) 7866 return PyBool_FromLong(1); 7867 7868 /* Special case for empty strings */ 7869 if (PyUnicode_GET_SIZE(self) == 0) 7870 return PyBool_FromLong(0); 7871 7872 e = p + PyUnicode_GET_SIZE(self); 7873 for (; p < e; p++) { 7874 if (!Py_UNICODE_ISSPACE(*p)) 7875 
return PyBool_FromLong(0); 7876 } 7877 return PyBool_FromLong(1); 7878} 7879 7880PyDoc_STRVAR(isalpha__doc__, 7881 "S.isalpha() -> bool\n\ 7882\n\ 7883Return True if all characters in S are alphabetic\n\ 7884and there is at least one character in S, False otherwise."); 7885 7886static PyObject* 7887unicode_isalpha(PyUnicodeObject *self) 7888{ 7889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7890 register const Py_UNICODE *e; 7891 7892 /* Shortcut for single character strings */ 7893 if (PyUnicode_GET_SIZE(self) == 1 && 7894 Py_UNICODE_ISALPHA(*p)) 7895 return PyBool_FromLong(1); 7896 7897 /* Special case for empty strings */ 7898 if (PyUnicode_GET_SIZE(self) == 0) 7899 return PyBool_FromLong(0); 7900 7901 e = p + PyUnicode_GET_SIZE(self); 7902 for (; p < e; p++) { 7903 if (!Py_UNICODE_ISALPHA(*p)) 7904 return PyBool_FromLong(0); 7905 } 7906 return PyBool_FromLong(1); 7907} 7908 7909PyDoc_STRVAR(isalnum__doc__, 7910 "S.isalnum() -> bool\n\ 7911\n\ 7912Return True if all characters in S are alphanumeric\n\ 7913and there is at least one character in S, False otherwise."); 7914 7915static PyObject* 7916unicode_isalnum(PyUnicodeObject *self) 7917{ 7918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7919 register const Py_UNICODE *e; 7920 7921 /* Shortcut for single character strings */ 7922 if (PyUnicode_GET_SIZE(self) == 1 && 7923 Py_UNICODE_ISALNUM(*p)) 7924 return PyBool_FromLong(1); 7925 7926 /* Special case for empty strings */ 7927 if (PyUnicode_GET_SIZE(self) == 0) 7928 return PyBool_FromLong(0); 7929 7930 e = p + PyUnicode_GET_SIZE(self); 7931 for (; p < e; p++) { 7932 if (!Py_UNICODE_ISALNUM(*p)) 7933 return PyBool_FromLong(0); 7934 } 7935 return PyBool_FromLong(1); 7936} 7937 7938PyDoc_STRVAR(isdecimal__doc__, 7939 "S.isdecimal() -> bool\n\ 7940\n\ 7941Return True if there are only decimal characters in S,\n\ 7942False otherwise."); 7943 7944static PyObject* 7945unicode_isdecimal(PyUnicodeObject *self) 7946{ 7947 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7948 register const Py_UNICODE *e; 7949 7950 /* Shortcut for single character strings */ 7951 if (PyUnicode_GET_SIZE(self) == 1 && 7952 Py_UNICODE_ISDECIMAL(*p)) 7953 return PyBool_FromLong(1); 7954 7955 /* Special case for empty strings */ 7956 if (PyUnicode_GET_SIZE(self) == 0) 7957 return PyBool_FromLong(0); 7958 7959 e = p + PyUnicode_GET_SIZE(self); 7960 for (; p < e; p++) { 7961 if (!Py_UNICODE_ISDECIMAL(*p)) 7962 return PyBool_FromLong(0); 7963 } 7964 return PyBool_FromLong(1); 7965} 7966 7967PyDoc_STRVAR(isdigit__doc__, 7968 "S.isdigit() -> bool\n\ 7969\n\ 7970Return True if all characters in S are digits\n\ 7971and there is at least one character in S, False otherwise."); 7972 7973static PyObject* 7974unicode_isdigit(PyUnicodeObject *self) 7975{ 7976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7977 register const Py_UNICODE *e; 7978 7979 /* Shortcut for single character strings */ 7980 if (PyUnicode_GET_SIZE(self) == 1 && 7981 Py_UNICODE_ISDIGIT(*p)) 7982 return PyBool_FromLong(1); 7983 7984 /* Special case for empty strings */ 7985 if (PyUnicode_GET_SIZE(self) == 0) 7986 return PyBool_FromLong(0); 7987 7988 e = p + PyUnicode_GET_SIZE(self); 7989 for (; p < e; p++) { 7990 if (!Py_UNICODE_ISDIGIT(*p)) 7991 return PyBool_FromLong(0); 7992 } 7993 return PyBool_FromLong(1); 7994} 7995 7996PyDoc_STRVAR(isnumeric__doc__, 7997 "S.isnumeric() -> bool\n\ 7998\n\ 7999Return True if there are only numeric characters in S,\n\ 8000False otherwise."); 8001 8002static PyObject* 8003unicode_isnumeric(PyUnicodeObject *self) 8004{ 8005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 8006 register const Py_UNICODE *e; 8007 8008 /* Shortcut for single character strings */ 8009 if (PyUnicode_GET_SIZE(self) == 1 && 8010 Py_UNICODE_ISNUMERIC(*p)) 8011 return PyBool_FromLong(1); 8012 8013 /* Special case for empty strings */ 8014 if (PyUnicode_GET_SIZE(self) == 0) 8015 return PyBool_FromLong(0); 8016 8017 e = p 
+ PyUnicode_GET_SIZE(self); 8018 for (; p < e; p++) { 8019 if (!Py_UNICODE_ISNUMERIC(*p)) 8020 return PyBool_FromLong(0); 8021 } 8022 return PyBool_FromLong(1); 8023} 8024 8025int 8026PyUnicode_IsIdentifier(PyObject *self) 8027{ 8028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 8029 register const Py_UNICODE *e; 8030 8031 /* Special case for empty strings */ 8032 if (PyUnicode_GET_SIZE(self) == 0) 8033 return 0; 8034 8035 /* PEP 3131 says that the first character must be in 8036 XID_Start and subsequent characters in XID_Continue, 8037 and for the ASCII range, the 2.x rules apply (i.e 8038 start with letters and underscore, continue with 8039 letters, digits, underscore). However, given the current 8040 definition of XID_Start and XID_Continue, it is sufficient 8041 to check just for these, except that _ must be allowed 8042 as starting an identifier. */ 8043 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 8044 return 0; 8045 8046 e = p + PyUnicode_GET_SIZE(self); 8047 for (p++; p < e; p++) { 8048 if (!_PyUnicode_IsXidContinue(*p)) 8049 return 0; 8050 } 8051 return 1; 8052} 8053 8054PyDoc_STRVAR(isidentifier__doc__, 8055 "S.isidentifier() -> bool\n\ 8056\n\ 8057Return True if S is a valid identifier according\n\ 8058to the language definition."); 8059 8060static PyObject* 8061unicode_isidentifier(PyObject *self) 8062{ 8063 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 8064} 8065 8066PyDoc_STRVAR(isprintable__doc__, 8067 "S.isprintable() -> bool\n\ 8068\n\ 8069Return True if all characters in S are considered\n\ 8070printable in repr() or S is empty, False otherwise."); 8071 8072static PyObject* 8073unicode_isprintable(PyObject *self) 8074{ 8075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 8076 register const Py_UNICODE *e; 8077 8078 /* Shortcut for single character strings */ 8079 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 8080 Py_RETURN_TRUE; 8081 } 8082 8083 e = p + 
PyUnicode_GET_SIZE(self); 8084 for (; p < e; p++) { 8085 if (!Py_UNICODE_ISPRINTABLE(*p)) { 8086 Py_RETURN_FALSE; 8087 } 8088 } 8089 Py_RETURN_TRUE; 8090} 8091 8092PyDoc_STRVAR(join__doc__, 8093 "S.join(iterable) -> str\n\ 8094\n\ 8095Return a string which is the concatenation of the strings in the\n\ 8096iterable. The separator between elements is S."); 8097 8098static PyObject* 8099unicode_join(PyObject *self, PyObject *data) 8100{ 8101 return PyUnicode_Join(self, data); 8102} 8103 8104static Py_ssize_t 8105unicode_length(PyUnicodeObject *self) 8106{ 8107 return self->length; 8108} 8109 8110PyDoc_STRVAR(ljust__doc__, 8111 "S.ljust(width[, fillchar]) -> str\n\ 8112\n\ 8113Return S left-justified in a Unicode string of length width. Padding is\n\ 8114done using the specified fill character (default is a space)."); 8115 8116static PyObject * 8117unicode_ljust(PyUnicodeObject *self, PyObject *args) 8118{ 8119 Py_ssize_t width; 8120 Py_UNICODE fillchar = ' '; 8121 8122 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 8123 return NULL; 8124 8125 if (self->length >= width && PyUnicode_CheckExact(self)) { 8126 Py_INCREF(self); 8127 return (PyObject*) self; 8128 } 8129 8130 return (PyObject*) pad(self, 0, width - self->length, fillchar); 8131} 8132 8133PyDoc_STRVAR(lower__doc__, 8134 "S.lower() -> str\n\ 8135\n\ 8136Return a copy of the string S converted to lowercase."); 8137 8138static PyObject* 8139unicode_lower(PyUnicodeObject *self) 8140{ 8141 return fixup(self, fixlower); 8142} 8143 8144#define LEFTSTRIP 0 8145#define RIGHTSTRIP 1 8146#define BOTHSTRIP 2 8147 8148/* Arrays indexed by above */ 8149static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 8150 8151#define STRIPNAME(i) (stripformat[i]+3) 8152 8153/* externally visible for str.strip(unicode) */ 8154PyObject * 8155_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 8156{ 8157 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8158 Py_ssize_t len = 
PyUnicode_GET_SIZE(self); 8159 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 8160 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 8161 Py_ssize_t i, j; 8162 8163 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 8164 8165 i = 0; 8166 if (striptype != RIGHTSTRIP) { 8167 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 8168 i++; 8169 } 8170 } 8171 8172 j = len; 8173 if (striptype != LEFTSTRIP) { 8174 do { 8175 j--; 8176 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 8177 j++; 8178 } 8179 8180 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8181 Py_INCREF(self); 8182 return (PyObject*)self; 8183 } 8184 else 8185 return PyUnicode_FromUnicode(s+i, j-i); 8186} 8187 8188 8189static PyObject * 8190do_strip(PyUnicodeObject *self, int striptype) 8191{ 8192 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8193 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 8194 8195 i = 0; 8196 if (striptype != RIGHTSTRIP) { 8197 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 8198 i++; 8199 } 8200 } 8201 8202 j = len; 8203 if (striptype != LEFTSTRIP) { 8204 do { 8205 j--; 8206 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 8207 j++; 8208 } 8209 8210 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8211 Py_INCREF(self); 8212 return (PyObject*)self; 8213 } 8214 else 8215 return PyUnicode_FromUnicode(s+i, j-i); 8216} 8217 8218 8219static PyObject * 8220do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 8221{ 8222 PyObject *sep = NULL; 8223 8224 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 8225 return NULL; 8226 8227 if (sep != NULL && sep != Py_None) { 8228 if (PyUnicode_Check(sep)) 8229 return _PyUnicode_XStrip(self, striptype, sep); 8230 else { 8231 PyErr_Format(PyExc_TypeError, 8232 "%s arg must be None or str", 8233 STRIPNAME(striptype)); 8234 return NULL; 8235 } 8236 } 8237 8238 return do_strip(self, striptype); 8239} 8240 8241 8242PyDoc_STRVAR(strip__doc__, 8243 "S.strip([chars]) -> str\n\ 8244\n\ 8245Return a 
copy of the string S with leading and trailing\n\ 8246whitespace removed.\n\ 8247If chars is given and not None, remove characters in chars instead."); 8248 8249static PyObject * 8250unicode_strip(PyUnicodeObject *self, PyObject *args) 8251{ 8252 if (PyTuple_GET_SIZE(args) == 0) 8253 return do_strip(self, BOTHSTRIP); /* Common case */ 8254 else 8255 return do_argstrip(self, BOTHSTRIP, args); 8256} 8257 8258 8259PyDoc_STRVAR(lstrip__doc__, 8260 "S.lstrip([chars]) -> str\n\ 8261\n\ 8262Return a copy of the string S with leading whitespace removed.\n\ 8263If chars is given and not None, remove characters in chars instead."); 8264 8265static PyObject * 8266unicode_lstrip(PyUnicodeObject *self, PyObject *args) 8267{ 8268 if (PyTuple_GET_SIZE(args) == 0) 8269 return do_strip(self, LEFTSTRIP); /* Common case */ 8270 else 8271 return do_argstrip(self, LEFTSTRIP, args); 8272} 8273 8274 8275PyDoc_STRVAR(rstrip__doc__, 8276 "S.rstrip([chars]) -> str\n\ 8277\n\ 8278Return a copy of the string S with trailing whitespace removed.\n\ 8279If chars is given and not None, remove characters in chars instead."); 8280 8281static PyObject * 8282unicode_rstrip(PyUnicodeObject *self, PyObject *args) 8283{ 8284 if (PyTuple_GET_SIZE(args) == 0) 8285 return do_strip(self, RIGHTSTRIP); /* Common case */ 8286 else 8287 return do_argstrip(self, RIGHTSTRIP, args); 8288} 8289 8290 8291static PyObject* 8292unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 8293{ 8294 PyUnicodeObject *u; 8295 Py_UNICODE *p; 8296 Py_ssize_t nchars; 8297 size_t nbytes; 8298 8299 if (len < 1) { 8300 Py_INCREF(unicode_empty); 8301 return (PyObject *)unicode_empty; 8302 } 8303 8304 if (len == 1 && PyUnicode_CheckExact(str)) { 8305 /* no repeat, return original string */ 8306 Py_INCREF(str); 8307 return (PyObject*) str; 8308 } 8309 8310 /* ensure # of chars needed doesn't overflow int and # of bytes 8311 * needed doesn't overflow size_t 8312 */ 8313 nchars = len * str->length; 8314 if (nchars / len != str->length) { 
8315 PyErr_SetString(PyExc_OverflowError, 8316 "repeated string is too long"); 8317 return NULL; 8318 } 8319 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 8320 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 8321 PyErr_SetString(PyExc_OverflowError, 8322 "repeated string is too long"); 8323 return NULL; 8324 } 8325 u = _PyUnicode_New(nchars); 8326 if (!u) 8327 return NULL; 8328 8329 p = u->str; 8330 8331 if (str->length == 1) { 8332 Py_UNICODE_FILL(p, str->str[0], len); 8333 } else { 8334 Py_ssize_t done = str->length; /* number of characters copied this far */ 8335 Py_UNICODE_COPY(p, str->str, str->length); 8336 while (done < nchars) { 8337 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 8338 Py_UNICODE_COPY(p+done, p, n); 8339 done += n; 8340 } 8341 } 8342 8343 return (PyObject*) u; 8344} 8345 8346PyObject * 8347PyUnicode_Replace(PyObject *obj, 8348 PyObject *subobj, 8349 PyObject *replobj, 8350 Py_ssize_t maxcount) 8351{ 8352 PyObject *self; 8353 PyObject *str1; 8354 PyObject *str2; 8355 PyObject *result; 8356 8357 self = PyUnicode_FromObject(obj); 8358 if (self == NULL) 8359 return NULL; 8360 str1 = PyUnicode_FromObject(subobj); 8361 if (str1 == NULL) { 8362 Py_DECREF(self); 8363 return NULL; 8364 } 8365 str2 = PyUnicode_FromObject(replobj); 8366 if (str2 == NULL) { 8367 Py_DECREF(self); 8368 Py_DECREF(str1); 8369 return NULL; 8370 } 8371 result = replace((PyUnicodeObject *)self, 8372 (PyUnicodeObject *)str1, 8373 (PyUnicodeObject *)str2, 8374 maxcount); 8375 Py_DECREF(self); 8376 Py_DECREF(str1); 8377 Py_DECREF(str2); 8378 return result; 8379} 8380 8381PyDoc_STRVAR(replace__doc__, 8382 "S.replace(old, new[, count]) -> str\n\ 8383\n\ 8384Return a copy of S with all occurrences of substring\n\ 8385old replaced by new. 
If the optional argument count is\n\ 8386given, only the first count occurrences are replaced."); 8387 8388static PyObject* 8389unicode_replace(PyUnicodeObject *self, PyObject *args) 8390{ 8391 PyUnicodeObject *str1; 8392 PyUnicodeObject *str2; 8393 Py_ssize_t maxcount = -1; 8394 PyObject *result; 8395 8396 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8397 return NULL; 8398 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8399 if (str1 == NULL) 8400 return NULL; 8401 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8402 if (str2 == NULL) { 8403 Py_DECREF(str1); 8404 return NULL; 8405 } 8406 8407 result = replace(self, str1, str2, maxcount); 8408 8409 Py_DECREF(str1); 8410 Py_DECREF(str2); 8411 return result; 8412} 8413 8414static PyObject * 8415unicode_repr(PyObject *unicode) 8416{ 8417 PyObject *repr; 8418 Py_UNICODE *p; 8419 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8420 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8421 8422 /* XXX(nnorwitz): rather than over-allocating, it would be 8423 better to choose a different scheme. Perhaps scan the 8424 first N-chars of the string and allocate based on that size. 8425 */ 8426 /* Initial allocation is based on the longest-possible unichr 8427 escape. 8428 8429 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8430 unichr, so in this case it's the longest unichr escape. In 8431 narrow (UTF-16) builds this is five chars per source unichr 8432 since there are two unichrs in the surrogate pair, so in narrow 8433 (UTF-16) builds it's not the longest unichr escape. 8434 8435 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8436 so in the narrow (UTF-16) build case it's the longest unichr 8437 escape. 
8438 */ 8439 8440 repr = PyUnicode_FromUnicode(NULL, 8441 2 /* quotes */ 8442#ifdef Py_UNICODE_WIDE 8443 + 10*size 8444#else 8445 + 6*size 8446#endif 8447 + 1); 8448 if (repr == NULL) 8449 return NULL; 8450 8451 p = PyUnicode_AS_UNICODE(repr); 8452 8453 /* Add quote */ 8454 *p++ = (findchar(s, size, '\'') && 8455 !findchar(s, size, '"')) ? '"' : '\''; 8456 while (size-- > 0) { 8457 Py_UNICODE ch = *s++; 8458 8459 /* Escape quotes and backslashes */ 8460 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8461 *p++ = '\\'; 8462 *p++ = ch; 8463 continue; 8464 } 8465 8466 /* Map special whitespace to '\t', \n', '\r' */ 8467 if (ch == '\t') { 8468 *p++ = '\\'; 8469 *p++ = 't'; 8470 } 8471 else if (ch == '\n') { 8472 *p++ = '\\'; 8473 *p++ = 'n'; 8474 } 8475 else if (ch == '\r') { 8476 *p++ = '\\'; 8477 *p++ = 'r'; 8478 } 8479 8480 /* Map non-printable US ASCII to '\xhh' */ 8481 else if (ch < ' ' || ch == 0x7F) { 8482 *p++ = '\\'; 8483 *p++ = 'x'; 8484 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8485 *p++ = hexdigits[ch & 0x000F]; 8486 } 8487 8488 /* Copy ASCII characters as-is */ 8489 else if (ch < 0x7F) { 8490 *p++ = ch; 8491 } 8492 8493 /* Non-ASCII characters */ 8494 else { 8495 Py_UCS4 ucs = ch; 8496 8497#ifndef Py_UNICODE_WIDE 8498 Py_UNICODE ch2 = 0; 8499 /* Get code point from surrogate pair */ 8500 if (size > 0) { 8501 ch2 = *s; 8502 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8503 && ch2 <= 0xDFFF) { 8504 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8505 + 0x00010000; 8506 s++; 8507 size--; 8508 } 8509 } 8510#endif 8511 /* Map Unicode whitespace and control characters 8512 (categories Z* and C* except ASCII space) 8513 */ 8514 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8515 /* Map 8-bit characters to '\xhh' */ 8516 if (ucs <= 0xff) { 8517 *p++ = '\\'; 8518 *p++ = 'x'; 8519 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8520 *p++ = hexdigits[ch & 0x000F]; 8521 } 8522 /* Map 21-bit characters to '\U00xxxxxx' */ 8523 else if (ucs >= 0x10000) { 8524 *p++ = '\\'; 8525 
*p++ = 'U'; 8526 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8527 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8528 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8529 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8530 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8531 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8532 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8533 *p++ = hexdigits[ucs & 0x0000000F]; 8534 } 8535 /* Map 16-bit characters to '\uxxxx' */ 8536 else { 8537 *p++ = '\\'; 8538 *p++ = 'u'; 8539 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8540 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8541 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8542 *p++ = hexdigits[ucs & 0x000F]; 8543 } 8544 } 8545 /* Copy characters as-is */ 8546 else { 8547 *p++ = ch; 8548#ifndef Py_UNICODE_WIDE 8549 if (ucs >= 0x10000) 8550 *p++ = ch2; 8551#endif 8552 } 8553 } 8554 } 8555 /* Add quote */ 8556 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8557 8558 *p = '\0'; 8559 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8560 return repr; 8561} 8562 8563PyDoc_STRVAR(rfind__doc__, 8564 "S.rfind(sub[, start[, end]]) -> int\n\ 8565\n\ 8566Return the highest index in S where substring sub is found,\n\ 8567such that sub is contained within s[start:end]. 
Optional\n\ 8568arguments start and end are interpreted as in slice notation.\n\ 8569\n\ 8570Return -1 on failure."); 8571 8572static PyObject * 8573unicode_rfind(PyUnicodeObject *self, PyObject *args) 8574{ 8575 PyObject *substring; 8576 Py_ssize_t start; 8577 Py_ssize_t end; 8578 Py_ssize_t result; 8579 8580 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8581 return NULL; 8582 8583 result = stringlib_rfind_slice( 8584 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8585 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8586 start, end 8587 ); 8588 8589 Py_DECREF(substring); 8590 8591 return PyLong_FromSsize_t(result); 8592} 8593 8594PyDoc_STRVAR(rindex__doc__, 8595 "S.rindex(sub[, start[, end]]) -> int\n\ 8596\n\ 8597Like S.rfind() but raise ValueError when the substring is not found."); 8598 8599static PyObject * 8600unicode_rindex(PyUnicodeObject *self, PyObject *args) 8601{ 8602 PyObject *substring; 8603 Py_ssize_t start; 8604 Py_ssize_t end; 8605 Py_ssize_t result; 8606 8607 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8608 return NULL; 8609 8610 result = stringlib_rfind_slice( 8611 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8612 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8613 start, end 8614 ); 8615 8616 Py_DECREF(substring); 8617 8618 if (result < 0) { 8619 PyErr_SetString(PyExc_ValueError, "substring not found"); 8620 return NULL; 8621 } 8622 return PyLong_FromSsize_t(result); 8623} 8624 8625PyDoc_STRVAR(rjust__doc__, 8626 "S.rjust(width[, fillchar]) -> str\n\ 8627\n\ 8628Return S right-justified in a string of length width. 
Padding is\n\ 8629done using the specified fill character (default is a space)."); 8630 8631static PyObject * 8632unicode_rjust(PyUnicodeObject *self, PyObject *args) 8633{ 8634 Py_ssize_t width; 8635 Py_UNICODE fillchar = ' '; 8636 8637 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8638 return NULL; 8639 8640 if (self->length >= width && PyUnicode_CheckExact(self)) { 8641 Py_INCREF(self); 8642 return (PyObject*) self; 8643 } 8644 8645 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8646} 8647 8648PyObject * 8649PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 8650{ 8651 PyObject *result; 8652 8653 s = PyUnicode_FromObject(s); 8654 if (s == NULL) 8655 return NULL; 8656 if (sep != NULL) { 8657 sep = PyUnicode_FromObject(sep); 8658 if (sep == NULL) { 8659 Py_DECREF(s); 8660 return NULL; 8661 } 8662 } 8663 8664 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8665 8666 Py_DECREF(s); 8667 Py_XDECREF(sep); 8668 return result; 8669} 8670 8671PyDoc_STRVAR(split__doc__, 8672 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8673\n\ 8674Return a list of the words in S, using sep as the\n\ 8675delimiter string. If maxsplit is given, at most maxsplit\n\ 8676splits are done. 
If sep is not specified or is None, any\n\ 8677whitespace string is a separator and empty strings are\n\ 8678removed from the result."); 8679 8680static PyObject* 8681unicode_split(PyUnicodeObject *self, PyObject *args) 8682{ 8683 PyObject *substring = Py_None; 8684 Py_ssize_t maxcount = -1; 8685 8686 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8687 return NULL; 8688 8689 if (substring == Py_None) 8690 return split(self, NULL, maxcount); 8691 else if (PyUnicode_Check(substring)) 8692 return split(self, (PyUnicodeObject *)substring, maxcount); 8693 else 8694 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8695} 8696 8697PyObject * 8698PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8699{ 8700 PyObject* str_obj; 8701 PyObject* sep_obj; 8702 PyObject* out; 8703 8704 str_obj = PyUnicode_FromObject(str_in); 8705 if (!str_obj) 8706 return NULL; 8707 sep_obj = PyUnicode_FromObject(sep_in); 8708 if (!sep_obj) { 8709 Py_DECREF(str_obj); 8710 return NULL; 8711 } 8712 8713 out = stringlib_partition( 8714 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8715 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8716 ); 8717 8718 Py_DECREF(sep_obj); 8719 Py_DECREF(str_obj); 8720 8721 return out; 8722} 8723 8724 8725PyObject * 8726PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8727{ 8728 PyObject* str_obj; 8729 PyObject* sep_obj; 8730 PyObject* out; 8731 8732 str_obj = PyUnicode_FromObject(str_in); 8733 if (!str_obj) 8734 return NULL; 8735 sep_obj = PyUnicode_FromObject(sep_in); 8736 if (!sep_obj) { 8737 Py_DECREF(str_obj); 8738 return NULL; 8739 } 8740 8741 out = stringlib_rpartition( 8742 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8743 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8744 ); 8745 8746 Py_DECREF(sep_obj); 8747 Py_DECREF(str_obj); 8748 8749 return out; 8750} 8751 8752PyDoc_STRVAR(partition__doc__, 8753 "S.partition(sep) -> (head, 
sep, tail)\n\ 8754\n\ 8755Search for the separator sep in S, and return the part before it,\n\ 8756the separator itself, and the part after it. If the separator is not\n\ 8757found, return S and two empty strings."); 8758 8759static PyObject* 8760unicode_partition(PyUnicodeObject *self, PyObject *separator) 8761{ 8762 return PyUnicode_Partition((PyObject *)self, separator); 8763} 8764 8765PyDoc_STRVAR(rpartition__doc__, 8766 "S.rpartition(sep) -> (head, sep, tail)\n\ 8767\n\ 8768Search for the separator sep in S, starting at the end of S, and return\n\ 8769the part before it, the separator itself, and the part after it. If the\n\ 8770separator is not found, return two empty strings and S."); 8771 8772static PyObject* 8773unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8774{ 8775 return PyUnicode_RPartition((PyObject *)self, separator); 8776} 8777 8778PyObject * 8779PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 8780{ 8781 PyObject *result; 8782 8783 s = PyUnicode_FromObject(s); 8784 if (s == NULL) 8785 return NULL; 8786 if (sep != NULL) { 8787 sep = PyUnicode_FromObject(sep); 8788 if (sep == NULL) { 8789 Py_DECREF(s); 8790 return NULL; 8791 } 8792 } 8793 8794 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8795 8796 Py_DECREF(s); 8797 Py_XDECREF(sep); 8798 return result; 8799} 8800 8801PyDoc_STRVAR(rsplit__doc__, 8802 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8803\n\ 8804Return a list of the words in S, using sep as the\n\ 8805delimiter string, starting at the end of the string and\n\ 8806working to the front. If maxsplit is given, at most maxsplit\n\ 8807splits are done. 
If sep is not specified, any whitespace string\n\ 8808is a separator."); 8809 8810static PyObject* 8811unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8812{ 8813 PyObject *substring = Py_None; 8814 Py_ssize_t maxcount = -1; 8815 8816 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8817 return NULL; 8818 8819 if (substring == Py_None) 8820 return rsplit(self, NULL, maxcount); 8821 else if (PyUnicode_Check(substring)) 8822 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8823 else 8824 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8825} 8826 8827PyDoc_STRVAR(splitlines__doc__, 8828 "S.splitlines([keepends]) -> list of strings\n\ 8829\n\ 8830Return a list of the lines in S, breaking at line boundaries.\n\ 8831Line breaks are not included in the resulting list unless keepends\n\ 8832is given and true."); 8833 8834static PyObject* 8835unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8836{ 8837 int keepends = 0; 8838 8839 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8840 return NULL; 8841 8842 return PyUnicode_Splitlines((PyObject *)self, keepends); 8843} 8844 8845static 8846PyObject *unicode_str(PyObject *self) 8847{ 8848 if (PyUnicode_CheckExact(self)) { 8849 Py_INCREF(self); 8850 return self; 8851 } else 8852 /* Subtype -- return genuine unicode string with the same value. 
*/ 8853 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8854 PyUnicode_GET_SIZE(self)); 8855} 8856 8857PyDoc_STRVAR(swapcase__doc__, 8858 "S.swapcase() -> str\n\ 8859\n\ 8860Return a copy of S with uppercase characters converted to lowercase\n\ 8861and vice versa."); 8862 8863static PyObject* 8864unicode_swapcase(PyUnicodeObject *self) 8865{ 8866 return fixup(self, fixswapcase); 8867} 8868 8869PyDoc_STRVAR(maketrans__doc__, 8870 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8871\n\ 8872Return a translation table usable for str.translate().\n\ 8873If there is only one argument, it must be a dictionary mapping Unicode\n\ 8874ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8875Character keys will be then converted to ordinals.\n\ 8876If there are two arguments, they must be strings of equal length, and\n\ 8877in the resulting dictionary, each character in x will be mapped to the\n\ 8878character at the same position in y. If there is a third argument, it\n\ 8879must be a string, whose characters will be mapped to None in the result."); 8880 8881static PyObject* 8882unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8883{ 8884 PyObject *x, *y = NULL, *z = NULL; 8885 PyObject *new = NULL, *key, *value; 8886 Py_ssize_t i = 0; 8887 int res; 8888 8889 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8890 return NULL; 8891 new = PyDict_New(); 8892 if (!new) 8893 return NULL; 8894 if (y != NULL) { 8895 /* x must be a string too, of equal length */ 8896 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8897 if (!PyUnicode_Check(x)) { 8898 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8899 "be a string if there is a second argument"); 8900 goto err; 8901 } 8902 if (PyUnicode_GET_SIZE(x) != ylen) { 8903 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8904 "arguments must have equal length"); 8905 goto err; 8906 } 8907 /* create entries for translating chars in x to those in y */ 8908 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 8909 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8910 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8911 if (!key || !value) 8912 goto err; 8913 res = PyDict_SetItem(new, key, value); 8914 Py_DECREF(key); 8915 Py_DECREF(value); 8916 if (res < 0) 8917 goto err; 8918 } 8919 /* create entries for deleting chars in z */ 8920 if (z != NULL) { 8921 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8922 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8923 if (!key) 8924 goto err; 8925 res = PyDict_SetItem(new, key, Py_None); 8926 Py_DECREF(key); 8927 if (res < 0) 8928 goto err; 8929 } 8930 } 8931 } else { 8932 /* x must be a dict */ 8933 if (!PyDict_CheckExact(x)) { 8934 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8935 "to maketrans it must be a dict"); 8936 goto err; 8937 } 8938 /* copy entries into the new dict, converting string keys to int keys */ 8939 while (PyDict_Next(x, &i, &key, &value)) { 8940 if (PyUnicode_Check(key)) { 8941 /* convert string keys to integer keys */ 8942 PyObject *newkey; 8943 if (PyUnicode_GET_SIZE(key) != 1) { 8944 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8945 "table must be of length 1"); 8946 goto err; 8947 } 8948 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8949 if (!newkey) 8950 goto err; 8951 res = PyDict_SetItem(new, newkey, value); 8952 Py_DECREF(newkey); 8953 if (res < 0) 8954 goto err; 8955 } else if (PyLong_Check(key)) { 8956 /* just keep integer keys */ 8957 if (PyDict_SetItem(new, key, value) < 0) 8958 goto err; 8959 } else { 8960 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8961 "be strings or integers"); 8962 goto err; 8963 } 8964 } 8965 } 8966 return new; 8967 err: 8968 Py_DECREF(new); 8969 return NULL; 8970} 8971 8972PyDoc_STRVAR(translate__doc__, 8973 "S.translate(table) -> str\n\ 8974\n\ 8975Return a copy of the string S, where all characters have been mapped\n\ 8976through the given translation table, 
which must be a mapping of\n\ 8977Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8978Unmapped characters are left untouched. Characters mapped to None\n\ 8979are deleted."); 8980 8981static PyObject* 8982unicode_translate(PyUnicodeObject *self, PyObject *table) 8983{ 8984 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8985} 8986 8987PyDoc_STRVAR(upper__doc__, 8988 "S.upper() -> str\n\ 8989\n\ 8990Return a copy of S converted to uppercase."); 8991 8992static PyObject* 8993unicode_upper(PyUnicodeObject *self) 8994{ 8995 return fixup(self, fixupper); 8996} 8997 8998PyDoc_STRVAR(zfill__doc__, 8999 "S.zfill(width) -> str\n\ 9000\n\ 9001Pad a numeric string S with zeros on the left, to fill a field\n\ 9002of the specified width. The string S is never truncated."); 9003 9004static PyObject * 9005unicode_zfill(PyUnicodeObject *self, PyObject *args) 9006{ 9007 Py_ssize_t fill; 9008 PyUnicodeObject *u; 9009 9010 Py_ssize_t width; 9011 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 9012 return NULL; 9013 9014 if (self->length >= width) { 9015 if (PyUnicode_CheckExact(self)) { 9016 Py_INCREF(self); 9017 return (PyObject*) self; 9018 } 9019 else 9020 return PyUnicode_FromUnicode( 9021 PyUnicode_AS_UNICODE(self), 9022 PyUnicode_GET_SIZE(self) 9023 ); 9024 } 9025 9026 fill = width - self->length; 9027 9028 u = pad(self, fill, 0, '0'); 9029 9030 if (u == NULL) 9031 return NULL; 9032 9033 if (u->str[fill] == '+' || u->str[fill] == '-') { 9034 /* move sign to beginning of string */ 9035 u->str[0] = u->str[fill]; 9036 u->str[fill] = '0'; 9037 } 9038 9039 return (PyObject*) u; 9040} 9041 9042#if 0 9043static PyObject* 9044unicode_freelistsize(PyUnicodeObject *self) 9045{ 9046 return PyLong_FromLong(numfree); 9047} 9048 9049static PyObject * 9050unicode__decimal2ascii(PyObject *self) 9051{ 9052 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), 9053 PyUnicode_GET_SIZE(self)); 9054} 9055#endif 9056 
9057PyDoc_STRVAR(startswith__doc__, 9058 "S.startswith(prefix[, start[, end]]) -> bool\n\ 9059\n\ 9060Return True if S starts with the specified prefix, False otherwise.\n\ 9061With optional start, test S beginning at that position.\n\ 9062With optional end, stop comparing S at that position.\n\ 9063prefix can also be a tuple of strings to try."); 9064 9065static PyObject * 9066unicode_startswith(PyUnicodeObject *self, 9067 PyObject *args) 9068{ 9069 PyObject *subobj; 9070 PyUnicodeObject *substring; 9071 Py_ssize_t start = 0; 9072 Py_ssize_t end = PY_SSIZE_T_MAX; 9073 int result; 9074 9075 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 9076 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 9077 return NULL; 9078 if (PyTuple_Check(subobj)) { 9079 Py_ssize_t i; 9080 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9081 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9082 PyTuple_GET_ITEM(subobj, i)); 9083 if (substring == NULL) 9084 return NULL; 9085 result = tailmatch(self, substring, start, end, -1); 9086 Py_DECREF(substring); 9087 if (result) { 9088 Py_RETURN_TRUE; 9089 } 9090 } 9091 /* nothing matched */ 9092 Py_RETURN_FALSE; 9093 } 9094 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9095 if (substring == NULL) 9096 return NULL; 9097 result = tailmatch(self, substring, start, end, -1); 9098 Py_DECREF(substring); 9099 return PyBool_FromLong(result); 9100} 9101 9102 9103PyDoc_STRVAR(endswith__doc__, 9104 "S.endswith(suffix[, start[, end]]) -> bool\n\ 9105\n\ 9106Return True if S ends with the specified suffix, False otherwise.\n\ 9107With optional start, test S beginning at that position.\n\ 9108With optional end, stop comparing S at that position.\n\ 9109suffix can also be a tuple of strings to try."); 9110 9111static PyObject * 9112unicode_endswith(PyUnicodeObject *self, 9113 PyObject *args) 9114{ 9115 PyObject *subobj; 9116 PyUnicodeObject *substring; 9117 Py_ssize_t start = 0; 9118 Py_ssize_t end = PY_SSIZE_T_MAX; 9119 int 
result; 9120 9121 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 9122 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 9123 return NULL; 9124 if (PyTuple_Check(subobj)) { 9125 Py_ssize_t i; 9126 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9127 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9128 PyTuple_GET_ITEM(subobj, i)); 9129 if (substring == NULL) 9130 return NULL; 9131 result = tailmatch(self, substring, start, end, +1); 9132 Py_DECREF(substring); 9133 if (result) { 9134 Py_RETURN_TRUE; 9135 } 9136 } 9137 Py_RETURN_FALSE; 9138 } 9139 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9140 if (substring == NULL) 9141 return NULL; 9142 9143 result = tailmatch(self, substring, start, end, +1); 9144 Py_DECREF(substring); 9145 return PyBool_FromLong(result); 9146} 9147 9148#include "stringlib/string_format.h" 9149 9150PyDoc_STRVAR(format__doc__, 9151 "S.format(*args, **kwargs) -> str\n\ 9152\n\ 9153Return a formatted version of S, using substitutions from args and kwargs.\n\ 9154The substitutions are identified by braces ('{' and '}')."); 9155 9156PyDoc_STRVAR(format_map__doc__, 9157 "S.format_map(mapping) -> str\n\ 9158\n\ 9159Return a formatted version of S, using substitutions from mapping.\n\ 9160The substitutions are identified by braces ('{' and '}')."); 9161 9162static PyObject * 9163unicode__format__(PyObject* self, PyObject* args) 9164{ 9165 PyObject *format_spec; 9166 9167 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 9168 return NULL; 9169 9170 return _PyUnicode_FormatAdvanced(self, 9171 PyUnicode_AS_UNICODE(format_spec), 9172 PyUnicode_GET_SIZE(format_spec)); 9173} 9174 9175PyDoc_STRVAR(p_format__doc__, 9176 "S.__format__(format_spec) -> str\n\ 9177\n\ 9178Return a formatted version of S as described by format_spec."); 9179 9180static PyObject * 9181unicode__sizeof__(PyUnicodeObject *v) 9182{ 9183 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 9184 sizeof(Py_UNICODE) * (v->length + 1)); 9185} 9186 
9187PyDoc_STRVAR(sizeof__doc__, 9188 "S.__sizeof__() -> size of S in memory, in bytes"); 9189 9190static PyObject * 9191unicode_getnewargs(PyUnicodeObject *v) 9192{ 9193 return Py_BuildValue("(u#)", v->str, v->length); 9194} 9195 9196static PyMethodDef unicode_methods[] = { 9197 9198 /* Order is according to common usage: often used methods should 9199 appear first, since lookup is done sequentially. */ 9200 9201 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 9202 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 9203 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 9204 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 9205 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 9206 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 9207 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 9208 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 9209 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 9210 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 9211 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 9212 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 9213 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 9214 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 9215 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 9216 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 9217 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 9218 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 9219 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 9220 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 9221 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, 
rpartition__doc__}, 9222 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 9223 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 9224 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 9225 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 9226 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 9227 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 9228 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 9229 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 9230 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 9231 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 9232 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 9233 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 9234 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 9235 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 9236 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 9237 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 9238 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 9239 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 9240 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 9241 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 9242 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 9243 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 9244 {"maketrans", (PyCFunction) unicode_maketrans, 9245 METH_VARARGS | METH_STATIC, maketrans__doc__}, 9246 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, 
sizeof__doc__}, 9247#if 0 9248 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 9249#endif 9250 9251#if 0 9252 /* These methods are just used for debugging the implementation. */ 9253 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 9254 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 9255#endif 9256 9257 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 9258 {NULL, NULL} 9259}; 9260 9261static PyObject * 9262unicode_mod(PyObject *v, PyObject *w) 9263{ 9264 if (!PyUnicode_Check(v)) { 9265 Py_INCREF(Py_NotImplemented); 9266 return Py_NotImplemented; 9267 } 9268 return PyUnicode_Format(v, w); 9269} 9270 9271static PyNumberMethods unicode_as_number = { 9272 0, /*nb_add*/ 9273 0, /*nb_subtract*/ 9274 0, /*nb_multiply*/ 9275 unicode_mod, /*nb_remainder*/ 9276}; 9277 9278static PySequenceMethods unicode_as_sequence = { 9279 (lenfunc) unicode_length, /* sq_length */ 9280 PyUnicode_Concat, /* sq_concat */ 9281 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 9282 (ssizeargfunc) unicode_getitem, /* sq_item */ 9283 0, /* sq_slice */ 9284 0, /* sq_ass_item */ 9285 0, /* sq_ass_slice */ 9286 PyUnicode_Contains, /* sq_contains */ 9287}; 9288 9289static PyObject* 9290unicode_subscript(PyUnicodeObject* self, PyObject* item) 9291{ 9292 if (PyIndex_Check(item)) { 9293 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 9294 if (i == -1 && PyErr_Occurred()) 9295 return NULL; 9296 if (i < 0) 9297 i += PyUnicode_GET_SIZE(self); 9298 return unicode_getitem(self, i); 9299 } else if (PySlice_Check(item)) { 9300 Py_ssize_t start, stop, step, slicelength, cur, i; 9301 Py_UNICODE* source_buf; 9302 Py_UNICODE* result_buf; 9303 PyObject* result; 9304 9305 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self), 9306 &start, &stop, &step, &slicelength) < 0) { 9307 return NULL; 9308 } 9309 9310 if (slicelength <= 0) { 9311 return PyUnicode_FromUnicode(NULL, 0); 9312 } else if (start == 0 && step == 1 && 
slicelength == self->length && 9313 PyUnicode_CheckExact(self)) { 9314 Py_INCREF(self); 9315 return (PyObject *)self; 9316 } else if (step == 1) { 9317 return PyUnicode_FromUnicode(self->str + start, slicelength); 9318 } else { 9319 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 9320 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 9321 sizeof(Py_UNICODE)); 9322 9323 if (result_buf == NULL) 9324 return PyErr_NoMemory(); 9325 9326 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 9327 result_buf[i] = source_buf[cur]; 9328 } 9329 9330 result = PyUnicode_FromUnicode(result_buf, slicelength); 9331 PyObject_FREE(result_buf); 9332 return result; 9333 } 9334 } else { 9335 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 9336 return NULL; 9337 } 9338} 9339 9340static PyMappingMethods unicode_as_mapping = { 9341 (lenfunc)unicode_length, /* mp_length */ 9342 (binaryfunc)unicode_subscript, /* mp_subscript */ 9343 (objobjargproc)0, /* mp_ass_subscript */ 9344}; 9345 9346 9347/* Helpers for PyUnicode_Format() */ 9348 9349static PyObject * 9350getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 9351{ 9352 Py_ssize_t argidx = *p_argidx; 9353 if (argidx < arglen) { 9354 (*p_argidx)++; 9355 if (arglen < 0) 9356 return args; 9357 else 9358 return PyTuple_GetItem(args, argidx); 9359 } 9360 PyErr_SetString(PyExc_TypeError, 9361 "not enough arguments for format string"); 9362 return NULL; 9363} 9364 9365/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 9366 9367static PyObject * 9368formatfloat(PyObject *v, int flags, int prec, int type) 9369{ 9370 char *p; 9371 PyObject *result; 9372 double x; 9373 9374 x = PyFloat_AsDouble(v); 9375 if (x == -1.0 && PyErr_Occurred()) 9376 return NULL; 9377 9378 if (prec < 0) 9379 prec = 6; 9380 9381 p = PyOS_double_to_string(x, type, prec, 9382 (flags & F_ALT) ? 
Py_DTSF_ALT : 0, NULL); 9383 if (p == NULL) 9384 return NULL; 9385 result = PyUnicode_FromStringAndSize(p, strlen(p)); 9386 PyMem_Free(p); 9387 return result; 9388} 9389 9390static PyObject* 9391formatlong(PyObject *val, int flags, int prec, int type) 9392{ 9393 char *buf; 9394 int len; 9395 PyObject *str; /* temporary string object. */ 9396 PyObject *result; 9397 9398 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 9399 if (!str) 9400 return NULL; 9401 result = PyUnicode_FromStringAndSize(buf, len); 9402 Py_DECREF(str); 9403 return result; 9404} 9405 9406static int 9407formatchar(Py_UNICODE *buf, 9408 size_t buflen, 9409 PyObject *v) 9410{ 9411 /* presume that the buffer is at least 3 characters long */ 9412 if (PyUnicode_Check(v)) { 9413 if (PyUnicode_GET_SIZE(v) == 1) { 9414 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9415 buf[1] = '\0'; 9416 return 1; 9417 } 9418#ifndef Py_UNICODE_WIDE 9419 if (PyUnicode_GET_SIZE(v) == 2) { 9420 /* Decode a valid surrogate pair */ 9421 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9422 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9423 if (0xD800 <= c0 && c0 <= 0xDBFF && 9424 0xDC00 <= c1 && c1 <= 0xDFFF) { 9425 buf[0] = c0; 9426 buf[1] = c1; 9427 buf[2] = '\0'; 9428 return 2; 9429 } 9430 } 9431#endif 9432 goto onError; 9433 } 9434 else { 9435 /* Integer input truncated to a character */ 9436 long x; 9437 x = PyLong_AsLong(v); 9438 if (x == -1 && PyErr_Occurred()) 9439 goto onError; 9440 9441 if (x < 0 || x > 0x10ffff) { 9442 PyErr_SetString(PyExc_OverflowError, 9443 "%c arg not in range(0x110000)"); 9444 return -1; 9445 } 9446 9447#ifndef Py_UNICODE_WIDE 9448 if (x > 0xffff) { 9449 x -= 0x10000; 9450 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9451 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9452 return 2; 9453 } 9454#endif 9455 buf[0] = (Py_UNICODE) x; 9456 buf[1] = '\0'; 9457 return 1; 9458 } 9459 9460 onError: 9461 PyErr_SetString(PyExc_TypeError, 9462 "%c requires int or char"); 9463 return -1; 9464} 9465 9466/* fmt%(v1,v2,...) 
is roughly equivalent to sprintf(fmt, v1, v2, ...) 9467 FORMATBUFLEN is the length of the buffer in which chars are formatted. 9468*/ 9469#define FORMATBUFLEN (size_t)10 9470 9471PyObject * 9472PyUnicode_Format(PyObject *format, PyObject *args) 9473{ 9474 Py_UNICODE *fmt, *res; 9475 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9476 int args_owned = 0; 9477 PyUnicodeObject *result = NULL; 9478 PyObject *dict = NULL; 9479 PyObject *uformat; 9480 9481 if (format == NULL || args == NULL) { 9482 PyErr_BadInternalCall(); 9483 return NULL; 9484 } 9485 uformat = PyUnicode_FromObject(format); 9486 if (uformat == NULL) 9487 return NULL; 9488 fmt = PyUnicode_AS_UNICODE(uformat); 9489 fmtcnt = PyUnicode_GET_SIZE(uformat); 9490 9491 reslen = rescnt = fmtcnt + 100; 9492 result = _PyUnicode_New(reslen); 9493 if (result == NULL) 9494 goto onError; 9495 res = PyUnicode_AS_UNICODE(result); 9496 9497 if (PyTuple_Check(args)) { 9498 arglen = PyTuple_Size(args); 9499 argidx = 0; 9500 } 9501 else { 9502 arglen = -1; 9503 argidx = -2; 9504 } 9505 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9506 !PyUnicode_Check(args)) 9507 dict = args; 9508 9509 while (--fmtcnt >= 0) { 9510 if (*fmt != '%') { 9511 if (--rescnt < 0) { 9512 rescnt = fmtcnt + 100; 9513 reslen += rescnt; 9514 if (_PyUnicode_Resize(&result, reslen) < 0) 9515 goto onError; 9516 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9517 --rescnt; 9518 } 9519 *res++ = *fmt++; 9520 } 9521 else { 9522 /* Got a format specifier */ 9523 int flags = 0; 9524 Py_ssize_t width = -1; 9525 int prec = -1; 9526 Py_UNICODE c = '\0'; 9527 Py_UNICODE fill; 9528 int isnumok; 9529 PyObject *v = NULL; 9530 PyObject *temp = NULL; 9531 Py_UNICODE *pbuf; 9532 Py_UNICODE sign; 9533 Py_ssize_t len; 9534 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9535 9536 fmt++; 9537 if (*fmt == '(') { 9538 Py_UNICODE *keystart; 9539 Py_ssize_t keylen; 9540 PyObject *key; 9541 int pcount = 1; 9542 9543 if (dict == NULL) { 9544 
PyErr_SetString(PyExc_TypeError, 9545 "format requires a mapping"); 9546 goto onError; 9547 } 9548 ++fmt; 9549 --fmtcnt; 9550 keystart = fmt; 9551 /* Skip over balanced parentheses */ 9552 while (pcount > 0 && --fmtcnt >= 0) { 9553 if (*fmt == ')') 9554 --pcount; 9555 else if (*fmt == '(') 9556 ++pcount; 9557 fmt++; 9558 } 9559 keylen = fmt - keystart - 1; 9560 if (fmtcnt < 0 || pcount > 0) { 9561 PyErr_SetString(PyExc_ValueError, 9562 "incomplete format key"); 9563 goto onError; 9564 } 9565#if 0 9566 /* keys are converted to strings using UTF-8 and 9567 then looked up since Python uses strings to hold 9568 variables names etc. in its namespaces and we 9569 wouldn't want to break common idioms. */ 9570 key = PyUnicode_EncodeUTF8(keystart, 9571 keylen, 9572 NULL); 9573#else 9574 key = PyUnicode_FromUnicode(keystart, keylen); 9575#endif 9576 if (key == NULL) 9577 goto onError; 9578 if (args_owned) { 9579 Py_DECREF(args); 9580 args_owned = 0; 9581 } 9582 args = PyObject_GetItem(dict, key); 9583 Py_DECREF(key); 9584 if (args == NULL) { 9585 goto onError; 9586 } 9587 args_owned = 1; 9588 arglen = -1; 9589 argidx = -2; 9590 } 9591 while (--fmtcnt >= 0) { 9592 switch (c = *fmt++) { 9593 case '-': flags |= F_LJUST; continue; 9594 case '+': flags |= F_SIGN; continue; 9595 case ' ': flags |= F_BLANK; continue; 9596 case '#': flags |= F_ALT; continue; 9597 case '0': flags |= F_ZERO; continue; 9598 } 9599 break; 9600 } 9601 if (c == '*') { 9602 v = getnextarg(args, arglen, &argidx); 9603 if (v == NULL) 9604 goto onError; 9605 if (!PyLong_Check(v)) { 9606 PyErr_SetString(PyExc_TypeError, 9607 "* wants int"); 9608 goto onError; 9609 } 9610 width = PyLong_AsLong(v); 9611 if (width == -1 && PyErr_Occurred()) 9612 goto onError; 9613 if (width < 0) { 9614 flags |= F_LJUST; 9615 width = -width; 9616 } 9617 if (--fmtcnt >= 0) 9618 c = *fmt++; 9619 } 9620 else if (c >= '0' && c <= '9') { 9621 width = c - '0'; 9622 while (--fmtcnt >= 0) { 9623 c = *fmt++; 9624 if (c < '0' || c > '9') 
9625 break; 9626 if ((width*10) / 10 != width) { 9627 PyErr_SetString(PyExc_ValueError, 9628 "width too big"); 9629 goto onError; 9630 } 9631 width = width*10 + (c - '0'); 9632 } 9633 } 9634 if (c == '.') { 9635 prec = 0; 9636 if (--fmtcnt >= 0) 9637 c = *fmt++; 9638 if (c == '*') { 9639 v = getnextarg(args, arglen, &argidx); 9640 if (v == NULL) 9641 goto onError; 9642 if (!PyLong_Check(v)) { 9643 PyErr_SetString(PyExc_TypeError, 9644 "* wants int"); 9645 goto onError; 9646 } 9647 prec = PyLong_AsLong(v); 9648 if (prec == -1 && PyErr_Occurred()) 9649 goto onError; 9650 if (prec < 0) 9651 prec = 0; 9652 if (--fmtcnt >= 0) 9653 c = *fmt++; 9654 } 9655 else if (c >= '0' && c <= '9') { 9656 prec = c - '0'; 9657 while (--fmtcnt >= 0) { 9658 c = *fmt++; 9659 if (c < '0' || c > '9') 9660 break; 9661 if ((prec*10) / 10 != prec) { 9662 PyErr_SetString(PyExc_ValueError, 9663 "prec too big"); 9664 goto onError; 9665 } 9666 prec = prec*10 + (c - '0'); 9667 } 9668 } 9669 } /* prec */ 9670 if (fmtcnt >= 0) { 9671 if (c == 'h' || c == 'l' || c == 'L') { 9672 if (--fmtcnt >= 0) 9673 c = *fmt++; 9674 } 9675 } 9676 if (fmtcnt < 0) { 9677 PyErr_SetString(PyExc_ValueError, 9678 "incomplete format"); 9679 goto onError; 9680 } 9681 if (c != '%') { 9682 v = getnextarg(args, arglen, &argidx); 9683 if (v == NULL) 9684 goto onError; 9685 } 9686 sign = 0; 9687 fill = ' '; 9688 switch (c) { 9689 9690 case '%': 9691 pbuf = formatbuf; 9692 /* presume that buffer length is at least 1 */ 9693 pbuf[0] = '%'; 9694 len = 1; 9695 break; 9696 9697 case 's': 9698 case 'r': 9699 case 'a': 9700 if (PyUnicode_CheckExact(v) && c == 's') { 9701 temp = v; 9702 Py_INCREF(temp); 9703 } 9704 else { 9705 if (c == 's') 9706 temp = PyObject_Str(v); 9707 else if (c == 'r') 9708 temp = PyObject_Repr(v); 9709 else 9710 temp = PyObject_ASCII(v); 9711 if (temp == NULL) 9712 goto onError; 9713 if (PyUnicode_Check(temp)) 9714 /* nothing to do */; 9715 else { 9716 Py_DECREF(temp); 9717 PyErr_SetString(PyExc_TypeError, 
9718 "%s argument has non-string str()"); 9719 goto onError; 9720 } 9721 } 9722 pbuf = PyUnicode_AS_UNICODE(temp); 9723 len = PyUnicode_GET_SIZE(temp); 9724 if (prec >= 0 && len > prec) 9725 len = prec; 9726 break; 9727 9728 case 'i': 9729 case 'd': 9730 case 'u': 9731 case 'o': 9732 case 'x': 9733 case 'X': 9734 if (c == 'i') 9735 c = 'd'; 9736 isnumok = 0; 9737 if (PyNumber_Check(v)) { 9738 PyObject *iobj=NULL; 9739 9740 if (PyLong_Check(v)) { 9741 iobj = v; 9742 Py_INCREF(iobj); 9743 } 9744 else { 9745 iobj = PyNumber_Long(v); 9746 } 9747 if (iobj!=NULL) { 9748 if (PyLong_Check(iobj)) { 9749 isnumok = 1; 9750 temp = formatlong(iobj, flags, prec, c); 9751 Py_DECREF(iobj); 9752 if (!temp) 9753 goto onError; 9754 pbuf = PyUnicode_AS_UNICODE(temp); 9755 len = PyUnicode_GET_SIZE(temp); 9756 sign = 1; 9757 } 9758 else { 9759 Py_DECREF(iobj); 9760 } 9761 } 9762 } 9763 if (!isnumok) { 9764 PyErr_Format(PyExc_TypeError, 9765 "%%%c format: a number is required, " 9766 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9767 goto onError; 9768 } 9769 if (flags & F_ZERO) 9770 fill = '0'; 9771 break; 9772 9773 case 'e': 9774 case 'E': 9775 case 'f': 9776 case 'F': 9777 case 'g': 9778 case 'G': 9779 temp = formatfloat(v, flags, prec, c); 9780 if (!temp) 9781 goto onError; 9782 pbuf = PyUnicode_AS_UNICODE(temp); 9783 len = PyUnicode_GET_SIZE(temp); 9784 sign = 1; 9785 if (flags & F_ZERO) 9786 fill = '0'; 9787 break; 9788 9789 case 'c': 9790 pbuf = formatbuf; 9791 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9792 if (len < 0) 9793 goto onError; 9794 break; 9795 9796 default: 9797 PyErr_Format(PyExc_ValueError, 9798 "unsupported format character '%c' (0x%x) " 9799 "at index %zd", 9800 (31<=c && c<=126) ? 
(char)c : '?', 9801 (int)c, 9802 (Py_ssize_t)(fmt - 1 - 9803 PyUnicode_AS_UNICODE(uformat))); 9804 goto onError; 9805 } 9806 if (sign) { 9807 if (*pbuf == '-' || *pbuf == '+') { 9808 sign = *pbuf++; 9809 len--; 9810 } 9811 else if (flags & F_SIGN) 9812 sign = '+'; 9813 else if (flags & F_BLANK) 9814 sign = ' '; 9815 else 9816 sign = 0; 9817 } 9818 if (width < len) 9819 width = len; 9820 if (rescnt - (sign != 0) < width) { 9821 reslen -= rescnt; 9822 rescnt = width + fmtcnt + 100; 9823 reslen += rescnt; 9824 if (reslen < 0) { 9825 Py_XDECREF(temp); 9826 PyErr_NoMemory(); 9827 goto onError; 9828 } 9829 if (_PyUnicode_Resize(&result, reslen) < 0) { 9830 Py_XDECREF(temp); 9831 goto onError; 9832 } 9833 res = PyUnicode_AS_UNICODE(result) 9834 + reslen - rescnt; 9835 } 9836 if (sign) { 9837 if (fill != ' ') 9838 *res++ = sign; 9839 rescnt--; 9840 if (width > len) 9841 width--; 9842 } 9843 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9844 assert(pbuf[0] == '0'); 9845 assert(pbuf[1] == c); 9846 if (fill != ' ') { 9847 *res++ = *pbuf++; 9848 *res++ = *pbuf++; 9849 } 9850 rescnt -= 2; 9851 width -= 2; 9852 if (width < 0) 9853 width = 0; 9854 len -= 2; 9855 } 9856 if (width > len && !(flags & F_LJUST)) { 9857 do { 9858 --rescnt; 9859 *res++ = fill; 9860 } while (--width > len); 9861 } 9862 if (fill == ' ') { 9863 if (sign) 9864 *res++ = sign; 9865 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9866 assert(pbuf[0] == '0'); 9867 assert(pbuf[1] == c); 9868 *res++ = *pbuf++; 9869 *res++ = *pbuf++; 9870 } 9871 } 9872 Py_UNICODE_COPY(res, pbuf, len); 9873 res += len; 9874 rescnt -= len; 9875 while (--width >= len) { 9876 --rescnt; 9877 *res++ = ' '; 9878 } 9879 if (dict && (argidx < arglen) && c != '%') { 9880 PyErr_SetString(PyExc_TypeError, 9881 "not all arguments converted during string formatting"); 9882 Py_XDECREF(temp); 9883 goto onError; 9884 } 9885 Py_XDECREF(temp); 9886 } /* '%' */ 9887 } /* until end */ 9888 if (argidx < arglen && !dict) { 
9889 PyErr_SetString(PyExc_TypeError, 9890 "not all arguments converted during string formatting"); 9891 goto onError; 9892 } 9893 9894 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9895 goto onError; 9896 if (args_owned) { 9897 Py_DECREF(args); 9898 } 9899 Py_DECREF(uformat); 9900 return (PyObject *)result; 9901 9902 onError: 9903 Py_XDECREF(result); 9904 Py_DECREF(uformat); 9905 if (args_owned) { 9906 Py_DECREF(args); 9907 } 9908 return NULL; 9909} 9910 9911static PyObject * 9912unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9913 9914static PyObject * 9915unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9916{ 9917 PyObject *x = NULL; 9918 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9919 char *encoding = NULL; 9920 char *errors = NULL; 9921 9922 if (type != &PyUnicode_Type) 9923 return unicode_subtype_new(type, args, kwds); 9924 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9925 kwlist, &x, &encoding, &errors)) 9926 return NULL; 9927 if (x == NULL) 9928 return (PyObject *)_PyUnicode_New(0); 9929 if (encoding == NULL && errors == NULL) 9930 return PyObject_Str(x); 9931 else 9932 return PyUnicode_FromEncodedObject(x, encoding, errors); 9933} 9934 9935static PyObject * 9936unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9937{ 9938 PyUnicodeObject *tmp, *pnew; 9939 Py_ssize_t n; 9940 9941 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9942 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9943 if (tmp == NULL) 9944 return NULL; 9945 assert(PyUnicode_Check(tmp)); 9946 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9947 if (pnew == NULL) { 9948 Py_DECREF(tmp); 9949 return NULL; 9950 } 9951 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9952 if (pnew->str == NULL) { 9953 _Py_ForgetReference((PyObject *)pnew); 9954 PyObject_Del(pnew); 9955 Py_DECREF(tmp); 9956 return PyErr_NoMemory(); 9957 } 9958 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9959 pnew->length = n; 9960 pnew->hash = tmp->hash; 9961 Py_DECREF(tmp); 9962 return (PyObject *)pnew; 9963} 9964 9965PyDoc_STRVAR(unicode_doc, 9966 "str(string[, encoding[, errors]]) -> str\n\ 9967\n\ 9968Create a new string object from the given encoded string.\n\ 9969encoding defaults to the current default string encoding.\n\ 9970errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9971 9972static PyObject *unicode_iter(PyObject *seq); 9973 9974PyTypeObject PyUnicode_Type = { 9975 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9976 "str", /* tp_name */ 9977 sizeof(PyUnicodeObject), /* tp_size */ 9978 0, /* tp_itemsize */ 9979 /* Slots */ 9980 (destructor)unicode_dealloc, /* tp_dealloc */ 9981 0, /* tp_print */ 9982 0, /* tp_getattr */ 9983 0, /* tp_setattr */ 9984 0, /* tp_reserved */ 9985 unicode_repr, /* tp_repr */ 9986 &unicode_as_number, /* tp_as_number */ 9987 &unicode_as_sequence, /* tp_as_sequence */ 9988 &unicode_as_mapping, /* tp_as_mapping */ 9989 (hashfunc) unicode_hash, /* tp_hash*/ 9990 0, /* tp_call*/ 9991 (reprfunc) unicode_str, /* tp_str */ 9992 PyObject_GenericGetAttr, /* tp_getattro */ 9993 0, /* tp_setattro */ 9994 0, /* tp_as_buffer */ 9995 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9996 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9997 unicode_doc, /* tp_doc */ 9998 0, /* tp_traverse */ 9999 0, /* tp_clear */ 10000 PyUnicode_RichCompare, /* tp_richcompare */ 10001 0, /* tp_weaklistoffset */ 10002 unicode_iter, /* tp_iter */ 10003 0, /* tp_iternext */ 10004 unicode_methods, /* tp_methods */ 10005 0, /* tp_members */ 10006 0, /* tp_getset */ 10007 &PyBaseObject_Type, /* tp_base */ 10008 0, /* tp_dict */ 10009 0, /* tp_descr_get */ 10010 0, /* tp_descr_set */ 10011 0, /* tp_dictoffset */ 10012 0, /* tp_init */ 10013 0, /* tp_alloc */ 10014 unicode_new, /* tp_new */ 10015 PyObject_Del, /* tp_free */ 10016}; 10017 10018/* Initialize the Unicode implementation */ 10019 10020void 
_PyUnicode_Init(void) 10021{ 10022 int i; 10023 10024 /* XXX - move this array to unicodectype.c ? */ 10025 Py_UNICODE linebreak[] = { 10026 0x000A, /* LINE FEED */ 10027 0x000D, /* CARRIAGE RETURN */ 10028 0x001C, /* FILE SEPARATOR */ 10029 0x001D, /* GROUP SEPARATOR */ 10030 0x001E, /* RECORD SEPARATOR */ 10031 0x0085, /* NEXT LINE */ 10032 0x2028, /* LINE SEPARATOR */ 10033 0x2029, /* PARAGRAPH SEPARATOR */ 10034 }; 10035 10036 /* Init the implementation */ 10037 free_list = NULL; 10038 numfree = 0; 10039 unicode_empty = _PyUnicode_New(0); 10040 if (!unicode_empty) 10041 return; 10042 10043 for (i = 0; i < 256; i++) 10044 unicode_latin1[i] = NULL; 10045 if (PyType_Ready(&PyUnicode_Type) < 0) 10046 Py_FatalError("Can't initialize 'unicode'"); 10047 10048 /* initialize the linebreak bloom filter */ 10049 bloom_linebreak = make_bloom_mask( 10050 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 10051 ); 10052 10053 PyType_Ready(&EncodingMapType); 10054} 10055 10056/* Finalize the Unicode implementation */ 10057 10058int 10059PyUnicode_ClearFreeList(void) 10060{ 10061 int freelist_size = numfree; 10062 PyUnicodeObject *u; 10063 10064 for (u = free_list; u != NULL;) { 10065 PyUnicodeObject *v = u; 10066 u = *(PyUnicodeObject **)u; 10067 if (v->str) 10068 PyObject_DEL(v->str); 10069 Py_XDECREF(v->defenc); 10070 PyObject_Del(v); 10071 numfree--; 10072 } 10073 free_list = NULL; 10074 assert(numfree == 0); 10075 return freelist_size; 10076} 10077 10078void 10079_PyUnicode_Fini(void) 10080{ 10081 int i; 10082 10083 Py_XDECREF(unicode_empty); 10084 unicode_empty = NULL; 10085 10086 for (i = 0; i < 256; i++) { 10087 if (unicode_latin1[i]) { 10088 Py_DECREF(unicode_latin1[i]); 10089 unicode_latin1[i] = NULL; 10090 } 10091 } 10092 (void)PyUnicode_ClearFreeList(); 10093} 10094 10095void 10096PyUnicode_InternInPlace(PyObject **p) 10097{ 10098 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 10099 PyObject *t; 10100 if (s == NULL || !PyUnicode_Check(s)) 10101 
Py_FatalError( 10102 "PyUnicode_InternInPlace: unicode strings only please!"); 10103 /* If it's a subclass, we don't really know what putting 10104 it in the interned dict might do. */ 10105 if (!PyUnicode_CheckExact(s)) 10106 return; 10107 if (PyUnicode_CHECK_INTERNED(s)) 10108 return; 10109 if (interned == NULL) { 10110 interned = PyDict_New(); 10111 if (interned == NULL) { 10112 PyErr_Clear(); /* Don't leave an exception */ 10113 return; 10114 } 10115 } 10116 /* It might be that the GetItem call fails even 10117 though the key is present in the dictionary, 10118 namely when this happens during a stack overflow. */ 10119 Py_ALLOW_RECURSION 10120 t = PyDict_GetItem(interned, (PyObject *)s); 10121 Py_END_ALLOW_RECURSION 10122 10123 if (t) { 10124 Py_INCREF(t); 10125 Py_DECREF(*p); 10126 *p = t; 10127 return; 10128 } 10129 10130 PyThreadState_GET()->recursion_critical = 1; 10131 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 10132 PyErr_Clear(); 10133 PyThreadState_GET()->recursion_critical = 0; 10134 return; 10135 } 10136 PyThreadState_GET()->recursion_critical = 0; 10137 /* The two references in interned are not counted by refcnt. 
10138 The deallocator will take care of this */ 10139 Py_REFCNT(s) -= 2; 10140 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 10141} 10142 10143void 10144PyUnicode_InternImmortal(PyObject **p) 10145{ 10146 PyUnicode_InternInPlace(p); 10147 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 10148 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 10149 Py_INCREF(*p); 10150 } 10151} 10152 10153PyObject * 10154PyUnicode_InternFromString(const char *cp) 10155{ 10156 PyObject *s = PyUnicode_FromString(cp); 10157 if (s == NULL) 10158 return NULL; 10159 PyUnicode_InternInPlace(&s); 10160 return s; 10161} 10162 10163void 10164_Py_ReleaseInternedUnicodeStrings(void) 10165{ 10166 PyObject *keys; 10167 PyUnicodeObject *s; 10168 Py_ssize_t i, n; 10169 Py_ssize_t immortal_size = 0, mortal_size = 0; 10170 10171 if (interned == NULL || !PyDict_Check(interned)) 10172 return; 10173 keys = PyDict_Keys(interned); 10174 if (keys == NULL || !PyList_Check(keys)) { 10175 PyErr_Clear(); 10176 return; 10177 } 10178 10179 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 10180 detector, interned unicode strings are not forcibly deallocated; 10181 rather, we give them their stolen references back, and then clear 10182 and DECREF the interned dict. 
*/ 10183 10184 n = PyList_GET_SIZE(keys); 10185 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 10186 n); 10187 for (i = 0; i < n; i++) { 10188 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 10189 switch (s->state) { 10190 case SSTATE_NOT_INTERNED: 10191 /* XXX Shouldn't happen */ 10192 break; 10193 case SSTATE_INTERNED_IMMORTAL: 10194 Py_REFCNT(s) += 1; 10195 immortal_size += s->length; 10196 break; 10197 case SSTATE_INTERNED_MORTAL: 10198 Py_REFCNT(s) += 2; 10199 mortal_size += s->length; 10200 break; 10201 default: 10202 Py_FatalError("Inconsistent interned string state."); 10203 } 10204 s->state = SSTATE_NOT_INTERNED; 10205 } 10206 fprintf(stderr, "total size of all interned strings: " 10207 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 10208 "mortal/immortal\n", mortal_size, immortal_size); 10209 Py_DECREF(keys); 10210 PyDict_Clear(interned); 10211 Py_DECREF(interned); 10212 interned = NULL; 10213} 10214 10215 10216/********************* Unicode Iterator **************************/ 10217 10218typedef struct { 10219 PyObject_HEAD 10220 Py_ssize_t it_index; 10221 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 10222} unicodeiterobject; 10223 10224static void 10225unicodeiter_dealloc(unicodeiterobject *it) 10226{ 10227 _PyObject_GC_UNTRACK(it); 10228 Py_XDECREF(it->it_seq); 10229 PyObject_GC_Del(it); 10230} 10231 10232static int 10233unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 10234{ 10235 Py_VISIT(it->it_seq); 10236 return 0; 10237} 10238 10239static PyObject * 10240unicodeiter_next(unicodeiterobject *it) 10241{ 10242 PyUnicodeObject *seq; 10243 PyObject *item; 10244 10245 assert(it != NULL); 10246 seq = it->it_seq; 10247 if (seq == NULL) 10248 return NULL; 10249 assert(PyUnicode_Check(seq)); 10250 10251 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 10252 item = PyUnicode_FromUnicode( 10253 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 10254 if (item != NULL) 10255 ++it->it_index; 10256 
return item; 10257 } 10258 10259 Py_DECREF(seq); 10260 it->it_seq = NULL; 10261 return NULL; 10262} 10263 10264static PyObject * 10265unicodeiter_len(unicodeiterobject *it) 10266{ 10267 Py_ssize_t len = 0; 10268 if (it->it_seq) 10269 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 10270 return PyLong_FromSsize_t(len); 10271} 10272 10273PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 10274 10275static PyMethodDef unicodeiter_methods[] = { 10276 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 10277 length_hint_doc}, 10278 {NULL, NULL} /* sentinel */ 10279}; 10280 10281PyTypeObject PyUnicodeIter_Type = { 10282 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10283 "str_iterator", /* tp_name */ 10284 sizeof(unicodeiterobject), /* tp_basicsize */ 10285 0, /* tp_itemsize */ 10286 /* methods */ 10287 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 10288 0, /* tp_print */ 10289 0, /* tp_getattr */ 10290 0, /* tp_setattr */ 10291 0, /* tp_reserved */ 10292 0, /* tp_repr */ 10293 0, /* tp_as_number */ 10294 0, /* tp_as_sequence */ 10295 0, /* tp_as_mapping */ 10296 0, /* tp_hash */ 10297 0, /* tp_call */ 10298 0, /* tp_str */ 10299 PyObject_GenericGetAttr, /* tp_getattro */ 10300 0, /* tp_setattro */ 10301 0, /* tp_as_buffer */ 10302 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 10303 0, /* tp_doc */ 10304 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 10305 0, /* tp_clear */ 10306 0, /* tp_richcompare */ 10307 0, /* tp_weaklistoffset */ 10308 PyObject_SelfIter, /* tp_iter */ 10309 (iternextfunc)unicodeiter_next, /* tp_iternext */ 10310 unicodeiter_methods, /* tp_methods */ 10311 0, 10312}; 10313 10314static PyObject * 10315unicode_iter(PyObject *seq) 10316{ 10317 unicodeiterobject *it; 10318 10319 if (!PyUnicode_Check(seq)) { 10320 PyErr_BadInternalCall(); 10321 return NULL; 10322 } 10323 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 10324 if (it == NULL) 10325 return NULL; 10326 
it->it_index = 0; 10327 Py_INCREF(seq); 10328 it->it_seq = (PyUnicodeObject *)seq; 10329 _PyObject_GC_TRACK(it); 10330 return (PyObject *)it; 10331} 10332 10333size_t 10334Py_UNICODE_strlen(const Py_UNICODE *u) 10335{ 10336 int res = 0; 10337 while(*u++) 10338 res++; 10339 return res; 10340} 10341 10342Py_UNICODE* 10343Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 10344{ 10345 Py_UNICODE *u = s1; 10346 while ((*u++ = *s2++)); 10347 return s1; 10348} 10349 10350Py_UNICODE* 10351Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10352{ 10353 Py_UNICODE *u = s1; 10354 while ((*u++ = *s2++)) 10355 if (n-- == 0) 10356 break; 10357 return s1; 10358} 10359 10360Py_UNICODE* 10361Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 10362{ 10363 Py_UNICODE *u1 = s1; 10364 u1 += Py_UNICODE_strlen(u1); 10365 Py_UNICODE_strcpy(u1, s2); 10366 return s1; 10367} 10368 10369int 10370Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 10371{ 10372 while (*s1 && *s2 && *s1 == *s2) 10373 s1++, s2++; 10374 if (*s1 && *s2) 10375 return (*s1 < *s2) ? -1 : +1; 10376 if (*s1) 10377 return 1; 10378 if (*s2) 10379 return -1; 10380 return 0; 10381} 10382 10383int 10384Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10385{ 10386 register Py_UNICODE u1, u2; 10387 for (; n != 0; n--) { 10388 u1 = *s1; 10389 u2 = *s2; 10390 if (u1 != u2) 10391 return (u1 < u2) ? 
-1 : +1; 10392 if (u1 == '\0') 10393 return 0; 10394 s1++; 10395 s2++; 10396 } 10397 return 0; 10398} 10399 10400Py_UNICODE* 10401Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 10402{ 10403 const Py_UNICODE *p; 10404 for (p = s; *p; p++) 10405 if (*p == c) 10406 return (Py_UNICODE*)p; 10407 return NULL; 10408} 10409 10410Py_UNICODE* 10411Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10412{ 10413 const Py_UNICODE *p; 10414 p = s + Py_UNICODE_strlen(s); 10415 while (p != s) { 10416 p--; 10417 if (*p == c) 10418 return (Py_UNICODE*)p; 10419 } 10420 return NULL; 10421} 10422 10423Py_UNICODE* 10424PyUnicode_AsUnicodeCopy(PyObject *object) 10425{ 10426 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10427 Py_UNICODE *copy; 10428 Py_ssize_t size; 10429 10430 /* Ensure we won't overflow the size. */ 10431 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10432 PyErr_NoMemory(); 10433 return NULL; 10434 } 10435 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10436 size *= sizeof(Py_UNICODE); 10437 copy = PyMem_Malloc(size); 10438 if (copy == NULL) { 10439 PyErr_NoMemory(); 10440 return NULL; 10441 } 10442 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10443 return copy; 10444} 10445 10446/* A _string module, to export formatter_parser and formatter_field_name_split 10447 to the string.Formatter class implemented in Python. 
*/ 10448 10449static PyMethodDef _string_methods[] = { 10450 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 10451 METH_O, PyDoc_STR("split the argument as a field name")}, 10452 {"formatter_parser", (PyCFunction) formatter_parser, 10453 METH_O, PyDoc_STR("parse the argument as a format string")}, 10454 {NULL, NULL} 10455}; 10456 10457static struct PyModuleDef _string_module = { 10458 PyModuleDef_HEAD_INIT, 10459 "_string", 10460 PyDoc_STR("string helper module"), 10461 0, 10462 _string_methods, 10463 NULL, 10464 NULL, 10465 NULL, 10466 NULL 10467}; 10468 10469PyMODINIT_FUNC 10470PyInit__string(void) 10471{ 10472 return PyModule_Create(&_string_module); 10473} 10474 10475 10476#ifdef __cplusplus 10477} 10478#endif 10479