unicodeobject.c revision c2504931ee6bb19b4d38d0d654b02a6fbc797ebd
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44 45#include "unicodeobject.h" 46#include "ucnhash.h" 47 48#include "formatter_unicode.h" 49 50#ifdef MS_WINDOWS 51#include <windows.h> 52#endif 53 54/* Limit for the Unicode object free list */ 55 56#define MAX_UNICODE_FREELIST_SIZE 1024 57 58/* Limit for the Unicode object free list stay alive optimization. 59 60 The implementation will keep allocated Unicode memory intact for 61 all objects on the free list having a size less than this 62 limit. This reduces malloc() overhead for small Unicode objects. 63 64 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 65 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 66 malloc()-overhead) bytes of unused garbage. 67 68 Setting the limit to 0 effectively turns the feature off. 69 70 Note: This is an experimental feature ! If you get core dumps when 71 using Unicode objects, turn this feature off. 72 73*/ 74 75#define KEEPALIVE_SIZE_LIMIT 9 76 77/* Endianness switches; defaults to little endian */ 78 79#ifdef WORDS_BIGENDIAN 80# define BYTEORDER_IS_BIG_ENDIAN 81#else 82# define BYTEORDER_IS_LITTLE_ENDIAN 83#endif 84 85/* --- Globals ------------------------------------------------------------ 86 87 The globals are initialized by the _PyUnicode_Init() API and should 88 not be used before calling that API. 89 90*/ 91 92 93#ifdef __cplusplus 94extern "C" { 95#endif 96 97/* This dictionary holds all interned unicode strings. Note that references 98 to strings in this dictionary are *not* counted in the string's ob_refcnt. 99 When the interned string reaches a refcnt of 0 the string deallocation 100 function will delete the reference from this dictionary. 101 102 Another way to look at this is that to say that the actual reference 103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0) 104*/ 105static PyObject *interned; 106 107/* Free list for Unicode objects */ 108static PyUnicodeObject *unicode_freelist; 109static int unicode_freelist_size; 110 111/* The empty Unicode object is shared to improve performance. */ 112static PyUnicodeObject *unicode_empty; 113 114/* Single character Unicode strings in the Latin-1 range are being 115 shared as well. */ 116static PyUnicodeObject *unicode_latin1[256]; 117 118/* Default encoding to use and assume when NULL is passed as encoding 119 parameter; it is fixed to "utf-8". Always use the 120 PyUnicode_GetDefaultEncoding() API to access this global. */ 121static const char unicode_default_encoding[] = "utf-8"; 122 123Py_UNICODE 124PyUnicode_GetMax(void) 125{ 126#ifdef Py_UNICODE_WIDE 127 return 0x10FFFF; 128#else 129 /* This is actually an illegal character, so it should 130 not be passed to unichr. */ 131 return 0xFFFF; 132#endif 133} 134 135/* --- Bloom Filters ----------------------------------------------------- */ 136 137/* stuff to implement simple "bloom filters" for Unicode characters. 138 to keep things simple, we use a single bitmask, using the least 5 139 bits from each unicode characters as the bit index. */ 140 141/* the linebreak mask is set up by Unicode_Init below */ 142 143#define BLOOM_MASK unsigned long 144 145static BLOOM_MASK bloom_linebreak; 146 147#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 148 149#define BLOOM_LINEBREAK(ch)\ 150 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) 151 152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 153{ 154 /* calculate simple bloom-style bitmask for a given unicode string */ 155 156 long mask; 157 Py_ssize_t i; 158 159 mask = 0; 160 for (i = 0; i < len; i++) 161 mask |= (1 << (ptr[i] & 0x1F)); 162 163 return mask; 164} 165 166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 167{ 168 Py_ssize_t i; 169 170 for (i = 0; i < setlen; i++) 171 if (set[i] == chr) 172 return 1; 173 174 return 0; 175} 176 177#define BLOOM_MEMBER(mask, chr, set, setlen)\ 178 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 179 180/* --- Unicode Object ----------------------------------------------------- */ 181 182static 183int unicode_resize(register PyUnicodeObject *unicode, 184 Py_ssize_t length) 185{ 186 void *oldstr; 187 188 /* Shortcut if there's nothing much to do. */ 189 if (unicode->length == length) 190 goto reset; 191 192 /* Resizing shared object (unicode_empty or single character 193 objects) in-place is not allowed. Use PyUnicode_Resize() 194 instead ! */ 195 196 if (unicode == unicode_empty || 197 (unicode->length == 1 && 198 unicode->str[0] < 256U && 199 unicode_latin1[unicode->str[0]] == unicode)) { 200 PyErr_SetString(PyExc_SystemError, 201 "can't resize shared unicode objects"); 202 return -1; 203 } 204 205 /* We allocate one more byte to make sure the string is Ux0000 terminated. 206 The overallocation is also used by fastsearch, which assumes that it's 207 safe to look at str[length] (without making any assumptions about what 208 it contains). */ 209 210 oldstr = unicode->str; 211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 212 if (!unicode->str) { 213 unicode->str = (Py_UNICODE *)oldstr; 214 PyErr_NoMemory(); 215 return -1; 216 } 217 unicode->str[length] = 0; 218 unicode->length = length; 219 220 reset: 221 /* Reset the object caches */ 222 if (unicode->defenc) { 223 Py_DECREF(unicode->defenc); 224 unicode->defenc = NULL; 225 } 226 unicode->hash = -1; 227 228 return 0; 229} 230 231/* We allocate one more byte to make sure the string is 232 Ux0000 terminated; some code (e.g. new_identifier) 233 relies on that. 234 235 XXX This allocator could further be enhanced by assuring that the 236 free list never reduces its size below 1. 237 238*/ 239 240static 241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 242{ 243 register PyUnicodeObject *unicode; 244 245 /* Optimization for empty strings */ 246 if (length == 0 && unicode_empty != NULL) { 247 Py_INCREF(unicode_empty); 248 return unicode_empty; 249 } 250 251 /* Unicode freelist & memory allocation */ 252 if (unicode_freelist) { 253 unicode = unicode_freelist; 254 unicode_freelist = *(PyUnicodeObject **)unicode; 255 unicode_freelist_size--; 256 if (unicode->str) { 257 /* Keep-Alive optimization: we only upsize the buffer, 258 never downsize it. */ 259 if ((unicode->length < length) && 260 unicode_resize(unicode, length) < 0) { 261 PyMem_DEL(unicode->str); 262 goto onError; 263 } 264 } 265 else { 266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 267 } 268 PyObject_INIT(unicode, &PyUnicode_Type); 269 } 270 else { 271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 272 if (unicode == NULL) 273 return NULL; 274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 275 } 276 277 if (!unicode->str) { 278 PyErr_NoMemory(); 279 goto onError; 280 } 281 /* Initialize the first element to guard against cases where 282 * the caller fails before initializing str -- unicode_resize() 283 * reads str[0], and the Keep-Alive optimization can keep memory 284 * allocated for str alive across a call to unicode_dealloc(unicode). 285 * We don't want unicode_resize to read uninitialized memory in 286 * that case. 287 */ 288 unicode->str[0] = 0; 289 unicode->str[length] = 0; 290 unicode->length = length; 291 unicode->hash = -1; 292 unicode->state = 0; 293 unicode->defenc = NULL; 294 return unicode; 295 296 onError: 297 _Py_ForgetReference((PyObject *)unicode); 298 PyObject_Del(unicode); 299 return NULL; 300} 301 302static 303void unicode_dealloc(register PyUnicodeObject *unicode) 304{ 305 switch (PyUnicode_CHECK_INTERNED(unicode)) { 306 case SSTATE_NOT_INTERNED: 307 break; 308 309 case SSTATE_INTERNED_MORTAL: 310 /* revive dead object temporarily for DelItem */ 311 Py_Refcnt(unicode) = 3; 312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 313 Py_FatalError( 314 "deletion of interned unicode string failed"); 315 break; 316 317 case SSTATE_INTERNED_IMMORTAL: 318 Py_FatalError("Immortal interned unicode string died."); 319 320 default: 321 Py_FatalError("Inconsistent interned unicode string state."); 322 } 323 324 if (PyUnicode_CheckExact(unicode) && 325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 326 /* Keep-Alive optimization */ 327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 328 PyMem_DEL(unicode->str); 329 unicode->str = NULL; 330 unicode->length = 0; 331 } 332 if (unicode->defenc) { 333 Py_DECREF(unicode->defenc); 334 unicode->defenc = NULL; 335 } 336 /* Add to free list */ 337 *(PyUnicodeObject **)unicode = unicode_freelist; 338 unicode_freelist = unicode; 339 unicode_freelist_size++; 340 } 341 else { 342 PyMem_DEL(unicode->str); 343 Py_XDECREF(unicode->defenc); 344 Py_Type(unicode)->tp_free((PyObject *)unicode); 345 } 346} 347 348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 349{ 350 register PyUnicodeObject *v; 351 352 /* Argument checks */ 353 if (unicode == NULL) { 354 PyErr_BadInternalCall(); 355 return -1; 356 } 357 v = (PyUnicodeObject *)*unicode; 358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) { 359 PyErr_BadInternalCall(); 360 return -1; 361 } 362 363 /* Resizing unicode_empty and single character objects is not 364 possible since these are being shared. We simply return a fresh 365 copy with the same Unicode content. */ 366 if (v->length != length && 367 (v == unicode_empty || v->length == 1)) { 368 PyUnicodeObject *w = _PyUnicode_New(length); 369 if (w == NULL) 370 return -1; 371 Py_UNICODE_COPY(w->str, v->str, 372 length < v->length ? length : v->length); 373 Py_DECREF(*unicode); 374 *unicode = (PyObject *)w; 375 return 0; 376 } 377 378 /* Note that we don't have to modify *unicode for unshared Unicode 379 objects, since we can modify them in-place. */ 380 return unicode_resize(v, length); 381} 382 383/* Internal API for use in unicodeobject.c only ! */ 384#define _PyUnicode_Resize(unicodevar, length) \ 385 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 386 387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 388 Py_ssize_t size) 389{ 390 PyUnicodeObject *unicode; 391 392 /* If the Unicode data is known at construction time, we can apply 393 some optimizations which share commonly used objects. */ 394 if (u != NULL) { 395 396 /* Optimization for empty strings */ 397 if (size == 0 && unicode_empty != NULL) { 398 Py_INCREF(unicode_empty); 399 return (PyObject *)unicode_empty; 400 } 401 402 /* Single character Unicode objects in the Latin-1 range are 403 shared when using this constructor */ 404 if (size == 1 && *u < 256) { 405 unicode = unicode_latin1[*u]; 406 if (!unicode) { 407 unicode = _PyUnicode_New(1); 408 if (!unicode) 409 return NULL; 410 unicode->str[0] = *u; 411 unicode_latin1[*u] = unicode; 412 } 413 Py_INCREF(unicode); 414 return (PyObject *)unicode; 415 } 416 } 417 418 unicode = _PyUnicode_New(size); 419 if (!unicode) 420 return NULL; 421 422 /* Copy the Unicode data into the new object */ 423 if (u != NULL) 424 Py_UNICODE_COPY(unicode->str, u, size); 425 426 return (PyObject *)unicode; 427} 428 429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 430{ 431 PyUnicodeObject *unicode; 432 /* If the Unicode data is known at construction time, we can apply 433 some optimizations which share commonly used objects. 434 Also, this means the input must be UTF-8, so fall back to the 435 UTF-8 decoder at the end. */ 436 if (u != NULL) { 437 438 /* Optimization for empty strings */ 439 if (size == 0 && unicode_empty != NULL) { 440 Py_INCREF(unicode_empty); 441 return (PyObject *)unicode_empty; 442 } 443 444 /* Single characters are shared when using this constructor. 445 Restrict to ASCII, since the input must be UTF-8. */ 446 if (size == 1 && Py_CHARMASK(*u) < 128) { 447 unicode = unicode_latin1[Py_CHARMASK(*u)]; 448 if (!unicode) { 449 unicode = _PyUnicode_New(1); 450 if (!unicode) 451 return NULL; 452 unicode->str[0] = Py_CHARMASK(*u); 453 unicode_latin1[Py_CHARMASK(*u)] = unicode; 454 } 455 Py_INCREF(unicode); 456 return (PyObject *)unicode; 457 } 458 459 return PyUnicode_DecodeUTF8(u, size, NULL); 460 } 461 462 unicode = _PyUnicode_New(size); 463 if (!unicode) 464 return NULL; 465 466 return (PyObject *)unicode; 467} 468 469PyObject *PyUnicode_FromString(const char *u) 470{ 471 size_t size = strlen(u); 472 if (size > PY_SSIZE_T_MAX) { 473 PyErr_SetString(PyExc_OverflowError, "input too long"); 474 return NULL; 475 } 476 477 return PyUnicode_FromStringAndSize(u, size); 478} 479 480#ifdef HAVE_WCHAR_H 481 482PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 483 Py_ssize_t size) 484{ 485 PyUnicodeObject *unicode; 486 487 if (w == NULL) { 488 PyErr_BadInternalCall(); 489 return NULL; 490 } 491 492 unicode = _PyUnicode_New(size); 493 if (!unicode) 494 return NULL; 495 496 /* Copy the wchar_t data into the new object */ 497#ifdef HAVE_USABLE_WCHAR_T 498 memcpy(unicode->str, w, size * sizeof(wchar_t)); 499#else 500 { 501 register Py_UNICODE *u; 502 register Py_ssize_t i; 503 u = PyUnicode_AS_UNICODE(unicode); 504 for (i = size; i > 0; i--) 505 *u++ = *w++; 506 } 507#endif 508 509 return (PyObject *)unicode; 510} 511 512static void 513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 514{ 515 *fmt++ = '%'; 516 if (width) { 517 if (zeropad) 518 *fmt++ = '0'; 519 fmt += sprintf(fmt, "%d", width); 520 } 521 if (precision) 522 fmt += sprintf(fmt, ".%d", precision); 523 if (longflag) 524 *fmt++ = 'l'; 525 else if (size_tflag) { 526 char *f = PY_FORMAT_SIZE_T; 527 while (*f) 528 *fmt++ = *f++; 529 } 530 *fmt++ = c; 531 *fmt = '\0'; 532} 533 534#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 535 536PyObject * 537PyUnicode_FromFormatV(const char *format, va_list vargs) 538{ 539 va_list count; 540 Py_ssize_t callcount = 0; 541 PyObject **callresults = NULL; 542 PyObject **callresult = NULL; 543 Py_ssize_t n = 0; 544 int width = 0; 545 int precision = 0; 546 int zeropad; 547 const char* f; 548 Py_UNICODE *s; 549 PyObject *string; 550 /* used by sprintf */ 551 char buffer[21]; 552 /* use abuffer instead of buffer, if we need more space 553 * (which can happen if there's a format specifier with width). */ 554 char *abuffer = NULL; 555 char *realbuffer; 556 Py_ssize_t abuffersize = 0; 557 char fmt[60]; /* should be enough for %0width.precisionld */ 558 const char *copy; 559 560#ifdef VA_LIST_IS_ARRAY 561 Py_MEMCPY(count, vargs, sizeof(va_list)); 562#else 563#ifdef __va_copy 564 __va_copy(count, vargs); 565#else 566 count = vargs; 567#endif 568#endif 569 /* step 1: count the number of %S/%R format specifications 570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects 571 * once during step 3 and put the result in an array) */ 572 for (f = format; *f; f++) { 573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) 574 ++callcount; 575 } 576 /* step 2: allocate memory for the results of 577 * PyObject_Unicode()/PyObject_Repr() calls */ 578 if (callcount) { 579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount); 580 if (!callresults) { 581 PyErr_NoMemory(); 582 return NULL; 583 } 584 callresult = callresults; 585 } 586 /* step 3: figure out how large a buffer we need */ 587 for (f = format; *f; f++) { 588 if (*f == '%') { 589 const char* p = f; 590 width = 0; 591 while (isdigit(Py_CHARMASK(*f))) 592 width = (width*10) + *f++ - '0'; 593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) 594 ; 595 596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 597 * they don't affect the amount of space we reserve. 598 */ 599 if ((*f == 'l' || *f == 'z') && 600 (f[1] == 'd' || f[1] == 'u')) 601 ++f; 602 603 switch (*f) { 604 case 'c': 605 (void)va_arg(count, int); 606 /* fall through... */ 607 case '%': 608 n++; 609 break; 610 case 'd': case 'u': case 'i': case 'x': 611 (void) va_arg(count, int); 612 /* 20 bytes is enough to hold a 64-bit 613 integer. Decimal takes the most space. 614 This isn't enough for octal. 615 If a width is specified we need more 616 (which we allocate later). */ 617 if (width < 20) 618 width = 20; 619 n += width; 620 if (abuffersize < width) 621 abuffersize = width; 622 break; 623 case 's': 624 { 625 /* UTF-8 */ 626 unsigned char*s; 627 s = va_arg(count, unsigned char*); 628 while (*s) { 629 if (*s < 128) { 630 n++; s++; 631 } else if (*s < 0xc0) { 632 /* invalid UTF-8 */ 633 n++; s++; 634 } else if (*s < 0xc0) { 635 n++; 636 s++; if(!*s)break; 637 s++; 638 } else if (*s < 0xe0) { 639 n++; 640 s++; if(!*s)break; 641 s++; if(!*s)break; 642 s++; 643 } else { 644 #ifdef Py_UNICODE_WIDE 645 n++; 646 #else 647 n+=2; 648 #endif 649 s++; if(!*s)break; 650 s++; if(!*s)break; 651 s++; if(!*s)break; 652 s++; 653 } 654 } 655 break; 656 } 657 case 'U': 658 { 659 PyObject *obj = va_arg(count, PyObject *); 660 assert(obj && PyUnicode_Check(obj)); 661 n += PyUnicode_GET_SIZE(obj); 662 break; 663 } 664 case 'V': 665 { 666 PyObject *obj = va_arg(count, PyObject *); 667 const char *str = va_arg(count, const char *); 668 assert(obj || str); 669 assert(!obj || PyUnicode_Check(obj)); 670 if (obj) 671 n += PyUnicode_GET_SIZE(obj); 672 else 673 n += strlen(str); 674 break; 675 } 676 case 'S': 677 { 678 PyObject *obj = va_arg(count, PyObject *); 679 PyObject *str; 680 assert(obj); 681 str = PyObject_Unicode(obj); 682 if (!str) 683 goto fail; 684 n += PyUnicode_GET_SIZE(str); 685 /* Remember the str and switch to the next slot */ 686 *callresult++ = str; 687 break; 688 } 689 case 'R': 690 { 691 PyObject *obj = va_arg(count, PyObject *); 692 PyObject *repr; 693 assert(obj); 694 repr = PyObject_Repr(obj); 695 if (!repr) 696 goto fail; 697 n += PyUnicode_GET_SIZE(repr); 698 /* Remember the repr and switch to the next slot */ 699 *callresult++ = repr; 700 break; 701 } 702 case 'p': 703 (void) va_arg(count, int); 704 /* maximum 64-bit pointer representation: 705 * 0xffffffffffffffff 706 * so 19 characters is enough. 707 * XXX I count 18 -- what's the extra for? 708 */ 709 n += 19; 710 break; 711 default: 712 /* if we stumble upon an unknown 713 formatting code, copy the rest of 714 the format string to the output 715 string. (we cannot just skip the 716 code, since there's no way to know 717 what's in the argument list) */ 718 n += strlen(p); 719 goto expand; 720 } 721 } else 722 n++; 723 } 724 expand: 725 if (abuffersize > 20) { 726 abuffer = PyMem_Malloc(abuffersize); 727 if (!abuffer) { 728 PyErr_NoMemory(); 729 goto fail; 730 } 731 realbuffer = abuffer; 732 } 733 else 734 realbuffer = buffer; 735 /* step 4: fill the buffer */ 736 /* Since we've analyzed how much space we need for the worst case, 737 we don't have to resize the string. 738 There can be no errors beyond this point. */ 739 string = PyUnicode_FromUnicode(NULL, n); 740 if (!string) 741 goto fail; 742 743 s = PyUnicode_AS_UNICODE(string); 744 callresult = callresults; 745 746 for (f = format; *f; f++) { 747 if (*f == '%') { 748 const char* p = f++; 749 int longflag = 0; 750 int size_tflag = 0; 751 zeropad = (*f == '0'); 752 /* parse the width.precision part */ 753 width = 0; 754 while (isdigit(Py_CHARMASK(*f))) 755 width = (width*10) + *f++ - '0'; 756 precision = 0; 757 if (*f == '.') { 758 f++; 759 while (isdigit(Py_CHARMASK(*f))) 760 precision = (precision*10) + *f++ - '0'; 761 } 762 /* handle the long flag, but only for %ld and %lu. 763 others can be added when necessary. */ 764 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 765 longflag = 1; 766 ++f; 767 } 768 /* handle the size_t flag. */ 769 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 770 size_tflag = 1; 771 ++f; 772 } 773 774 switch (*f) { 775 case 'c': 776 *s++ = va_arg(vargs, int); 777 break; 778 case 'd': 779 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 780 if (longflag) 781 sprintf(realbuffer, fmt, va_arg(vargs, long)); 782 else if (size_tflag) 783 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 784 else 785 sprintf(realbuffer, fmt, va_arg(vargs, int)); 786 appendstring(realbuffer); 787 break; 788 case 'u': 789 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 790 if (longflag) 791 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 792 else if (size_tflag) 793 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 794 else 795 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 796 appendstring(realbuffer); 797 break; 798 case 'i': 799 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 800 sprintf(realbuffer, fmt, va_arg(vargs, int)); 801 appendstring(realbuffer); 802 break; 803 case 'x': 804 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 805 sprintf(realbuffer, fmt, va_arg(vargs, int)); 806 appendstring(realbuffer); 807 break; 808 case 's': 809 { 810 /* Parameter must be UTF-8 encoded. 811 In case of encoding errors, use 812 the replacement character. */ 813 PyObject *u; 814 p = va_arg(vargs, char*); 815 u = PyUnicode_DecodeUTF8(p, strlen(p), 816 "replace"); 817 if (!u) 818 goto fail; 819 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 820 PyUnicode_GET_SIZE(u)); 821 s += PyUnicode_GET_SIZE(u); 822 Py_DECREF(u); 823 break; 824 } 825 case 'U': 826 { 827 PyObject *obj = va_arg(vargs, PyObject *); 828 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 829 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 830 s += size; 831 break; 832 } 833 case 'V': 834 { 835 PyObject *obj = va_arg(vargs, PyObject *); 836 const char *str = va_arg(vargs, const char *); 837 if (obj) { 838 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 839 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 840 s += size; 841 } else { 842 appendstring(str); 843 } 844 break; 845 } 846 case 'S': 847 case 'R': 848 { 849 Py_UNICODE *ucopy; 850 Py_ssize_t usize; 851 Py_ssize_t upos; 852 /* unused, since we already have the result */ 853 (void) va_arg(vargs, PyObject *); 854 ucopy = PyUnicode_AS_UNICODE(*callresult); 855 usize = PyUnicode_GET_SIZE(*callresult); 856 for (upos = 0; upos<usize;) 857 *s++ = ucopy[upos++]; 858 /* We're done with the unicode()/repr() => forget it */ 859 Py_DECREF(*callresult); 860 /* switch to next unicode()/repr() result */ 861 ++callresult; 862 break; 863 } 864 case 'p': 865 sprintf(buffer, "%p", va_arg(vargs, void*)); 866 /* %p is ill-defined: ensure leading 0x. */ 867 if (buffer[1] == 'X') 868 buffer[1] = 'x'; 869 else if (buffer[1] != 'x') { 870 memmove(buffer+2, buffer, strlen(buffer)+1); 871 buffer[0] = '0'; 872 buffer[1] = 'x'; 873 } 874 appendstring(buffer); 875 break; 876 case '%': 877 *s++ = '%'; 878 break; 879 default: 880 appendstring(p); 881 goto end; 882 } 883 } else 884 *s++ = *f; 885 } 886 887 end: 888 if (callresults) 889 PyMem_Free(callresults); 890 if (abuffer) 891 PyMem_Free(abuffer); 892 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 893 return string; 894 fail: 895 if (callresults) { 896 PyObject **callresult2 = callresults; 897 while (callresult2 < callresult) { 898 Py_DECREF(*callresult2); 899 ++callresult2; 900 } 901 PyMem_Free(callresults); 902 } 903 if (abuffer) 904 PyMem_Free(abuffer); 905 return NULL; 906} 907 908#undef appendstring 909 910PyObject * 911PyUnicode_FromFormat(const char *format, ...) 912{ 913 PyObject* ret; 914 va_list vargs; 915 916#ifdef HAVE_STDARG_PROTOTYPES 917 va_start(vargs, format); 918#else 919 va_start(vargs); 920#endif 921 ret = PyUnicode_FromFormatV(format, vargs); 922 va_end(vargs); 923 return ret; 924} 925 926Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 927 wchar_t *w, 928 Py_ssize_t size) 929{ 930 if (unicode == NULL) { 931 PyErr_BadInternalCall(); 932 return -1; 933 } 934 935 /* If possible, try to copy the 0-termination as well */ 936 if (size > PyUnicode_GET_SIZE(unicode)) 937 size = PyUnicode_GET_SIZE(unicode) + 1; 938 939#ifdef HAVE_USABLE_WCHAR_T 940 memcpy(w, unicode->str, size * sizeof(wchar_t)); 941#else 942 { 943 register Py_UNICODE *u; 944 register Py_ssize_t i; 945 u = PyUnicode_AS_UNICODE(unicode); 946 for (i = size; i > 0; i--) 947 *w++ = *u++; 948 } 949#endif 950 951 if (size > PyUnicode_GET_SIZE(unicode)) 952 return PyUnicode_GET_SIZE(unicode); 953 else 954 return size; 955} 956 957#endif 958 959PyObject *PyUnicode_FromOrdinal(int ordinal) 960{ 961 Py_UNICODE s[2]; 962 963 if (ordinal < 0 || ordinal > 0x10ffff) { 964 PyErr_SetString(PyExc_ValueError, 965 "chr() arg not in range(0x110000)"); 966 return NULL; 967 } 968 969#ifndef Py_UNICODE_WIDE 970 if (ordinal > 0xffff) { 971 ordinal -= 0x10000; 972 s[0] = 0xD800 | (ordinal >> 10); 973 s[1] = 0xDC00 | (ordinal & 0x3FF); 974 return PyUnicode_FromUnicode(s, 2); 975 } 976#endif 977 978 s[0] = (Py_UNICODE)ordinal; 979 return PyUnicode_FromUnicode(s, 1); 980} 981 982PyObject *PyUnicode_FromObject(register PyObject *obj) 983{ 984 /* XXX Perhaps we should make this API an alias of 985 PyObject_Unicode() instead ?! */ 986 if (PyUnicode_CheckExact(obj)) { 987 Py_INCREF(obj); 988 return obj; 989 } 990 if (PyUnicode_Check(obj)) { 991 /* For a Unicode subtype that's not a Unicode object, 992 return a true Unicode object with the same data. */ 993 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 994 PyUnicode_GET_SIZE(obj)); 995 } 996 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 997} 998 999PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1000 const char *encoding, 1001 const char *errors) 1002{ 1003 const char *s = NULL; 1004 Py_ssize_t len; 1005 PyObject *v; 1006 1007 if (obj == NULL) { 1008 PyErr_BadInternalCall(); 1009 return NULL; 1010 } 1011 1012 if (PyUnicode_Check(obj)) { 1013 PyErr_SetString(PyExc_TypeError, 1014 "decoding Unicode is not supported"); 1015 return NULL; 1016 } 1017 1018 /* Coerce object */ 1019 if (PyString_Check(obj)) { 1020 s = PyString_AS_STRING(obj); 1021 len = PyString_GET_SIZE(obj); 1022 } 1023 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1024 /* Overwrite the error message with something more useful in 1025 case of a TypeError. */ 1026 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1027 PyErr_Format(PyExc_TypeError, 1028 "coercing to Unicode: need string or buffer, " 1029 "%.80s found", 1030 Py_Type(obj)->tp_name); 1031 goto onError; 1032 } 1033 1034 /* Convert to Unicode */ 1035 if (len == 0) { 1036 Py_INCREF(unicode_empty); 1037 v = (PyObject *)unicode_empty; 1038 } 1039 else 1040 v = PyUnicode_Decode(s, len, encoding, errors); 1041 1042 return v; 1043 1044 onError: 1045 return NULL; 1046} 1047 1048PyObject *PyUnicode_Decode(const char *s, 1049 Py_ssize_t size, 1050 const char *encoding, 1051 const char *errors) 1052{ 1053 PyObject *buffer = NULL, *unicode; 1054 1055 if (encoding == NULL) 1056 encoding = PyUnicode_GetDefaultEncoding(); 1057 1058 /* Shortcuts for common default encodings */ 1059 if (strcmp(encoding, "utf-8") == 0) 1060 return PyUnicode_DecodeUTF8(s, size, errors); 1061 else if (strcmp(encoding, "latin-1") == 0) 1062 return PyUnicode_DecodeLatin1(s, size, errors); 1063#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1064 else if (strcmp(encoding, "mbcs") == 0) 1065 return PyUnicode_DecodeMBCS(s, size, errors); 1066#endif 1067 else if (strcmp(encoding, "ascii") == 0) 1068 return PyUnicode_DecodeASCII(s, size, errors); 1069 1070 /* Decode via the codec registry */ 1071 buffer = PyBuffer_FromMemory((void *)s, size); 1072 if (buffer == NULL) 1073 goto onError; 1074 unicode = PyCodec_Decode(buffer, encoding, errors); 1075 if (unicode == NULL) 1076 goto onError; 1077 if (!PyUnicode_Check(unicode)) { 1078 PyErr_Format(PyExc_TypeError, 1079 "decoder did not return an unicode object (type=%.400s)", 1080 Py_Type(unicode)->tp_name); 1081 Py_DECREF(unicode); 1082 goto onError; 1083 } 1084 Py_DECREF(buffer); 1085 return unicode; 1086 1087 onError: 1088 Py_XDECREF(buffer); 1089 return NULL; 1090} 1091 1092PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1093 const char *encoding, 1094 const char *errors) 1095{ 1096 PyObject *v; 1097 1098 if (!PyUnicode_Check(unicode)) { 1099 PyErr_BadArgument(); 1100 goto onError; 1101 } 1102 1103 if (encoding == NULL) 1104 encoding = PyUnicode_GetDefaultEncoding(); 1105 1106 /* Decode via the codec registry */ 1107 v = PyCodec_Decode(unicode, encoding, errors); 1108 if (v == NULL) 1109 goto onError; 1110 return v; 1111 1112 onError: 1113 return NULL; 1114} 1115 1116PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1117 Py_ssize_t size, 1118 const char *encoding, 1119 const char *errors) 1120{ 1121 PyObject *v, *unicode; 1122 1123 unicode = PyUnicode_FromUnicode(s, size); 1124 if (unicode == NULL) 1125 return NULL; 1126 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1127 Py_DECREF(unicode); 1128 return v; 1129} 1130 1131PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1132 const char *encoding, 1133 const char *errors) 1134{ 1135 PyObject *v; 1136 1137 if (!PyUnicode_Check(unicode)) { 1138 PyErr_BadArgument(); 1139 goto onError; 1140 } 1141 1142 if (encoding == NULL) 1143 encoding = PyUnicode_GetDefaultEncoding(); 1144 1145 /* Encode via the codec registry */ 1146 v = PyCodec_Encode(unicode, encoding, errors); 1147 if (v == NULL) 1148 goto onError; 1149 return v; 1150 1151 onError: 1152 return NULL; 1153} 1154 1155PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1156 const char *encoding, 1157 const char *errors) 1158{ 1159 PyObject *v; 1160 1161 if (!PyUnicode_Check(unicode)) { 1162 PyErr_BadArgument(); 1163 goto onError; 1164 } 1165 1166 if (encoding == NULL) 1167 encoding = PyUnicode_GetDefaultEncoding(); 1168 1169 /* Shortcuts for common default encodings */ 1170 if (errors == NULL) { 1171 if (strcmp(encoding, "utf-8") == 0) 1172 return PyUnicode_AsUTF8String(unicode); 1173 else if (strcmp(encoding, "latin-1") == 0) 1174 return PyUnicode_AsLatin1String(unicode); 1175#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1176 else if (strcmp(encoding, "mbcs") == 0) 1177 return PyUnicode_AsMBCSString(unicode); 1178#endif 1179 else if (strcmp(encoding, "ascii") == 0) 1180 return PyUnicode_AsASCIIString(unicode); 1181 } 1182 1183 /* Encode via the codec registry */ 1184 v = PyCodec_Encode(unicode, encoding, errors); 1185 if (v == NULL) 1186 goto onError; 1187 if (!PyBytes_Check(v)) { 1188 if (PyString_Check(v)) { 1189 /* Old codec, turn it into bytes */ 1190 PyObject *b = PyBytes_FromObject(v); 1191 Py_DECREF(v); 1192 return b; 1193 } 1194 PyErr_Format(PyExc_TypeError, 1195 "encoder did not return a bytes object " 1196 "(type=%.400s, encoding=%.20s, errors=%.20s)", 1197 v->ob_type->tp_name, 1198 encoding ? encoding : "NULL", 1199 errors ? errors : "NULL"); 1200 Py_DECREF(v); 1201 goto onError; 1202 } 1203 return v; 1204 1205 onError: 1206 return NULL; 1207} 1208 1209PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1210 const char *errors) 1211{ 1212 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1213 PyObject *b; 1214 if (v) 1215 return v; 1216 if (errors != NULL) 1217 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1218 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1219 PyUnicode_GET_SIZE(unicode), 1220 NULL); 1221 if (!b) 1222 return NULL; 1223 v = PyString_FromStringAndSize(PyBytes_AsString(b), 1224 PyBytes_Size(b)); 1225 Py_DECREF(b); 1226 ((PyUnicodeObject *)unicode)->defenc = v; 1227 return v; 1228} 1229 1230char* 1231PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1232{ 1233 PyObject *str8; 1234 if (!PyUnicode_Check(unicode)) { 1235 PyErr_BadArgument(); 1236 return NULL; 1237 } 1238 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1239 if (str8 == NULL) 1240 return NULL; 1241 if (psize != NULL) 1242 *psize = PyString_GET_SIZE(str8); 1243 return PyString_AS_STRING(str8); 1244} 1245 1246char* 1247PyUnicode_AsString(PyObject *unicode) 1248{ 1249 return PyUnicode_AsStringAndSize(unicode, NULL); 1250} 1251 1252Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1253{ 1254 if (!PyUnicode_Check(unicode)) { 1255 PyErr_BadArgument(); 1256 goto onError; 1257 } 1258 return PyUnicode_AS_UNICODE(unicode); 1259 1260 onError: 1261 return NULL; 1262} 1263 1264Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1265{ 1266 if (!PyUnicode_Check(unicode)) { 1267 PyErr_BadArgument(); 1268 goto onError; 1269 } 1270 return PyUnicode_GET_SIZE(unicode); 1271 1272 onError: 1273 return -1; 1274} 1275 1276const char *PyUnicode_GetDefaultEncoding(void) 1277{ 1278 return unicode_default_encoding; 1279} 1280 1281int PyUnicode_SetDefaultEncoding(const char *encoding) 1282{ 1283 if (strcmp(encoding, unicode_default_encoding) != 0) { 1284 PyErr_Format(PyExc_ValueError, 1285 "Can only set default encoding to %s", 1286 unicode_default_encoding); 1287 return -1; 1288 } 1289 return 0; 1290} 1291 1292/* error handling callback helper: 1293 build arguments, call the callback and check the arguments, 1294 if no exception occurred, copy the replacement to the output 1295 and adjust various state variables. 1296 return 0 on success, -1 on error 1297*/ 1298 1299static 1300int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1301 const char *encoding, const char *reason, 1302 const char **input, const char **inend, Py_ssize_t *startinpos, 1303 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1304 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1305{ 1306 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1307 1308 PyObject *restuple = NULL; 1309 PyObject *repunicode = NULL; 1310 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1311 Py_ssize_t insize; 1312 Py_ssize_t requiredsize; 1313 Py_ssize_t newpos; 1314 Py_UNICODE *repptr; 1315 PyObject *inputobj = NULL; 1316 Py_ssize_t repsize; 1317 int res = -1; 1318 1319 if (*errorHandler == NULL) { 1320 *errorHandler = PyCodec_LookupError(errors); 1321 if (*errorHandler == NULL) 1322 goto onError; 1323 } 1324 1325 if (*exceptionObject == NULL) { 1326 *exceptionObject = PyUnicodeDecodeError_Create( 1327 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1328 if (*exceptionObject == NULL) 1329 goto onError; 1330 } 1331 else { 1332 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1333 goto onError; 1334 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1335 goto onError; 1336 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1337 goto onError; 1338 } 1339 1340 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1341 if (restuple == NULL) 1342 goto onError; 1343 if (!PyTuple_Check(restuple)) { 1344 PyErr_Format(PyExc_TypeError, &argparse[4]); 1345 goto onError; 1346 } 1347 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1348 goto onError; 1349 1350 /* Copy back the bytes variables, which might have been modified by the 1351 callback */ 1352 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1353 if (!inputobj) 1354 goto onError; 1355 if (!PyBytes_Check(inputobj)) { 1356 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1357 } 1358 *input = PyBytes_AS_STRING(inputobj); 1359 insize = PyBytes_GET_SIZE(inputobj); 1360 *inend = *input + insize; 1361 /* we can DECREF safely, as the exception has another reference, 1362 so the object won't go away. */ 1363 Py_DECREF(inputobj); 1364 1365 if (newpos<0) 1366 newpos = insize+newpos; 1367 if (newpos<0 || newpos>insize) { 1368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1369 goto onError; 1370 } 1371 1372 /* need more space? (at least enough for what we 1373 have+the replacement+the rest of the string (starting 1374 at the new input position), so we won't have to check space 1375 when there are no errors in the rest of the string) */ 1376 repptr = PyUnicode_AS_UNICODE(repunicode); 1377 repsize = PyUnicode_GET_SIZE(repunicode); 1378 requiredsize = *outpos + repsize + insize-newpos; 1379 if (requiredsize > outsize) { 1380 if (requiredsize<2*outsize) 1381 requiredsize = 2*outsize; 1382 if (PyUnicode_Resize(output, requiredsize) < 0) 1383 goto onError; 1384 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1385 } 1386 *endinpos = newpos; 1387 *inptr = *input + newpos; 1388 Py_UNICODE_COPY(*outptr, repptr, repsize); 1389 *outptr += repsize; 1390 *outpos += repsize; 1391 1392 /* we made it! */ 1393 res = 0; 1394 1395 onError: 1396 Py_XDECREF(restuple); 1397 return res; 1398} 1399 1400/* --- UTF-7 Codec -------------------------------------------------------- */ 1401 1402/* see RFC2152 for details */ 1403 1404static 1405char utf7_special[128] = { 1406 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1407 encoded: 1408 0 - not special 1409 1 - special 1410 2 - whitespace (optional) 1411 3 - RFC2152 Set O (optional) */ 1412 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1413 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1414 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1415 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1416 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1417 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1418 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1419 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1420 1421}; 1422 1423/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1424 warnings about the comparison always being false; since 1425 utf7_special[0] is 1, we can safely make that one comparison 1426 true */ 1427 1428#define SPECIAL(c, encodeO, encodeWS) \ 1429 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1430 (encodeWS && (utf7_special[(c)] == 2)) || \ 1431 (encodeO && (utf7_special[(c)] == 3))) 1432 1433#define B64(n) \ 1434 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1435#define B64CHAR(c) \ 1436 (isalnum(c) || (c) == '+' || (c) == '/') 1437#define UB64(c) \ 1438 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1439 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 1440 1441#define ENCODE(out, ch, bits) \ 1442 while (bits >= 6) { \ 1443 *out++ = B64(ch >> (bits-6)); \ 1444 bits -= 6; \ 1445 } 1446 1447#define DECODE(out, ch, bits, surrogate) \ 1448 while (bits >= 16) { \ 1449 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1450 bits -= 16; \ 1451 if (surrogate) { \ 1452 /* We have already generated an error for the high surrogate \ 1453 so let's not bother seeing if the low surrogate is correct or not */ \ 1454 surrogate = 0; \ 1455 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1456 /* This is a surrogate pair. Unfortunately we can't represent \ 1457 it in a 16-bit character */ \ 1458 surrogate = 1; \ 1459 errmsg = "code pairs are not supported"; \ 1460 goto utf7Error; \ 1461 } else { \ 1462 *out++ = outCh; \ 1463 } \ 1464 } 1465 1466PyObject *PyUnicode_DecodeUTF7(const char *s, 1467 Py_ssize_t size, 1468 const char *errors) 1469{ 1470 const char *starts = s; 1471 Py_ssize_t startinpos; 1472 Py_ssize_t endinpos; 1473 Py_ssize_t outpos; 1474 const char *e; 1475 PyUnicodeObject *unicode; 1476 Py_UNICODE *p; 1477 const char *errmsg = ""; 1478 int inShift = 0; 1479 unsigned int bitsleft = 0; 1480 unsigned long charsleft = 0; 1481 int surrogate = 0; 1482 PyObject *errorHandler = NULL; 1483 PyObject *exc = NULL; 1484 1485 unicode = _PyUnicode_New(size); 1486 if (!unicode) 1487 return NULL; 1488 if (size == 0) 1489 return (PyObject *)unicode; 1490 1491 p = unicode->str; 1492 e = s + size; 1493 1494 while (s < e) { 1495 Py_UNICODE ch; 1496 restart: 1497 ch = *s; 1498 1499 if (inShift) { 1500 if ((ch == '-') || !B64CHAR(ch)) { 1501 inShift = 0; 1502 s++; 1503 1504 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1505 if (bitsleft >= 6) { 1506 /* The shift sequence has a partial character in it. If 1507 bitsleft < 6 then we could just classify it as padding 1508 but that is not the case here */ 1509 1510 errmsg = "partial character in shift sequence"; 1511 goto utf7Error; 1512 } 1513 /* According to RFC2152 the remaining bits should be zero. We 1514 choose to signal an error/insert a replacement character 1515 here so indicate the potential of a misencoded character. */ 1516 1517 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1518 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1519 errmsg = "non-zero padding bits in shift sequence"; 1520 goto utf7Error; 1521 } 1522 1523 if (ch == '-') { 1524 if ((s < e) && (*(s) == '-')) { 1525 *p++ = '-'; 1526 inShift = 1; 1527 } 1528 } else if (SPECIAL(ch,0,0)) { 1529 errmsg = "unexpected special character"; 1530 goto utf7Error; 1531 } else { 1532 *p++ = ch; 1533 } 1534 } else { 1535 charsleft = (charsleft << 6) | UB64(ch); 1536 bitsleft += 6; 1537 s++; 1538 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1539 } 1540 } 1541 else if ( ch == '+' ) { 1542 startinpos = s-starts; 1543 s++; 1544 if (s < e && *s == '-') { 1545 s++; 1546 *p++ = '+'; 1547 } else 1548 { 1549 inShift = 1; 1550 bitsleft = 0; 1551 } 1552 } 1553 else if (SPECIAL(ch,0,0)) { 1554 startinpos = s-starts; 1555 errmsg = "unexpected special character"; 1556 s++; 1557 goto utf7Error; 1558 } 1559 else { 1560 *p++ = ch; 1561 s++; 1562 } 1563 continue; 1564 utf7Error: 1565 outpos = p-PyUnicode_AS_UNICODE(unicode); 1566 endinpos = s-starts; 1567 if (unicode_decode_call_errorhandler( 1568 errors, &errorHandler, 1569 "utf7", errmsg, 1570 &starts, &e, &startinpos, &endinpos, &exc, &s, 1571 (PyObject **)&unicode, &outpos, &p)) 1572 goto onError; 1573 } 1574 1575 if (inShift) { 1576 outpos = p-PyUnicode_AS_UNICODE(unicode); 1577 endinpos = size; 1578 if (unicode_decode_call_errorhandler( 1579 errors, &errorHandler, 1580 "utf7", "unterminated shift sequence", 1581 &starts, &e, &startinpos, &endinpos, &exc, &s, 1582 (PyObject **)&unicode, &outpos, &p)) 1583 goto onError; 1584 if (s < e) 1585 goto restart; 1586 } 1587 1588 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1589 goto onError; 1590 1591 Py_XDECREF(errorHandler); 1592 Py_XDECREF(exc); 1593 return (PyObject *)unicode; 1594 1595onError: 1596 Py_XDECREF(errorHandler); 1597 Py_XDECREF(exc); 1598 Py_DECREF(unicode); 1599 return NULL; 1600} 1601 1602 1603PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1604 Py_ssize_t size, 1605 int encodeSetO, 1606 int encodeWhiteSpace, 1607 const char *errors) 1608{ 1609 PyObject *v; 1610 /* It might be possible to tighten this worst case */ 1611 Py_ssize_t cbAllocated = 5 * size; 1612 int inShift = 0; 1613 Py_ssize_t i = 0; 1614 unsigned int bitsleft = 0; 1615 unsigned long charsleft = 0; 1616 char * out; 1617 char * start; 1618 1619 if (size == 0) 1620 return PyBytes_FromStringAndSize(NULL, 0); 1621 1622 v = PyBytes_FromStringAndSize(NULL, cbAllocated); 1623 if (v == NULL) 1624 return NULL; 1625 1626 start = out = PyBytes_AS_STRING(v); 1627 for (;i < size; ++i) { 1628 Py_UNICODE ch = s[i]; 1629 1630 if (!inShift) { 1631 if (ch == '+') { 1632 *out++ = '+'; 1633 *out++ = '-'; 1634 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1635 charsleft = ch; 1636 bitsleft = 16; 1637 *out++ = '+'; 1638 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1639 inShift = bitsleft > 0; 1640 } else { 1641 *out++ = (char) ch; 1642 } 1643 } else { 1644 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1645 *out++ = B64(charsleft << (6-bitsleft)); 1646 charsleft = 0; 1647 bitsleft = 0; 1648 /* Characters not in the BASE64 set implicitly unshift the sequence 1649 so no '-' is required, except if the character is itself a '-' */ 1650 if (B64CHAR(ch) || ch == '-') { 1651 *out++ = '-'; 1652 } 1653 inShift = 0; 1654 *out++ = (char) ch; 1655 } else { 1656 bitsleft += 16; 1657 charsleft = (charsleft << 16) | ch; 1658 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1659 1660 /* If the next character is special then we dont' need to terminate 1661 the shift sequence. If the next character is not a BASE64 character 1662 or '-' then the shift sequence will be terminated implicitly and we 1663 don't have to insert a '-'. */ 1664 1665 if (bitsleft == 0) { 1666 if (i + 1 < size) { 1667 Py_UNICODE ch2 = s[i+1]; 1668 1669 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1670 1671 } else if (B64CHAR(ch2) || ch2 == '-') { 1672 *out++ = '-'; 1673 inShift = 0; 1674 } else { 1675 inShift = 0; 1676 } 1677 1678 } 1679 else { 1680 *out++ = '-'; 1681 inShift = 0; 1682 } 1683 } 1684 } 1685 } 1686 } 1687 if (bitsleft) { 1688 *out++= B64(charsleft << (6-bitsleft) ); 1689 *out++ = '-'; 1690 } 1691 1692 if (PyBytes_Resize(v, out - start)) { 1693 Py_DECREF(v); 1694 return NULL; 1695 } 1696 return v; 1697} 1698 1699#undef SPECIAL 1700#undef B64 1701#undef B64CHAR 1702#undef UB64 1703#undef ENCODE 1704#undef DECODE 1705 1706/* --- UTF-8 Codec -------------------------------------------------------- */ 1707 1708static 1709char utf8_code_length[256] = { 1710 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1711 illegal prefix. see RFC 2279 for details */ 1712 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1713 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1714 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1715 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1716 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1717 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1720 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1721 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1722 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1723 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1724 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1725 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1726 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1727 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1728}; 1729 1730PyObject *PyUnicode_DecodeUTF8(const char *s, 1731 Py_ssize_t size, 1732 const char *errors) 1733{ 1734 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1735} 1736 1737PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1738 Py_ssize_t size, 1739 const char *errors, 1740 Py_ssize_t *consumed) 1741{ 1742 const char *starts = s; 1743 int n; 1744 Py_ssize_t startinpos; 1745 Py_ssize_t endinpos; 1746 Py_ssize_t outpos; 1747 const char *e; 1748 PyUnicodeObject *unicode; 1749 Py_UNICODE *p; 1750 const char *errmsg = ""; 1751 PyObject *errorHandler = NULL; 1752 PyObject *exc = NULL; 1753 1754 /* Note: size will always be longer than the resulting Unicode 1755 character count */ 1756 unicode = _PyUnicode_New(size); 1757 if (!unicode) 1758 return NULL; 1759 if (size == 0) { 1760 if (consumed) 1761 *consumed = 0; 1762 return (PyObject *)unicode; 1763 } 1764 1765 /* Unpack UTF-8 encoded data */ 1766 p = unicode->str; 1767 e = s + size; 1768 1769 while (s < e) { 1770 Py_UCS4 ch = (unsigned char)*s; 1771 1772 if (ch < 0x80) { 1773 *p++ = (Py_UNICODE)ch; 1774 s++; 1775 continue; 1776 } 1777 1778 n = utf8_code_length[ch]; 1779 1780 if (s + n > e) { 1781 if (consumed) 1782 break; 1783 else { 1784 errmsg = "unexpected end of data"; 1785 startinpos = s-starts; 1786 endinpos = size; 1787 goto utf8Error; 1788 } 1789 } 1790 1791 switch (n) { 1792 1793 case 0: 1794 errmsg = "unexpected code byte"; 1795 startinpos = s-starts; 1796 endinpos = startinpos+1; 1797 goto utf8Error; 1798 1799 case 1: 1800 errmsg = "internal error"; 1801 startinpos = s-starts; 1802 endinpos = startinpos+1; 1803 goto utf8Error; 1804 1805 case 2: 1806 if ((s[1] & 0xc0) != 0x80) { 1807 errmsg = "invalid data"; 1808 startinpos = s-starts; 1809 endinpos = startinpos+2; 1810 goto utf8Error; 1811 } 1812 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1813 if (ch < 0x80) { 1814 startinpos = s-starts; 1815 endinpos = startinpos+2; 1816 errmsg = "illegal encoding"; 1817 goto utf8Error; 1818 } 1819 else 1820 *p++ = (Py_UNICODE)ch; 1821 break; 1822 1823 case 3: 1824 if ((s[1] & 0xc0) != 0x80 || 1825 (s[2] & 0xc0) != 0x80) { 1826 errmsg = "invalid data"; 1827 startinpos = s-starts; 1828 endinpos = startinpos+3; 1829 goto utf8Error; 1830 } 1831 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1832 if (ch < 0x0800) { 1833 /* Note: UTF-8 encodings of surrogates are considered 1834 legal UTF-8 sequences; 1835 1836 XXX For wide builds (UCS-4) we should probably try 1837 to recombine the surrogates into a single code 1838 unit. 1839 */ 1840 errmsg = "illegal encoding"; 1841 startinpos = s-starts; 1842 endinpos = startinpos+3; 1843 goto utf8Error; 1844 } 1845 else 1846 *p++ = (Py_UNICODE)ch; 1847 break; 1848 1849 case 4: 1850 if ((s[1] & 0xc0) != 0x80 || 1851 (s[2] & 0xc0) != 0x80 || 1852 (s[3] & 0xc0) != 0x80) { 1853 errmsg = "invalid data"; 1854 startinpos = s-starts; 1855 endinpos = startinpos+4; 1856 goto utf8Error; 1857 } 1858 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1859 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1860 /* validate and convert to UTF-16 */ 1861 if ((ch < 0x10000) /* minimum value allowed for 4 1862 byte encoding */ 1863 || (ch > 0x10ffff)) /* maximum value allowed for 1864 UTF-16 */ 1865 { 1866 errmsg = "illegal encoding"; 1867 startinpos = s-starts; 1868 endinpos = startinpos+4; 1869 goto utf8Error; 1870 } 1871#ifdef Py_UNICODE_WIDE 1872 *p++ = (Py_UNICODE)ch; 1873#else 1874 /* compute and append the two surrogates: */ 1875 1876 /* translate from 10000..10FFFF to 0..FFFF */ 1877 ch -= 0x10000; 1878 1879 /* high surrogate = top 10 bits added to D800 */ 1880 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1881 1882 /* low surrogate = bottom 10 bits added to DC00 */ 1883 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1884#endif 1885 break; 1886 1887 default: 1888 /* Other sizes are only needed for UCS-4 */ 1889 errmsg = "unsupported Unicode code range"; 1890 startinpos = s-starts; 1891 endinpos = startinpos+n; 1892 goto utf8Error; 1893 } 1894 s += n; 1895 continue; 1896 1897 utf8Error: 1898 outpos = p-PyUnicode_AS_UNICODE(unicode); 1899 if (unicode_decode_call_errorhandler( 1900 errors, &errorHandler, 1901 "utf8", errmsg, 1902 &starts, &e, &startinpos, &endinpos, &exc, &s, 1903 (PyObject **)&unicode, &outpos, &p)) 1904 goto onError; 1905 } 1906 if (consumed) 1907 *consumed = s-starts; 1908 1909 /* Adjust length */ 1910 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1911 goto onError; 1912 1913 Py_XDECREF(errorHandler); 1914 Py_XDECREF(exc); 1915 return (PyObject *)unicode; 1916 1917onError: 1918 Py_XDECREF(errorHandler); 1919 Py_XDECREF(exc); 1920 Py_DECREF(unicode); 1921 return NULL; 1922} 1923 1924/* Allocation strategy: if the string is short, convert into a stack buffer 1925 and allocate exactly as much space needed at the end. Else allocate the 1926 maximum possible needed (4 result bytes per Unicode character), and return 1927 the excess memory at the end. 1928*/ 1929PyObject * 1930PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1931 Py_ssize_t size, 1932 const char *errors) 1933{ 1934#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1935 1936 Py_ssize_t i; /* index into s of next input byte */ 1937 PyObject *v; /* result string object */ 1938 char *p; /* next free byte in output buffer */ 1939 Py_ssize_t nallocated; /* number of result bytes allocated */ 1940 Py_ssize_t nneeded; /* number of result bytes needed */ 1941 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1942 1943 assert(s != NULL); 1944 assert(size >= 0); 1945 1946 if (size <= MAX_SHORT_UNICHARS) { 1947 /* Write into the stack buffer; nallocated can't overflow. 1948 * At the end, we'll allocate exactly as much heap space as it 1949 * turns out we need. 1950 */ 1951 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1952 v = NULL; /* will allocate after we're done */ 1953 p = stackbuf; 1954 } 1955 else { 1956 /* Overallocate on the heap, and give the excess back at the end. */ 1957 nallocated = size * 4; 1958 if (nallocated / 4 != size) /* overflow! */ 1959 return PyErr_NoMemory(); 1960 v = PyBytes_FromStringAndSize(NULL, nallocated); 1961 if (v == NULL) 1962 return NULL; 1963 p = PyBytes_AS_STRING(v); 1964 } 1965 1966 for (i = 0; i < size;) { 1967 Py_UCS4 ch = s[i++]; 1968 1969 if (ch < 0x80) 1970 /* Encode ASCII */ 1971 *p++ = (char) ch; 1972 1973 else if (ch < 0x0800) { 1974 /* Encode Latin-1 */ 1975 *p++ = (char)(0xc0 | (ch >> 6)); 1976 *p++ = (char)(0x80 | (ch & 0x3f)); 1977 } 1978 else { 1979 /* Encode UCS2 Unicode ordinals */ 1980 if (ch < 0x10000) { 1981 /* Special case: check for high surrogate */ 1982 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1983 Py_UCS4 ch2 = s[i]; 1984 /* Check for low surrogate and combine the two to 1985 form a UCS4 value */ 1986 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1987 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1988 i++; 1989 goto encodeUCS4; 1990 } 1991 /* Fall through: handles isolated high surrogates */ 1992 } 1993 *p++ = (char)(0xe0 | (ch >> 12)); 1994 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1995 *p++ = (char)(0x80 | (ch & 0x3f)); 1996 continue; 1997 } 1998encodeUCS4: 1999 /* Encode UCS4 Unicode ordinals */ 2000 *p++ = (char)(0xf0 | (ch >> 18)); 2001 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2002 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2003 *p++ = (char)(0x80 | (ch & 0x3f)); 2004 } 2005 } 2006 2007 if (v == NULL) { 2008 /* This was stack allocated. */ 2009 nneeded = p - stackbuf; 2010 assert(nneeded <= nallocated); 2011 v = PyBytes_FromStringAndSize(stackbuf, nneeded); 2012 } 2013 else { 2014 /* Cut back to size actually needed. */ 2015 nneeded = p - PyBytes_AS_STRING(v); 2016 assert(nneeded <= nallocated); 2017 PyBytes_Resize(v, nneeded); 2018 } 2019 return v; 2020 2021#undef MAX_SHORT_UNICHARS 2022} 2023 2024PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2025{ 2026 if (!PyUnicode_Check(unicode)) { 2027 PyErr_BadArgument(); 2028 return NULL; 2029 } 2030 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2031 PyUnicode_GET_SIZE(unicode), 2032 NULL); 2033} 2034 2035/* --- UTF-32 Codec ------------------------------------------------------- */ 2036 2037PyObject * 2038PyUnicode_DecodeUTF32(const char *s, 2039 Py_ssize_t size, 2040 const char *errors, 2041 int *byteorder) 2042{ 2043 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2044} 2045 2046PyObject * 2047PyUnicode_DecodeUTF32Stateful(const char *s, 2048 Py_ssize_t size, 2049 const char *errors, 2050 int *byteorder, 2051 Py_ssize_t *consumed) 2052{ 2053 const char *starts = s; 2054 Py_ssize_t startinpos; 2055 Py_ssize_t endinpos; 2056 Py_ssize_t outpos; 2057 PyUnicodeObject *unicode; 2058 Py_UNICODE *p; 2059#ifndef Py_UNICODE_WIDE 2060 int i, pairs; 2061#else 2062 const int pairs = 0; 2063#endif 2064 const unsigned char *q, *e; 2065 int bo = 0; /* assume native ordering by default */ 2066 const char *errmsg = ""; 2067 /* Offsets from q for retrieving bytes in the right order. */ 2068#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2069 int iorder[] = {0, 1, 2, 3}; 2070#else 2071 int iorder[] = {3, 2, 1, 0}; 2072#endif 2073 PyObject *errorHandler = NULL; 2074 PyObject *exc = NULL; 2075 /* On narrow builds we split characters outside the BMP into two 2076 codepoints => count how much extra space we need. */ 2077#ifndef Py_UNICODE_WIDE 2078 for (i = pairs = 0; i < size/4; i++) 2079 if (((Py_UCS4 *)s)[i] >= 0x10000) 2080 pairs++; 2081#endif 2082 2083 /* This might be one to much, because of a BOM */ 2084 unicode = _PyUnicode_New((size+3)/4+pairs); 2085 if (!unicode) 2086 return NULL; 2087 if (size == 0) 2088 return (PyObject *)unicode; 2089 2090 /* Unpack UTF-32 encoded data */ 2091 p = unicode->str; 2092 q = (unsigned char *)s; 2093 e = q + size; 2094 2095 if (byteorder) 2096 bo = *byteorder; 2097 2098 /* Check for BOM marks (U+FEFF) in the input and adjust current 2099 byte order setting accordingly. In native mode, the leading BOM 2100 mark is skipped, in all other modes, it is copied to the output 2101 stream as-is (giving a ZWNBSP character). */ 2102 if (bo == 0) { 2103 if (size >= 4) { 2104 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2105 (q[iorder[1]] << 8) | q[iorder[0]]; 2106#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2107 if (bom == 0x0000FEFF) { 2108 q += 4; 2109 bo = -1; 2110 } 2111 else if (bom == 0xFFFE0000) { 2112 q += 4; 2113 bo = 1; 2114 } 2115#else 2116 if (bom == 0x0000FEFF) { 2117 q += 4; 2118 bo = 1; 2119 } 2120 else if (bom == 0xFFFE0000) { 2121 q += 4; 2122 bo = -1; 2123 } 2124#endif 2125 } 2126 } 2127 2128 if (bo == -1) { 2129 /* force LE */ 2130 iorder[0] = 0; 2131 iorder[1] = 1; 2132 iorder[2] = 2; 2133 iorder[3] = 3; 2134 } 2135 else if (bo == 1) { 2136 /* force BE */ 2137 iorder[0] = 3; 2138 iorder[1] = 2; 2139 iorder[2] = 1; 2140 iorder[3] = 0; 2141 } 2142 2143 while (q < e) { 2144 Py_UCS4 ch; 2145 /* remaining bytes at the end? (size should be divisible by 4) */ 2146 if (e-q<4) { 2147 if (consumed) 2148 break; 2149 errmsg = "truncated data"; 2150 startinpos = ((const char *)q)-starts; 2151 endinpos = ((const char *)e)-starts; 2152 goto utf32Error; 2153 /* The remaining input chars are ignored if the callback 2154 chooses to skip the input */ 2155 } 2156 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2157 (q[iorder[1]] << 8) | q[iorder[0]]; 2158 2159 if (ch >= 0x110000) 2160 { 2161 errmsg = "codepoint not in range(0x110000)"; 2162 startinpos = ((const char *)q)-starts; 2163 endinpos = startinpos+4; 2164 goto utf32Error; 2165 } 2166#ifndef Py_UNICODE_WIDE 2167 if (ch >= 0x10000) 2168 { 2169 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2170 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2171 } 2172 else 2173#endif 2174 *p++ = ch; 2175 q += 4; 2176 continue; 2177 utf32Error: 2178 outpos = p-PyUnicode_AS_UNICODE(unicode); 2179 if (unicode_decode_call_errorhandler( 2180 errors, &errorHandler, 2181 "utf32", errmsg, 2182 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2183 (PyObject **)&unicode, &outpos, &p)) 2184 goto onError; 2185 } 2186 2187 if (byteorder) 2188 *byteorder = bo; 2189 2190 if (consumed) 2191 *consumed = (const char *)q-starts; 2192 2193 /* Adjust length */ 2194 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2195 goto onError; 2196 2197 Py_XDECREF(errorHandler); 2198 Py_XDECREF(exc); 2199 return (PyObject *)unicode; 2200 2201onError: 2202 Py_DECREF(unicode); 2203 Py_XDECREF(errorHandler); 2204 Py_XDECREF(exc); 2205 return NULL; 2206} 2207 2208PyObject * 2209PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2210 Py_ssize_t size, 2211 const char *errors, 2212 int byteorder) 2213{ 2214 PyObject *v; 2215 unsigned char *p; 2216#ifndef Py_UNICODE_WIDE 2217 int i, pairs; 2218#else 2219 const int pairs = 0; 2220#endif 2221 /* Offsets from p for storing byte pairs in the right order. */ 2222#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2223 int iorder[] = {0, 1, 2, 3}; 2224#else 2225 int iorder[] = {3, 2, 1, 0}; 2226#endif 2227 2228#define STORECHAR(CH) \ 2229 do { \ 2230 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2231 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2232 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2233 p[iorder[0]] = (CH) & 0xff; \ 2234 p += 4; \ 2235 } while(0) 2236 2237 /* In narrow builds we can output surrogate pairs as one codepoint, 2238 so we need less space. */ 2239#ifndef Py_UNICODE_WIDE 2240 for (i = pairs = 0; i < size-1; i++) 2241 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2242 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2243 pairs++; 2244#endif 2245 v = PyBytes_FromStringAndSize(NULL, 2246 4 * (size - pairs + (byteorder == 0))); 2247 if (v == NULL) 2248 return NULL; 2249 2250 p = (unsigned char *)PyBytes_AS_STRING(v); 2251 if (byteorder == 0) 2252 STORECHAR(0xFEFF); 2253 if (size == 0) 2254 return v; 2255 2256 if (byteorder == -1) { 2257 /* force LE */ 2258 iorder[0] = 0; 2259 iorder[1] = 1; 2260 iorder[2] = 2; 2261 iorder[3] = 3; 2262 } 2263 else if (byteorder == 1) { 2264 /* force BE */ 2265 iorder[0] = 3; 2266 iorder[1] = 2; 2267 iorder[2] = 1; 2268 iorder[3] = 0; 2269 } 2270 2271 while (size-- > 0) { 2272 Py_UCS4 ch = *s++; 2273#ifndef Py_UNICODE_WIDE 2274 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2275 Py_UCS4 ch2 = *s; 2276 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2277 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2278 s++; 2279 size--; 2280 } 2281 } 2282#endif 2283 STORECHAR(ch); 2284 } 2285 return v; 2286#undef STORECHAR 2287} 2288 2289PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2290{ 2291 if (!PyUnicode_Check(unicode)) { 2292 PyErr_BadArgument(); 2293 return NULL; 2294 } 2295 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2296 PyUnicode_GET_SIZE(unicode), 2297 NULL, 2298 0); 2299} 2300 2301/* --- UTF-16 Codec ------------------------------------------------------- */ 2302 2303PyObject * 2304PyUnicode_DecodeUTF16(const char *s, 2305 Py_ssize_t size, 2306 const char *errors, 2307 int *byteorder) 2308{ 2309 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2310} 2311 2312PyObject * 2313PyUnicode_DecodeUTF16Stateful(const char *s, 2314 Py_ssize_t size, 2315 const char *errors, 2316 int *byteorder, 2317 Py_ssize_t *consumed) 2318{ 2319 const char *starts = s; 2320 Py_ssize_t startinpos; 2321 Py_ssize_t endinpos; 2322 Py_ssize_t outpos; 2323 PyUnicodeObject *unicode; 2324 Py_UNICODE *p; 2325 const unsigned char *q, *e; 2326 int bo = 0; /* assume native ordering by default */ 2327 const char *errmsg = ""; 2328 /* Offsets from q for retrieving byte pairs in the right order. */ 2329#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2330 int ihi = 1, ilo = 0; 2331#else 2332 int ihi = 0, ilo = 1; 2333#endif 2334 PyObject *errorHandler = NULL; 2335 PyObject *exc = NULL; 2336 2337 /* Note: size will always be longer than the resulting Unicode 2338 character count */ 2339 unicode = _PyUnicode_New(size); 2340 if (!unicode) 2341 return NULL; 2342 if (size == 0) 2343 return (PyObject *)unicode; 2344 2345 /* Unpack UTF-16 encoded data */ 2346 p = unicode->str; 2347 q = (unsigned char *)s; 2348 e = q + size; 2349 2350 if (byteorder) 2351 bo = *byteorder; 2352 2353 /* Check for BOM marks (U+FEFF) in the input and adjust current 2354 byte order setting accordingly. In native mode, the leading BOM 2355 mark is skipped, in all other modes, it is copied to the output 2356 stream as-is (giving a ZWNBSP character). */ 2357 if (bo == 0) { 2358 if (size >= 2) { 2359 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2360#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2361 if (bom == 0xFEFF) { 2362 q += 2; 2363 bo = -1; 2364 } 2365 else if (bom == 0xFFFE) { 2366 q += 2; 2367 bo = 1; 2368 } 2369#else 2370 if (bom == 0xFEFF) { 2371 q += 2; 2372 bo = 1; 2373 } 2374 else if (bom == 0xFFFE) { 2375 q += 2; 2376 bo = -1; 2377 } 2378#endif 2379 } 2380 } 2381 2382 if (bo == -1) { 2383 /* force LE */ 2384 ihi = 1; 2385 ilo = 0; 2386 } 2387 else if (bo == 1) { 2388 /* force BE */ 2389 ihi = 0; 2390 ilo = 1; 2391 } 2392 2393 while (q < e) { 2394 Py_UNICODE ch; 2395 /* remaining bytes at the end? (size should be even) */ 2396 if (e-q<2) { 2397 if (consumed) 2398 break; 2399 errmsg = "truncated data"; 2400 startinpos = ((const char *)q)-starts; 2401 endinpos = ((const char *)e)-starts; 2402 goto utf16Error; 2403 /* The remaining input chars are ignored if the callback 2404 chooses to skip the input */ 2405 } 2406 ch = (q[ihi] << 8) | q[ilo]; 2407 2408 q += 2; 2409 2410 if (ch < 0xD800 || ch > 0xDFFF) { 2411 *p++ = ch; 2412 continue; 2413 } 2414 2415 /* UTF-16 code pair: */ 2416 if (q >= e) { 2417 errmsg = "unexpected end of data"; 2418 startinpos = (((const char *)q)-2)-starts; 2419 endinpos = ((const char *)e)-starts; 2420 goto utf16Error; 2421 } 2422 if (0xD800 <= ch && ch <= 0xDBFF) { 2423 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2424 q += 2; 2425 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2426#ifndef Py_UNICODE_WIDE 2427 *p++ = ch; 2428 *p++ = ch2; 2429#else 2430 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2431#endif 2432 continue; 2433 } 2434 else { 2435 errmsg = "illegal UTF-16 surrogate"; 2436 startinpos = (((const char *)q)-4)-starts; 2437 endinpos = startinpos+2; 2438 goto utf16Error; 2439 } 2440 2441 } 2442 errmsg = "illegal encoding"; 2443 startinpos = (((const char *)q)-2)-starts; 2444 endinpos = startinpos+2; 2445 /* Fall through to report the error */ 2446 2447 utf16Error: 2448 outpos = p-PyUnicode_AS_UNICODE(unicode); 2449 if (unicode_decode_call_errorhandler( 2450 errors, &errorHandler, 2451 "utf16", errmsg, 2452 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2453 (PyObject **)&unicode, &outpos, &p)) 2454 goto onError; 2455 } 2456 2457 if (byteorder) 2458 *byteorder = bo; 2459 2460 if (consumed) 2461 *consumed = (const char *)q-starts; 2462 2463 /* Adjust length */ 2464 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2465 goto onError; 2466 2467 Py_XDECREF(errorHandler); 2468 Py_XDECREF(exc); 2469 return (PyObject *)unicode; 2470 2471onError: 2472 Py_DECREF(unicode); 2473 Py_XDECREF(errorHandler); 2474 Py_XDECREF(exc); 2475 return NULL; 2476} 2477 2478PyObject * 2479PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2480 Py_ssize_t size, 2481 const char *errors, 2482 int byteorder) 2483{ 2484 PyObject *v; 2485 unsigned char *p; 2486#ifdef Py_UNICODE_WIDE 2487 int i, pairs; 2488#else 2489 const int pairs = 0; 2490#endif 2491 /* Offsets from p for storing byte pairs in the right order. */ 2492#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2493 int ihi = 1, ilo = 0; 2494#else 2495 int ihi = 0, ilo = 1; 2496#endif 2497 2498#define STORECHAR(CH) \ 2499 do { \ 2500 p[ihi] = ((CH) >> 8) & 0xff; \ 2501 p[ilo] = (CH) & 0xff; \ 2502 p += 2; \ 2503 } while(0) 2504 2505#ifdef Py_UNICODE_WIDE 2506 for (i = pairs = 0; i < size; i++) 2507 if (s[i] >= 0x10000) 2508 pairs++; 2509#endif 2510 v = PyBytes_FromStringAndSize(NULL, 2511 2 * (size + pairs + (byteorder == 0))); 2512 if (v == NULL) 2513 return NULL; 2514 2515 p = (unsigned char *)PyBytes_AS_STRING(v); 2516 if (byteorder == 0) 2517 STORECHAR(0xFEFF); 2518 if (size == 0) 2519 return v; 2520 2521 if (byteorder == -1) { 2522 /* force LE */ 2523 ihi = 1; 2524 ilo = 0; 2525 } 2526 else if (byteorder == 1) { 2527 /* force BE */ 2528 ihi = 0; 2529 ilo = 1; 2530 } 2531 2532 while (size-- > 0) { 2533 Py_UNICODE ch = *s++; 2534 Py_UNICODE ch2 = 0; 2535#ifdef Py_UNICODE_WIDE 2536 if (ch >= 0x10000) { 2537 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2538 ch = 0xD800 | ((ch-0x10000) >> 10); 2539 } 2540#endif 2541 STORECHAR(ch); 2542 if (ch2) 2543 STORECHAR(ch2); 2544 } 2545 return v; 2546#undef STORECHAR 2547} 2548 2549PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2550{ 2551 if (!PyUnicode_Check(unicode)) { 2552 PyErr_BadArgument(); 2553 return NULL; 2554 } 2555 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2556 PyUnicode_GET_SIZE(unicode), 2557 NULL, 2558 0); 2559} 2560 2561/* --- Unicode Escape Codec ----------------------------------------------- */ 2562 2563static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2564 2565PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2566 Py_ssize_t size, 2567 const char *errors) 2568{ 2569 const char *starts = s; 2570 Py_ssize_t startinpos; 2571 Py_ssize_t endinpos; 2572 Py_ssize_t outpos; 2573 int i; 2574 PyUnicodeObject *v; 2575 Py_UNICODE *p; 2576 const char *end; 2577 char* message; 2578 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2579 PyObject *errorHandler = NULL; 2580 PyObject *exc = NULL; 2581 2582 /* Escaped strings will always be longer than the resulting 2583 Unicode string, so we start with size here and then reduce the 2584 length after conversion to the true value. 2585 (but if the error callback returns a long replacement string 2586 we'll have to allocate more space) */ 2587 v = _PyUnicode_New(size); 2588 if (v == NULL) 2589 goto onError; 2590 if (size == 0) 2591 return (PyObject *)v; 2592 2593 p = PyUnicode_AS_UNICODE(v); 2594 end = s + size; 2595 2596 while (s < end) { 2597 unsigned char c; 2598 Py_UNICODE x; 2599 int digits; 2600 2601 /* Non-escape characters are interpreted as Unicode ordinals */ 2602 if (*s != '\\') { 2603 *p++ = (unsigned char) *s++; 2604 continue; 2605 } 2606 2607 startinpos = s-starts; 2608 /* \ - Escapes */ 2609 s++; 2610 switch (*s++) { 2611 2612 /* \x escapes */ 2613 case '\n': break; 2614 case '\\': *p++ = '\\'; break; 2615 case '\'': *p++ = '\''; break; 2616 case '\"': *p++ = '\"'; break; 2617 case 'b': *p++ = '\b'; break; 2618 case 'f': *p++ = '\014'; break; /* FF */ 2619 case 't': *p++ = '\t'; break; 2620 case 'n': *p++ = '\n'; break; 2621 case 'r': *p++ = '\r'; break; 2622 case 'v': *p++ = '\013'; break; /* VT */ 2623 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2624 2625 /* \OOO (octal) escapes */ 2626 case '0': case '1': case '2': case '3': 2627 case '4': case '5': case '6': case '7': 2628 x = s[-1] - '0'; 2629 if ('0' <= *s && *s <= '7') { 2630 x = (x<<3) + *s++ - '0'; 2631 if ('0' <= *s && *s <= '7') 2632 x = (x<<3) + *s++ - '0'; 2633 } 2634 *p++ = x; 2635 break; 2636 2637 /* hex escapes */ 2638 /* \xXX */ 2639 case 'x': 2640 digits = 2; 2641 message = "truncated \\xXX escape"; 2642 goto hexescape; 2643 2644 /* \uXXXX */ 2645 case 'u': 2646 digits = 4; 2647 message = "truncated \\uXXXX escape"; 2648 goto hexescape; 2649 2650 /* \UXXXXXXXX */ 2651 case 'U': 2652 digits = 8; 2653 message = "truncated \\UXXXXXXXX escape"; 2654 hexescape: 2655 chr = 0; 2656 outpos = p-PyUnicode_AS_UNICODE(v); 2657 if (s+digits>end) { 2658 endinpos = size; 2659 if (unicode_decode_call_errorhandler( 2660 errors, &errorHandler, 2661 "unicodeescape", "end of string in escape sequence", 2662 &starts, &end, &startinpos, &endinpos, &exc, &s, 2663 (PyObject **)&v, &outpos, &p)) 2664 goto onError; 2665 goto nextByte; 2666 } 2667 for (i = 0; i < digits; ++i) { 2668 c = (unsigned char) s[i]; 2669 if (!isxdigit(c)) { 2670 endinpos = (s+i+1)-starts; 2671 if (unicode_decode_call_errorhandler( 2672 errors, &errorHandler, 2673 "unicodeescape", message, 2674 &starts, &end, &startinpos, &endinpos, &exc, &s, 2675 (PyObject **)&v, &outpos, &p)) 2676 goto onError; 2677 goto nextByte; 2678 } 2679 chr = (chr<<4) & ~0xF; 2680 if (c >= '0' && c <= '9') 2681 chr += c - '0'; 2682 else if (c >= 'a' && c <= 'f') 2683 chr += 10 + c - 'a'; 2684 else 2685 chr += 10 + c - 'A'; 2686 } 2687 s += i; 2688 if (chr == 0xffffffff && PyErr_Occurred()) 2689 /* _decoding_error will have already written into the 2690 target buffer. */ 2691 break; 2692 store: 2693 /* when we get here, chr is a 32-bit unicode character */ 2694 if (chr <= 0xffff) 2695 /* UCS-2 character */ 2696 *p++ = (Py_UNICODE) chr; 2697 else if (chr <= 0x10ffff) { 2698 /* UCS-4 character. Either store directly, or as 2699 surrogate pair. */ 2700#ifdef Py_UNICODE_WIDE 2701 *p++ = chr; 2702#else 2703 chr -= 0x10000L; 2704 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2705 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2706#endif 2707 } else { 2708 endinpos = s-starts; 2709 outpos = p-PyUnicode_AS_UNICODE(v); 2710 if (unicode_decode_call_errorhandler( 2711 errors, &errorHandler, 2712 "unicodeescape", "illegal Unicode character", 2713 &starts, &end, &startinpos, &endinpos, &exc, &s, 2714 (PyObject **)&v, &outpos, &p)) 2715 goto onError; 2716 } 2717 break; 2718 2719 /* \N{name} */ 2720 case 'N': 2721 message = "malformed \\N character escape"; 2722 if (ucnhash_CAPI == NULL) { 2723 /* load the unicode data module */ 2724 PyObject *m, *api; 2725 m = PyImport_ImportModule("unicodedata"); 2726 if (m == NULL) 2727 goto ucnhashError; 2728 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2729 Py_DECREF(m); 2730 if (api == NULL) 2731 goto ucnhashError; 2732 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2733 Py_DECREF(api); 2734 if (ucnhash_CAPI == NULL) 2735 goto ucnhashError; 2736 } 2737 if (*s == '{') { 2738 const char *start = s+1; 2739 /* look for the closing brace */ 2740 while (*s != '}' && s < end) 2741 s++; 2742 if (s > start && s < end && *s == '}') { 2743 /* found a name. look it up in the unicode database */ 2744 message = "unknown Unicode character name"; 2745 s++; 2746 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2747 goto store; 2748 } 2749 } 2750 endinpos = s-starts; 2751 outpos = p-PyUnicode_AS_UNICODE(v); 2752 if (unicode_decode_call_errorhandler( 2753 errors, &errorHandler, 2754 "unicodeescape", message, 2755 &starts, &end, &startinpos, &endinpos, &exc, &s, 2756 (PyObject **)&v, &outpos, &p)) 2757 goto onError; 2758 break; 2759 2760 default: 2761 if (s > end) { 2762 message = "\\ at end of string"; 2763 s--; 2764 endinpos = s-starts; 2765 outpos = p-PyUnicode_AS_UNICODE(v); 2766 if (unicode_decode_call_errorhandler( 2767 errors, &errorHandler, 2768 "unicodeescape", message, 2769 &starts, &end, &startinpos, &endinpos, &exc, &s, 2770 (PyObject **)&v, &outpos, &p)) 2771 goto onError; 2772 } 2773 else { 2774 *p++ = '\\'; 2775 *p++ = (unsigned char)s[-1]; 2776 } 2777 break; 2778 } 2779 nextByte: 2780 ; 2781 } 2782 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2783 goto onError; 2784 Py_XDECREF(errorHandler); 2785 Py_XDECREF(exc); 2786 return (PyObject *)v; 2787 2788ucnhashError: 2789 PyErr_SetString( 2790 PyExc_UnicodeError, 2791 "\\N escapes not supported (can't load unicodedata module)" 2792 ); 2793 Py_XDECREF(v); 2794 Py_XDECREF(errorHandler); 2795 Py_XDECREF(exc); 2796 return NULL; 2797 2798onError: 2799 Py_XDECREF(v); 2800 Py_XDECREF(errorHandler); 2801 Py_XDECREF(exc); 2802 return NULL; 2803} 2804 2805/* Return a Unicode-Escape string version of the Unicode object. 2806 2807 If quotes is true, the string is enclosed in u"" or u'' quotes as 2808 appropriate. 2809 2810*/ 2811 2812Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2813 Py_ssize_t size, 2814 Py_UNICODE ch) 2815{ 2816 /* like wcschr, but doesn't stop at NULL characters */ 2817 2818 while (size-- > 0) { 2819 if (*s == ch) 2820 return s; 2821 s++; 2822 } 2823 2824 return NULL; 2825} 2826 2827static const char *hexdigits = "0123456789abcdef"; 2828 2829PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2830 Py_ssize_t size) 2831{ 2832 PyObject *repr; 2833 char *p; 2834 2835 /* XXX(nnorwitz): rather than over-allocating, it would be 2836 better to choose a different scheme. Perhaps scan the 2837 first N-chars of the string and allocate based on that size. 2838 */ 2839 /* Initial allocation is based on the longest-possible unichr 2840 escape. 2841 2842 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 2843 unichr, so in this case it's the longest unichr escape. In 2844 narrow (UTF-16) builds this is five chars per source unichr 2845 since there are two unichrs in the surrogate pair, so in narrow 2846 (UTF-16) builds it's not the longest unichr escape. 2847 2848 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 2849 so in the narrow (UTF-16) build case it's the longest unichr 2850 escape. 2851 */ 2852 2853 repr = PyBytes_FromStringAndSize(NULL, 2854#ifdef Py_UNICODE_WIDE 2855 + 10*size 2856#else 2857 + 6*size 2858#endif 2859 + 1); 2860 if (repr == NULL) 2861 return NULL; 2862 2863 p = PyBytes_AS_STRING(repr); 2864 2865 while (size-- > 0) { 2866 Py_UNICODE ch = *s++; 2867 2868 /* Escape backslashes */ 2869 if (ch == '\\') { 2870 *p++ = '\\'; 2871 *p++ = (char) ch; 2872 continue; 2873 } 2874 2875#ifdef Py_UNICODE_WIDE 2876 /* Map 21-bit characters to '\U00xxxxxx' */ 2877 else if (ch >= 0x10000) { 2878 *p++ = '\\'; 2879 *p++ = 'U'; 2880 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 2881 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 2882 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 2883 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 2884 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 2885 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 2886 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 2887 *p++ = hexdigits[ch & 0x0000000F]; 2888 continue; 2889 } 2890#else 2891 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 2892 else if (ch >= 0xD800 && ch < 0xDC00) { 2893 Py_UNICODE ch2; 2894 Py_UCS4 ucs; 2895 2896 ch2 = *s++; 2897 size--; 2898 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 2899 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 2900 *p++ = '\\'; 2901 *p++ = 'U'; 2902 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 2903 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 2904 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 2905 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 2906 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 2907 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 2908 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 2909 *p++ = hexdigits[ucs & 0x0000000F]; 2910 continue; 2911 } 2912 /* Fall through: isolated surrogates are copied as-is */ 2913 s--; 2914 size++; 2915 } 2916#endif 2917 2918 /* Map 16-bit characters to '\uxxxx' */ 2919 if (ch >= 256) { 2920 *p++ = '\\'; 2921 *p++ = 'u'; 2922 *p++ = hexdigits[(ch >> 12) & 0x000F]; 2923 *p++ = hexdigits[(ch >> 8) & 0x000F]; 2924 *p++ = hexdigits[(ch >> 4) & 0x000F]; 2925 *p++ = hexdigits[ch & 0x000F]; 2926 } 2927 2928 /* Map special whitespace to '\t', \n', '\r' */ 2929 else if (ch == '\t') { 2930 *p++ = '\\'; 2931 *p++ = 't'; 2932 } 2933 else if (ch == '\n') { 2934 *p++ = '\\'; 2935 *p++ = 'n'; 2936 } 2937 else if (ch == '\r') { 2938 *p++ = '\\'; 2939 *p++ = 'r'; 2940 } 2941 2942 /* Map non-printable US ASCII to '\xhh' */ 2943 else if (ch < ' ' || ch >= 0x7F) { 2944 *p++ = '\\'; 2945 *p++ = 'x'; 2946 *p++ = hexdigits[(ch >> 4) & 0x000F]; 2947 *p++ = hexdigits[ch & 0x000F]; 2948 } 2949 2950 /* Copy everything else as-is */ 2951 else 2952 *p++ = (char) ch; 2953 } 2954 2955 *p = '\0'; 2956 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) { 2957 Py_DECREF(repr); 2958 return NULL; 2959 } 2960 return repr; 2961} 2962 2963PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 2964{ 2965 PyObject *s, *result; 2966 if (!PyUnicode_Check(unicode)) { 2967 PyErr_BadArgument(); 2968 return NULL; 2969 } 2970 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2971 PyUnicode_GET_SIZE(unicode)); 2972 2973 if (!s) 2974 return NULL; 2975 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s), 2976 PyBytes_GET_SIZE(s)); 2977 Py_DECREF(s); 2978 return result; 2979} 2980 2981/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2982 2983PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2984 Py_ssize_t size, 2985 const char *errors) 2986{ 2987 const char *starts = s; 2988 Py_ssize_t startinpos; 2989 Py_ssize_t endinpos; 2990 Py_ssize_t outpos; 2991 PyUnicodeObject *v; 2992 Py_UNICODE *p; 2993 const char *end; 2994 const char *bs; 2995 PyObject *errorHandler = NULL; 2996 PyObject *exc = NULL; 2997 2998 /* Escaped strings will always be longer than the resulting 2999 Unicode string, so we start with size here and then reduce the 3000 length after conversion to the true value. (But decoding error 3001 handler might have to resize the string) */ 3002 v = _PyUnicode_New(size); 3003 if (v == NULL) 3004 goto onError; 3005 if (size == 0) 3006 return (PyObject *)v; 3007 p = PyUnicode_AS_UNICODE(v); 3008 end = s + size; 3009 while (s < end) { 3010 unsigned char c; 3011 Py_UCS4 x; 3012 int i; 3013 int count; 3014 3015 /* Non-escape characters are interpreted as Unicode ordinals */ 3016 if (*s != '\\') { 3017 *p++ = (unsigned char)*s++; 3018 continue; 3019 } 3020 startinpos = s-starts; 3021 3022 /* \u-escapes are only interpreted iff the number of leading 3023 backslashes if odd */ 3024 bs = s; 3025 for (;s < end;) { 3026 if (*s != '\\') 3027 break; 3028 *p++ = (unsigned char)*s++; 3029 } 3030 if (((s - bs) & 1) == 0 || 3031 s >= end || 3032 (*s != 'u' && *s != 'U')) { 3033 continue; 3034 } 3035 p--; 3036 count = *s=='u' ? 4 : 8; 3037 s++; 3038 3039 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3040 outpos = p-PyUnicode_AS_UNICODE(v); 3041 for (x = 0, i = 0; i < count; ++i, ++s) { 3042 c = (unsigned char)*s; 3043 if (!isxdigit(c)) { 3044 endinpos = s-starts; 3045 if (unicode_decode_call_errorhandler( 3046 errors, &errorHandler, 3047 "rawunicodeescape", "truncated \\uXXXX", 3048 &starts, &end, &startinpos, &endinpos, &exc, &s, 3049 (PyObject **)&v, &outpos, &p)) 3050 goto onError; 3051 goto nextByte; 3052 } 3053 x = (x<<4) & ~0xF; 3054 if (c >= '0' && c <= '9') 3055 x += c - '0'; 3056 else if (c >= 'a' && c <= 'f') 3057 x += 10 + c - 'a'; 3058 else 3059 x += 10 + c - 'A'; 3060 } 3061#ifndef Py_UNICODE_WIDE 3062 if (x > 0x10000) { 3063 if (unicode_decode_call_errorhandler( 3064 errors, &errorHandler, 3065 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3066 &starts, &end, &startinpos, &endinpos, &exc, &s, 3067 (PyObject **)&v, &outpos, &p)) 3068 goto onError; 3069 } 3070#endif 3071 *p++ = x; 3072 nextByte: 3073 ; 3074 } 3075 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3076 goto onError; 3077 Py_XDECREF(errorHandler); 3078 Py_XDECREF(exc); 3079 return (PyObject *)v; 3080 3081 onError: 3082 Py_XDECREF(v); 3083 Py_XDECREF(errorHandler); 3084 Py_XDECREF(exc); 3085 return NULL; 3086} 3087 3088PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3089 Py_ssize_t size) 3090{ 3091 PyObject *repr; 3092 char *p; 3093 char *q; 3094 3095#ifdef Py_UNICODE_WIDE 3096 repr = PyBytes_FromStringAndSize(NULL, 10 * size); 3097#else 3098 repr = PyBytes_FromStringAndSize(NULL, 6 * size); 3099#endif 3100 if (repr == NULL) 3101 return NULL; 3102 if (size == 0) 3103 return repr; 3104 3105 p = q = PyBytes_AS_STRING(repr); 3106 while (size-- > 0) { 3107 Py_UNICODE ch = *s++; 3108#ifdef Py_UNICODE_WIDE 3109 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3110 if (ch >= 0x10000) { 3111 *p++ = '\\'; 3112 *p++ = 'U'; 3113 *p++ = hexdigits[(ch >> 28) & 0xf]; 3114 *p++ = hexdigits[(ch >> 24) & 0xf]; 3115 *p++ = hexdigits[(ch >> 20) & 0xf]; 3116 *p++ = hexdigits[(ch >> 16) & 0xf]; 3117 *p++ = hexdigits[(ch >> 12) & 0xf]; 3118 *p++ = hexdigits[(ch >> 8) & 0xf]; 3119 *p++ = hexdigits[(ch >> 4) & 0xf]; 3120 *p++ = hexdigits[ch & 15]; 3121 } 3122 else 3123#endif 3124 /* Map 16-bit characters to '\uxxxx' */ 3125 if (ch >= 256) { 3126 *p++ = '\\'; 3127 *p++ = 'u'; 3128 *p++ = hexdigits[(ch >> 12) & 0xf]; 3129 *p++ = hexdigits[(ch >> 8) & 0xf]; 3130 *p++ = hexdigits[(ch >> 4) & 0xf]; 3131 *p++ = hexdigits[ch & 15]; 3132 } 3133 /* Copy everything else as-is */ 3134 else 3135 *p++ = (char) ch; 3136 } 3137 *p = '\0'; 3138 if (PyBytes_Resize(repr, p - q)) { 3139 Py_DECREF(repr); 3140 return NULL; 3141 } 3142 return repr; 3143} 3144 3145PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3146{ 3147 PyObject *s, *result; 3148 if (!PyUnicode_Check(unicode)) { 3149 PyErr_BadArgument(); 3150 return NULL; 3151 } 3152 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3153 PyUnicode_GET_SIZE(unicode)); 3154 3155 if (!s) 3156 return NULL; 3157 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s), 3158 PyBytes_GET_SIZE(s)); 3159 Py_DECREF(s); 3160 return result; 3161} 3162 3163/* --- Unicode Internal Codec ------------------------------------------- */ 3164 3165PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3166 Py_ssize_t size, 3167 const char *errors) 3168{ 3169 const char *starts = s; 3170 Py_ssize_t startinpos; 3171 Py_ssize_t endinpos; 3172 Py_ssize_t outpos; 3173 PyUnicodeObject *v; 3174 Py_UNICODE *p; 3175 const char *end; 3176 const char *reason; 3177 PyObject *errorHandler = NULL; 3178 PyObject *exc = NULL; 3179 3180#ifdef Py_UNICODE_WIDE 3181 Py_UNICODE unimax = PyUnicode_GetMax(); 3182#endif 3183 3184 /* XXX overflow detection missing */ 3185 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3186 if (v == NULL) 3187 goto onError; 3188 if (PyUnicode_GetSize((PyObject *)v) == 0) 3189 return (PyObject *)v; 3190 p = PyUnicode_AS_UNICODE(v); 3191 end = s + size; 3192 3193 while (s < end) { 3194 memcpy(p, s, sizeof(Py_UNICODE)); 3195 /* We have to sanity check the raw data, otherwise doom looms for 3196 some malformed UCS-4 data. */ 3197 if ( 3198 #ifdef Py_UNICODE_WIDE 3199 *p > unimax || *p < 0 || 3200 #endif 3201 end-s < Py_UNICODE_SIZE 3202 ) 3203 { 3204 startinpos = s - starts; 3205 if (end-s < Py_UNICODE_SIZE) { 3206 endinpos = end-starts; 3207 reason = "truncated input"; 3208 } 3209 else { 3210 endinpos = s - starts + Py_UNICODE_SIZE; 3211 reason = "illegal code point (> 0x10FFFF)"; 3212 } 3213 outpos = p - PyUnicode_AS_UNICODE(v); 3214 if (unicode_decode_call_errorhandler( 3215 errors, &errorHandler, 3216 "unicode_internal", reason, 3217 &starts, &end, &startinpos, &endinpos, &exc, &s, 3218 (PyObject **)&v, &outpos, &p)) { 3219 goto onError; 3220 } 3221 } 3222 else { 3223 p++; 3224 s += Py_UNICODE_SIZE; 3225 } 3226 } 3227 3228 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3229 goto onError; 3230 Py_XDECREF(errorHandler); 3231 Py_XDECREF(exc); 3232 return (PyObject *)v; 3233 3234 onError: 3235 Py_XDECREF(v); 3236 Py_XDECREF(errorHandler); 3237 Py_XDECREF(exc); 3238 return NULL; 3239} 3240 3241/* --- Latin-1 Codec ------------------------------------------------------ */ 3242 3243PyObject *PyUnicode_DecodeLatin1(const char *s, 3244 Py_ssize_t size, 3245 const char *errors) 3246{ 3247 PyUnicodeObject *v; 3248 Py_UNICODE *p; 3249 3250 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 3251 if (size == 1) { 3252 Py_UNICODE r = *(unsigned char*)s; 3253 return PyUnicode_FromUnicode(&r, 1); 3254 } 3255 3256 v = _PyUnicode_New(size); 3257 if (v == NULL) 3258 goto onError; 3259 if (size == 0) 3260 return (PyObject *)v; 3261 p = PyUnicode_AS_UNICODE(v); 3262 while (size-- > 0) 3263 *p++ = (unsigned char)*s++; 3264 return (PyObject *)v; 3265 3266 onError: 3267 Py_XDECREF(v); 3268 return NULL; 3269} 3270 3271/* create or adjust a UnicodeEncodeError */ 3272static void make_encode_exception(PyObject **exceptionObject, 3273 const char *encoding, 3274 const Py_UNICODE *unicode, Py_ssize_t size, 3275 Py_ssize_t startpos, Py_ssize_t endpos, 3276 const char *reason) 3277{ 3278 if (*exceptionObject == NULL) { 3279 *exceptionObject = PyUnicodeEncodeError_Create( 3280 encoding, unicode, size, startpos, endpos, reason); 3281 } 3282 else { 3283 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3284 goto onError; 3285 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3286 goto onError; 3287 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3288 goto onError; 3289 return; 3290 onError: 3291 Py_DECREF(*exceptionObject); 3292 *exceptionObject = NULL; 3293 } 3294} 3295 3296/* raises a UnicodeEncodeError */ 3297static void raise_encode_exception(PyObject **exceptionObject, 3298 const char *encoding, 3299 const Py_UNICODE *unicode, Py_ssize_t size, 3300 Py_ssize_t startpos, Py_ssize_t endpos, 3301 const char *reason) 3302{ 3303 make_encode_exception(exceptionObject, 3304 encoding, unicode, size, startpos, endpos, reason); 3305 if (*exceptionObject != NULL) 3306 PyCodec_StrictErrors(*exceptionObject); 3307} 3308 3309/* error handling callback helper: 3310 build arguments, call the callback and check the arguments, 3311 put the result into newpos and return the replacement string, which 3312 has to be freed by the caller */ 3313static PyObject *unicode_encode_call_errorhandler(const char *errors, 3314 PyObject **errorHandler, 3315 const char *encoding, const char *reason, 3316 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3317 Py_ssize_t startpos, Py_ssize_t endpos, 3318 Py_ssize_t *newpos) 3319{ 3320 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3321 3322 PyObject *restuple; 3323 PyObject *resunicode; 3324 3325 if (*errorHandler == NULL) { 3326 *errorHandler = PyCodec_LookupError(errors); 3327 if (*errorHandler == NULL) 3328 return NULL; 3329 } 3330 3331 make_encode_exception(exceptionObject, 3332 encoding, unicode, size, startpos, endpos, reason); 3333 if (*exceptionObject == NULL) 3334 return NULL; 3335 3336 restuple = PyObject_CallFunctionObjArgs( 3337 *errorHandler, *exceptionObject, NULL); 3338 if (restuple == NULL) 3339 return NULL; 3340 if (!PyTuple_Check(restuple)) { 3341 PyErr_Format(PyExc_TypeError, &argparse[4]); 3342 Py_DECREF(restuple); 3343 return NULL; 3344 } 3345 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3346 &resunicode, newpos)) { 3347 Py_DECREF(restuple); 3348 return NULL; 3349 } 3350 if (*newpos<0) 3351 *newpos = size+*newpos; 3352 if (*newpos<0 || *newpos>size) { 3353 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3354 Py_DECREF(restuple); 3355 return NULL; 3356 } 3357 Py_INCREF(resunicode); 3358 Py_DECREF(restuple); 3359 return resunicode; 3360} 3361 3362static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3363 Py_ssize_t size, 3364 const char *errors, 3365 int limit) 3366{ 3367 /* output object */ 3368 PyObject *res; 3369 /* pointers to the beginning and end+1 of input */ 3370 const Py_UNICODE *startp = p; 3371 const Py_UNICODE *endp = p + size; 3372 /* pointer to the beginning of the unencodable characters */ 3373 /* const Py_UNICODE *badp = NULL; */ 3374 /* pointer into the output */ 3375 char *str; 3376 /* current output position */ 3377 Py_ssize_t respos = 0; 3378 Py_ssize_t ressize; 3379 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 3380 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3381 PyObject *errorHandler = NULL; 3382 PyObject *exc = NULL; 3383 /* the following variable is used for caching string comparisons 3384 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3385 int known_errorHandler = -1; 3386 3387 /* allocate enough for a simple encoding without 3388 replacements, if we need more, we'll resize */ 3389 res = PyBytes_FromStringAndSize(NULL, size); 3390 if (res == NULL) 3391 goto onError; 3392 if (size == 0) 3393 return res; 3394 str = PyBytes_AS_STRING(res); 3395 ressize = size; 3396 3397 while (p<endp) { 3398 Py_UNICODE c = *p; 3399 3400 /* can we encode this? */ 3401 if (c<limit) { 3402 /* no overflow check, because we know that the space is enough */ 3403 *str++ = (char)c; 3404 ++p; 3405 } 3406 else { 3407 Py_ssize_t unicodepos = p-startp; 3408 Py_ssize_t requiredsize; 3409 PyObject *repunicode; 3410 Py_ssize_t repsize; 3411 Py_ssize_t newpos; 3412 Py_ssize_t respos; 3413 Py_UNICODE *uni2; 3414 /* startpos for collecting unencodable chars */ 3415 const Py_UNICODE *collstart = p; 3416 const Py_UNICODE *collend = p; 3417 /* find all unecodable characters */ 3418 while ((collend < endp) && ((*collend)>=limit)) 3419 ++collend; 3420 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 3421 if (known_errorHandler==-1) { 3422 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3423 known_errorHandler = 1; 3424 else if (!strcmp(errors, "replace")) 3425 known_errorHandler = 2; 3426 else if (!strcmp(errors, "ignore")) 3427 known_errorHandler = 3; 3428 else if (!strcmp(errors, "xmlcharrefreplace")) 3429 known_errorHandler = 4; 3430 else 3431 known_errorHandler = 0; 3432 } 3433 switch (known_errorHandler) { 3434 case 1: /* strict */ 3435 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3436 goto onError; 3437 case 2: /* replace */ 3438 while (collstart++<collend) 3439 *str++ = '?'; /* fall through */ 3440 case 3: /* ignore */ 3441 p = collend; 3442 break; 3443 case 4: /* xmlcharrefreplace */ 3444 respos = str - PyBytes_AS_STRING(res); 3445 /* determine replacement size (temporarily (mis)uses p) */ 3446 for (p = collstart, repsize = 0; p < collend; ++p) { 3447 if (*p<10) 3448 repsize += 2+1+1; 3449 else if (*p<100) 3450 repsize += 2+2+1; 3451 else if (*p<1000) 3452 repsize += 2+3+1; 3453 else if (*p<10000) 3454 repsize += 2+4+1; 3455#ifndef Py_UNICODE_WIDE 3456 else 3457 repsize += 2+5+1; 3458#else 3459 else if (*p<100000) 3460 repsize += 2+5+1; 3461 else if (*p<1000000) 3462 repsize += 2+6+1; 3463 else 3464 repsize += 2+7+1; 3465#endif 3466 } 3467 requiredsize = respos+repsize+(endp-collend); 3468 if (requiredsize > ressize) { 3469 if (requiredsize<2*ressize) 3470 requiredsize = 2*ressize; 3471 if (PyBytes_Resize(res, requiredsize)) 3472 goto onError; 3473 str = PyBytes_AS_STRING(res) + respos; 3474 ressize = requiredsize; 3475 } 3476 /* generate replacement (temporarily (mis)uses p) */ 3477 for (p = collstart; p < collend; ++p) { 3478 str += sprintf(str, "&#%d;", (int)*p); 3479 } 3480 p = collend; 3481 break; 3482 default: 3483 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3484 encoding, reason, startp, size, &exc, 3485 collstart-startp, collend-startp, &newpos); 3486 if (repunicode == NULL) 3487 goto onError; 3488 /* need more space? (at least enough for what we 3489 have+the replacement+the rest of the string, so 3490 we won't have to check space for encodable characters) */ 3491 respos = str - PyBytes_AS_STRING(res); 3492 repsize = PyUnicode_GET_SIZE(repunicode); 3493 requiredsize = respos+repsize+(endp-collend); 3494 if (requiredsize > ressize) { 3495 if (requiredsize<2*ressize) 3496 requiredsize = 2*ressize; 3497 if (PyBytes_Resize(res, requiredsize)) { 3498 Py_DECREF(repunicode); 3499 goto onError; 3500 } 3501 str = PyBytes_AS_STRING(res) + respos; 3502 ressize = requiredsize; 3503 } 3504 /* check if there is anything unencodable in the replacement 3505 and copy it to the output */ 3506 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3507 c = *uni2; 3508 if (c >= limit) { 3509 raise_encode_exception(&exc, encoding, startp, size, 3510 unicodepos, unicodepos+1, reason); 3511 Py_DECREF(repunicode); 3512 goto onError; 3513 } 3514 *str = (char)c; 3515 } 3516 p = startp + newpos; 3517 Py_DECREF(repunicode); 3518 } 3519 } 3520 } 3521 /* Resize if we allocated to much */ 3522 respos = str - PyBytes_AS_STRING(res); 3523 if (respos<ressize) 3524 /* If this falls res will be NULL */ 3525 PyBytes_Resize(res, respos); 3526 Py_XDECREF(errorHandler); 3527 Py_XDECREF(exc); 3528 return res; 3529 3530 onError: 3531 Py_XDECREF(res); 3532 Py_XDECREF(errorHandler); 3533 Py_XDECREF(exc); 3534 return NULL; 3535} 3536 3537PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3538 Py_ssize_t size, 3539 const char *errors) 3540{ 3541 return unicode_encode_ucs1(p, size, errors, 256); 3542} 3543 3544PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3545{ 3546 if (!PyUnicode_Check(unicode)) { 3547 PyErr_BadArgument(); 3548 return NULL; 3549 } 3550 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3551 PyUnicode_GET_SIZE(unicode), 3552 NULL); 3553} 3554 3555/* --- 7-bit ASCII Codec -------------------------------------------------- */ 3556 3557PyObject *PyUnicode_DecodeASCII(const char *s, 3558 Py_ssize_t size, 3559 const char *errors) 3560{ 3561 const char *starts = s; 3562 PyUnicodeObject *v; 3563 Py_UNICODE *p; 3564 Py_ssize_t startinpos; 3565 Py_ssize_t endinpos; 3566 Py_ssize_t outpos; 3567 const char *e; 3568 PyObject *errorHandler = NULL; 3569 PyObject *exc = NULL; 3570 3571 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3572 if (size == 1 && *(unsigned char*)s < 128) { 3573 Py_UNICODE r = *(unsigned char*)s; 3574 return PyUnicode_FromUnicode(&r, 1); 3575 } 3576 3577 v = _PyUnicode_New(size); 3578 if (v == NULL) 3579 goto onError; 3580 if (size == 0) 3581 return (PyObject *)v; 3582 p = PyUnicode_AS_UNICODE(v); 3583 e = s + size; 3584 while (s < e) { 3585 register unsigned char c = (unsigned char)*s; 3586 if (c < 128) { 3587 *p++ = c; 3588 ++s; 3589 } 3590 else { 3591 startinpos = s-starts; 3592 endinpos = startinpos + 1; 3593 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3594 if (unicode_decode_call_errorhandler( 3595 errors, &errorHandler, 3596 "ascii", "ordinal not in range(128)", 3597 &starts, &e, &startinpos, &endinpos, &exc, &s, 3598 (PyObject **)&v, &outpos, &p)) 3599 goto onError; 3600 } 3601 } 3602 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3603 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3604 goto onError; 3605 Py_XDECREF(errorHandler); 3606 Py_XDECREF(exc); 3607 return (PyObject *)v; 3608 3609 onError: 3610 Py_XDECREF(v); 3611 Py_XDECREF(errorHandler); 3612 Py_XDECREF(exc); 3613 return NULL; 3614} 3615 3616PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3617 Py_ssize_t size, 3618 const char *errors) 3619{ 3620 return unicode_encode_ucs1(p, size, errors, 128); 3621} 3622 3623PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3624{ 3625 if (!PyUnicode_Check(unicode)) { 3626 PyErr_BadArgument(); 3627 return NULL; 3628 } 3629 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3630 PyUnicode_GET_SIZE(unicode), 3631 NULL); 3632} 3633 3634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3635 3636/* --- MBCS codecs for Windows -------------------------------------------- */ 3637 3638#if SIZEOF_INT < SIZEOF_SSIZE_T 3639#define NEED_RETRY 3640#endif 3641 3642/* XXX This code is limited to "true" double-byte encodings, as 3643 a) it assumes an incomplete character consists of a single byte, and 3644 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 3645 encodings, see IsDBCSLeadByteEx documentation. */ 3646 3647static int is_dbcs_lead_byte(const char *s, int offset) 3648{ 3649 const char *curr = s + offset; 3650 3651 if (IsDBCSLeadByte(*curr)) { 3652 const char *prev = CharPrev(s, curr); 3653 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3654 } 3655 return 0; 3656} 3657 3658/* 3659 * Decode MBCS string into unicode object. If 'final' is set, converts 3660 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 3661 */ 3662static int decode_mbcs(PyUnicodeObject **v, 3663 const char *s, /* MBCS string */ 3664 int size, /* sizeof MBCS string */ 3665 int final) 3666{ 3667 Py_UNICODE *p; 3668 Py_ssize_t n = 0; 3669 int usize = 0; 3670 3671 assert(size >= 0); 3672 3673 /* Skip trailing lead-byte unless 'final' is set */ 3674 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3675 --size; 3676 3677 /* First get the size of the result */ 3678 if (size > 0) { 3679 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3680 if (usize == 0) { 3681 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3682 return -1; 3683 } 3684 } 3685 3686 if (*v == NULL) { 3687 /* Create unicode object */ 3688 *v = _PyUnicode_New(usize); 3689 if (*v == NULL) 3690 return -1; 3691 } 3692 else { 3693 /* Extend unicode object */ 3694 n = PyUnicode_GET_SIZE(*v); 3695 if (_PyUnicode_Resize(v, n + usize) < 0) 3696 return -1; 3697 } 3698 3699 /* Do the conversion */ 3700 if (size > 0) { 3701 p = PyUnicode_AS_UNICODE(*v) + n; 3702 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3703 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3704 return -1; 3705 } 3706 } 3707 3708 return size; 3709} 3710 3711PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3712 Py_ssize_t size, 3713 const char *errors, 3714 Py_ssize_t *consumed) 3715{ 3716 PyUnicodeObject *v = NULL; 3717 int done; 3718 3719 if (consumed) 3720 *consumed = 0; 3721 3722#ifdef NEED_RETRY 3723 retry: 3724 if (size > INT_MAX) 3725 done = decode_mbcs(&v, s, INT_MAX, 0); 3726 else 3727#endif 3728 done = decode_mbcs(&v, s, (int)size, !consumed); 3729 3730 if (done < 0) { 3731 Py_XDECREF(v); 3732 return NULL; 3733 } 3734 3735 if (consumed) 3736 *consumed += done; 3737 3738#ifdef NEED_RETRY 3739 if (size > INT_MAX) { 3740 s += done; 3741 size -= done; 3742 goto retry; 3743 } 3744#endif 3745 3746 return (PyObject *)v; 3747} 3748 3749PyObject *PyUnicode_DecodeMBCS(const char *s, 3750 Py_ssize_t size, 3751 const char *errors) 3752{ 3753 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 3754} 3755 3756/* 3757 * Convert unicode into string object (MBCS). 3758 * Returns 0 if succeed, -1 otherwise. 3759 */ 3760static int encode_mbcs(PyObject **repr, 3761 const Py_UNICODE *p, /* unicode */ 3762 int size) /* size of unicode */ 3763{ 3764 int mbcssize = 0; 3765 Py_ssize_t n = 0; 3766 3767 assert(size >= 0); 3768 3769 /* First get the size of the result */ 3770 if (size > 0) { 3771 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 3772 if (mbcssize == 0) { 3773 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3774 return -1; 3775 } 3776 } 3777 3778 if (*repr == NULL) { 3779 /* Create string object */ 3780 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 3781 if (*repr == NULL) 3782 return -1; 3783 } 3784 else { 3785 /* Extend string object */ 3786 n = PyBytes_Size(*repr); 3787 if (PyBytes_Resize(*repr, n + mbcssize) < 0) 3788 return -1; 3789 } 3790 3791 /* Do the conversion */ 3792 if (size > 0) { 3793 char *s = PyBytes_AS_STRING(*repr) + n; 3794 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 3795 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3796 return -1; 3797 } 3798 } 3799 3800 return 0; 3801} 3802 3803PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 3804 Py_ssize_t size, 3805 const char *errors) 3806{ 3807 PyObject *repr = NULL; 3808 int ret; 3809 3810#ifdef NEED_RETRY 3811 retry: 3812 if (size > INT_MAX) 3813 ret = encode_mbcs(&repr, p, INT_MAX); 3814 else 3815#endif 3816 ret = encode_mbcs(&repr, p, (int)size); 3817 3818 if (ret < 0) { 3819 Py_XDECREF(repr); 3820 return NULL; 3821 } 3822 3823#ifdef NEED_RETRY 3824 if (size > INT_MAX) { 3825 p += INT_MAX; 3826 size -= INT_MAX; 3827 goto retry; 3828 } 3829#endif 3830 3831 return repr; 3832} 3833 3834PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 3835{ 3836 if (!PyUnicode_Check(unicode)) { 3837 PyErr_BadArgument(); 3838 return NULL; 3839 } 3840 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 3841 PyUnicode_GET_SIZE(unicode), 3842 NULL); 3843} 3844 3845#undef NEED_RETRY 3846 3847#endif /* MS_WINDOWS */ 3848 3849/* --- Character Mapping Codec -------------------------------------------- */ 3850 3851PyObject *PyUnicode_DecodeCharmap(const char *s, 3852 Py_ssize_t size, 3853 PyObject *mapping, 3854 const char *errors) 3855{ 3856 const char *starts = s; 3857 Py_ssize_t startinpos; 3858 Py_ssize_t endinpos; 3859 Py_ssize_t outpos; 3860 const char *e; 3861 PyUnicodeObject *v; 3862 Py_UNICODE *p; 3863 Py_ssize_t extrachars = 0; 3864 PyObject *errorHandler = NULL; 3865 PyObject *exc = NULL; 3866 Py_UNICODE *mapstring = NULL; 3867 Py_ssize_t maplen = 0; 3868 3869 /* Default to Latin-1 */ 3870 if (mapping == NULL) 3871 return PyUnicode_DecodeLatin1(s, size, errors); 3872 3873 v = _PyUnicode_New(size); 3874 if (v == NULL) 3875 goto onError; 3876 if (size == 0) 3877 return (PyObject *)v; 3878 p = PyUnicode_AS_UNICODE(v); 3879 e = s + size; 3880 if (PyUnicode_CheckExact(mapping)) { 3881 mapstring = PyUnicode_AS_UNICODE(mapping); 3882 maplen = PyUnicode_GET_SIZE(mapping); 3883 while (s < e) { 3884 unsigned char ch = *s; 3885 Py_UNICODE x = 0xfffe; /* illegal value */ 3886 3887 if (ch < maplen) 3888 x = mapstring[ch]; 3889 3890 if (x == 0xfffe) { 3891 /* undefined mapping */ 3892 outpos = p-PyUnicode_AS_UNICODE(v); 3893 startinpos = s-starts; 3894 endinpos = startinpos+1; 3895 if (unicode_decode_call_errorhandler( 3896 errors, &errorHandler, 3897 "charmap", "character maps to <undefined>", 3898 &starts, &e, &startinpos, &endinpos, &exc, &s, 3899 (PyObject **)&v, &outpos, &p)) { 3900 goto onError; 3901 } 3902 continue; 3903 } 3904 *p++ = x; 3905 ++s; 3906 } 3907 } 3908 else { 3909 while (s < e) { 3910 unsigned char ch = *s; 3911 PyObject *w, *x; 3912 3913 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 3914 w = PyInt_FromLong((long)ch); 3915 if (w == NULL) 3916 goto onError; 3917 x = PyObject_GetItem(mapping, w); 3918 Py_DECREF(w); 3919 if (x == NULL) { 3920 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3921 /* No mapping found means: mapping is undefined. */ 3922 PyErr_Clear(); 3923 x = Py_None; 3924 Py_INCREF(x); 3925 } else 3926 goto onError; 3927 } 3928 3929 /* Apply mapping */ 3930 if (PyInt_Check(x)) { 3931 long value = PyInt_AS_LONG(x); 3932 if (value < 0 || value > 65535) { 3933 PyErr_SetString(PyExc_TypeError, 3934 "character mapping must be in range(65536)"); 3935 Py_DECREF(x); 3936 goto onError; 3937 } 3938 *p++ = (Py_UNICODE)value; 3939 } 3940 else if (x == Py_None) { 3941 /* undefined mapping */ 3942 outpos = p-PyUnicode_AS_UNICODE(v); 3943 startinpos = s-starts; 3944 endinpos = startinpos+1; 3945 if (unicode_decode_call_errorhandler( 3946 errors, &errorHandler, 3947 "charmap", "character maps to <undefined>", 3948 &starts, &e, &startinpos, &endinpos, &exc, &s, 3949 (PyObject **)&v, &outpos, &p)) { 3950 Py_DECREF(x); 3951 goto onError; 3952 } 3953 Py_DECREF(x); 3954 continue; 3955 } 3956 else if (PyUnicode_Check(x)) { 3957 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 3958 3959 if (targetsize == 1) 3960 /* 1-1 mapping */ 3961 *p++ = *PyUnicode_AS_UNICODE(x); 3962 3963 else if (targetsize > 1) { 3964 /* 1-n mapping */ 3965 if (targetsize > extrachars) { 3966 /* resize first */ 3967 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 3968 Py_ssize_t needed = (targetsize - extrachars) + \ 3969 (targetsize << 2); 3970 extrachars += needed; 3971 /* XXX overflow detection missing */ 3972 if (_PyUnicode_Resize(&v, 3973 PyUnicode_GET_SIZE(v) + needed) < 0) { 3974 Py_DECREF(x); 3975 goto onError; 3976 } 3977 p = PyUnicode_AS_UNICODE(v) + oldpos; 3978 } 3979 Py_UNICODE_COPY(p, 3980 PyUnicode_AS_UNICODE(x), 3981 targetsize); 3982 p += targetsize; 3983 extrachars -= targetsize; 3984 } 3985 /* 1-0 mapping: skip the character */ 3986 } 3987 else { 3988 /* wrong return value */ 3989 PyErr_SetString(PyExc_TypeError, 3990 "character mapping must return integer, None or unicode"); 3991 Py_DECREF(x); 3992 goto onError; 3993 } 3994 Py_DECREF(x); 3995 ++s; 3996 } 3997 } 3998 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3999 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4000 goto onError; 4001 Py_XDECREF(errorHandler); 4002 Py_XDECREF(exc); 4003 return (PyObject *)v; 4004 4005 onError: 4006 Py_XDECREF(errorHandler); 4007 Py_XDECREF(exc); 4008 Py_XDECREF(v); 4009 return NULL; 4010} 4011 4012/* Charmap encoding: the lookup table */ 4013 4014struct encoding_map{ 4015 PyObject_HEAD 4016 unsigned char level1[32]; 4017 int count2, count3; 4018 unsigned char level23[1]; 4019}; 4020 4021static PyObject* 4022encoding_map_size(PyObject *obj, PyObject* args) 4023{ 4024 struct encoding_map *map = (struct encoding_map*)obj; 4025 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4026 128*map->count3); 4027} 4028 4029static PyMethodDef encoding_map_methods[] = { 4030 {"size", encoding_map_size, METH_NOARGS, 4031 PyDoc_STR("Return the size (in bytes) of this object") }, 4032 { 0 } 4033}; 4034 4035static void 4036encoding_map_dealloc(PyObject* o) 4037{ 4038 PyObject_FREE(o); 4039} 4040 4041static PyTypeObject EncodingMapType = { 4042 PyVarObject_HEAD_INIT(NULL, 0) 4043 "EncodingMap", /*tp_name*/ 4044 sizeof(struct encoding_map), /*tp_basicsize*/ 4045 0, /*tp_itemsize*/ 4046 /* methods */ 4047 encoding_map_dealloc, /*tp_dealloc*/ 4048 0, /*tp_print*/ 4049 0, /*tp_getattr*/ 4050 0, /*tp_setattr*/ 4051 0, /*tp_compare*/ 4052 0, /*tp_repr*/ 4053 0, /*tp_as_number*/ 4054 0, /*tp_as_sequence*/ 4055 0, /*tp_as_mapping*/ 4056 0, /*tp_hash*/ 4057 0, /*tp_call*/ 4058 0, /*tp_str*/ 4059 0, /*tp_getattro*/ 4060 0, /*tp_setattro*/ 4061 0, /*tp_as_buffer*/ 4062 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4063 0, /*tp_doc*/ 4064 0, /*tp_traverse*/ 4065 0, /*tp_clear*/ 4066 0, /*tp_richcompare*/ 4067 0, /*tp_weaklistoffset*/ 4068 0, /*tp_iter*/ 4069 0, /*tp_iternext*/ 4070 encoding_map_methods, /*tp_methods*/ 4071 0, /*tp_members*/ 4072 0, /*tp_getset*/ 4073 0, /*tp_base*/ 4074 0, /*tp_dict*/ 4075 0, /*tp_descr_get*/ 4076 0, /*tp_descr_set*/ 4077 0, /*tp_dictoffset*/ 4078 0, /*tp_init*/ 4079 0, /*tp_alloc*/ 4080 0, /*tp_new*/ 4081 0, /*tp_free*/ 4082 0, /*tp_is_gc*/ 4083}; 4084 4085PyObject* 4086PyUnicode_BuildEncodingMap(PyObject* string) 4087{ 4088 Py_UNICODE *decode; 4089 PyObject *result; 4090 struct encoding_map *mresult; 4091 int i; 4092 int need_dict = 0; 4093 unsigned char level1[32]; 4094 unsigned char level2[512]; 4095 unsigned char *mlevel1, *mlevel2, *mlevel3; 4096 int count2 = 0, count3 = 0; 4097 4098 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 4099 PyErr_BadArgument(); 4100 return NULL; 4101 } 4102 decode = PyUnicode_AS_UNICODE(string); 4103 memset(level1, 0xFF, sizeof level1); 4104 memset(level2, 0xFF, sizeof level2); 4105 4106 /* If there isn't a one-to-one mapping of NULL to \0, 4107 or if there are non-BMP characters, we need to use 4108 a mapping dictionary. */ 4109 if (decode[0] != 0) 4110 need_dict = 1; 4111 for (i = 1; i < 256; i++) { 4112 int l1, l2; 4113 if (decode[i] == 0 4114 #ifdef Py_UNICODE_WIDE 4115 || decode[i] > 0xFFFF 4116 #endif 4117 ) { 4118 need_dict = 1; 4119 break; 4120 } 4121 if (decode[i] == 0xFFFE) 4122 /* unmapped character */ 4123 continue; 4124 l1 = decode[i] >> 11; 4125 l2 = decode[i] >> 7; 4126 if (level1[l1] == 0xFF) 4127 level1[l1] = count2++; 4128 if (level2[l2] == 0xFF) 4129 level2[l2] = count3++; 4130 } 4131 4132 if (count2 >= 0xFF || count3 >= 0xFF) 4133 need_dict = 1; 4134 4135 if (need_dict) { 4136 PyObject *result = PyDict_New(); 4137 PyObject *key, *value; 4138 if (!result) 4139 return NULL; 4140 for (i = 0; i < 256; i++) { 4141 key = value = NULL; 4142 key = PyInt_FromLong(decode[i]); 4143 value = PyInt_FromLong(i); 4144 if (!key || !value) 4145 goto failed1; 4146 if (PyDict_SetItem(result, key, value) == -1) 4147 goto failed1; 4148 Py_DECREF(key); 4149 Py_DECREF(value); 4150 } 4151 return result; 4152 failed1: 4153 Py_XDECREF(key); 4154 Py_XDECREF(value); 4155 Py_DECREF(result); 4156 return NULL; 4157 } 4158 4159 /* Create a three-level trie */ 4160 result = PyObject_MALLOC(sizeof(struct encoding_map) + 4161 16*count2 + 128*count3 - 1); 4162 if (!result) 4163 return PyErr_NoMemory(); 4164 PyObject_Init(result, &EncodingMapType); 4165 mresult = (struct encoding_map*)result; 4166 mresult->count2 = count2; 4167 mresult->count3 = count3; 4168 mlevel1 = mresult->level1; 4169 mlevel2 = mresult->level23; 4170 mlevel3 = mresult->level23 + 16*count2; 4171 memcpy(mlevel1, level1, 32); 4172 memset(mlevel2, 0xFF, 16*count2); 4173 memset(mlevel3, 0, 128*count3); 4174 count3 = 0; 4175 for (i = 1; i < 256; i++) { 4176 int o1, o2, o3, i2, i3; 4177 if (decode[i] == 0xFFFE) 4178 /* unmapped character */ 4179 continue; 4180 o1 = decode[i]>>11; 4181 o2 = (decode[i]>>7) & 0xF; 4182 i2 = 16*mlevel1[o1] + o2; 4183 if (mlevel2[i2] == 0xFF) 4184 mlevel2[i2] = count3++; 4185 o3 = decode[i] & 0x7F; 4186 i3 = 128*mlevel2[i2] + o3; 4187 mlevel3[i3] = i; 4188 } 4189 return result; 4190} 4191 4192static int 4193encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4194{ 4195 struct encoding_map *map = (struct encoding_map*)mapping; 4196 int l1 = c>>11; 4197 int l2 = (c>>7) & 0xF; 4198 int l3 = c & 0x7F; 4199 int i; 4200 4201#ifdef Py_UNICODE_WIDE 4202 if (c > 0xFFFF) { 4203 return -1; 4204 } 4205#endif 4206 if (c == 0) 4207 return 0; 4208 /* level 1*/ 4209 i = map->level1[l1]; 4210 if (i == 0xFF) { 4211 return -1; 4212 } 4213 /* level 2*/ 4214 i = map->level23[16*i+l2]; 4215 if (i == 0xFF) { 4216 return -1; 4217 } 4218 /* level 3 */ 4219 i = map->level23[16*map->count2 + 128*i + l3]; 4220 if (i == 0) { 4221 return -1; 4222 } 4223 return i; 4224} 4225 4226/* Lookup the character ch in the mapping. If the character 4227 can't be found, Py_None is returned (or NULL, if another 4228 error occurred). */ 4229static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4230{ 4231 PyObject *w = PyInt_FromLong((long)c); 4232 PyObject *x; 4233 4234 if (w == NULL) 4235 return NULL; 4236 x = PyObject_GetItem(mapping, w); 4237 Py_DECREF(w); 4238 if (x == NULL) { 4239 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4240 /* No mapping found means: mapping is undefined. */ 4241 PyErr_Clear(); 4242 x = Py_None; 4243 Py_INCREF(x); 4244 return x; 4245 } else 4246 return NULL; 4247 } 4248 else if (x == Py_None) 4249 return x; 4250 else if (PyInt_Check(x)) { 4251 long value = PyInt_AS_LONG(x); 4252 if (value < 0 || value > 255) { 4253 PyErr_SetString(PyExc_TypeError, 4254 "character mapping must be in range(256)"); 4255 Py_DECREF(x); 4256 return NULL; 4257 } 4258 return x; 4259 } 4260 else if (PyString_Check(x)) 4261 return x; 4262 else { 4263 /* wrong return value */ 4264 PyErr_Format(PyExc_TypeError, 4265 "character mapping must return integer, None or str8, not %.400s", 4266 x->ob_type->tp_name); 4267 Py_DECREF(x); 4268 return NULL; 4269 } 4270} 4271 4272static int 4273charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4274{ 4275 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj); 4276 /* exponentially overallocate to minimize reallocations */ 4277 if (requiredsize < 2*outsize) 4278 requiredsize = 2*outsize; 4279 if (PyBytes_Resize(outobj, requiredsize)) { 4280 Py_DECREF(outobj); 4281 return -1; 4282 } 4283 return 0; 4284} 4285 4286typedef enum charmapencode_result { 4287 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4288}charmapencode_result; 4289/* lookup the character, put the result in the output string and adjust 4290 various state variables. Resize the output bytes object if not enough 4291 space is available. Return a new reference to the object that 4292 was put in the output buffer, or Py_None, if the mapping was undefined 4293 (in which case no character was written) or NULL, if a 4294 reallocation error occurred. The caller must decref the result */ 4295static 4296charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4297 PyObject *outobj, Py_ssize_t *outpos) 4298{ 4299 PyObject *rep; 4300 char *outstart; 4301 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj); 4302 4303 if (Py_Type(mapping) == &EncodingMapType) { 4304 int res = encoding_map_lookup(c, mapping); 4305 Py_ssize_t requiredsize = *outpos+1; 4306 if (res == -1) 4307 return enc_FAILED; 4308 if (outsize<requiredsize) 4309 if (charmapencode_resize(outobj, outpos, requiredsize)) 4310 return enc_EXCEPTION; 4311 outstart = PyBytes_AS_STRING(outobj); 4312 outstart[(*outpos)++] = (char)res; 4313 return enc_SUCCESS; 4314 } 4315 4316 rep = charmapencode_lookup(c, mapping); 4317 if (rep==NULL) 4318 return enc_EXCEPTION; 4319 else if (rep==Py_None) { 4320 Py_DECREF(rep); 4321 return enc_FAILED; 4322 } else { 4323 if (PyInt_Check(rep)) { 4324 Py_ssize_t requiredsize = *outpos+1; 4325 if (outsize<requiredsize) 4326 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4327 Py_DECREF(rep); 4328 return enc_EXCEPTION; 4329 } 4330 outstart = PyBytes_AS_STRING(outobj); 4331 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 4332 } 4333 else { 4334 const char *repchars = PyString_AS_STRING(rep); 4335 Py_ssize_t repsize = PyString_GET_SIZE(rep); 4336 Py_ssize_t requiredsize = *outpos+repsize; 4337 if (outsize<requiredsize) 4338 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4339 Py_DECREF(rep); 4340 return enc_EXCEPTION; 4341 } 4342 outstart = PyBytes_AS_STRING(outobj); 4343 memcpy(outstart + *outpos, repchars, repsize); 4344 *outpos += repsize; 4345 } 4346 } 4347 Py_DECREF(rep); 4348 return enc_SUCCESS; 4349} 4350 4351/* handle an error in PyUnicode_EncodeCharmap 4352 Return 0 on success, -1 on error */ 4353static 4354int charmap_encoding_error( 4355 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4356 PyObject **exceptionObject, 4357 int *known_errorHandler, PyObject **errorHandler, const char *errors, 4358 PyObject *res, Py_ssize_t *respos) 4359{ 4360 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4361 Py_ssize_t repsize; 4362 Py_ssize_t newpos; 4363 Py_UNICODE *uni2; 4364 /* startpos for collecting unencodable chars */ 4365 Py_ssize_t collstartpos = *inpos; 4366 Py_ssize_t collendpos = *inpos+1; 4367 Py_ssize_t collpos; 4368 char *encoding = "charmap"; 4369 char *reason = "character maps to <undefined>"; 4370 charmapencode_result x; 4371 4372 /* find all unencodable characters */ 4373 while (collendpos < size) { 4374 PyObject *rep; 4375 if (Py_Type(mapping) == &EncodingMapType) { 4376 int res = encoding_map_lookup(p[collendpos], mapping); 4377 if (res != -1) 4378 break; 4379 ++collendpos; 4380 continue; 4381 } 4382 4383 rep = charmapencode_lookup(p[collendpos], mapping); 4384 if (rep==NULL) 4385 return -1; 4386 else if (rep!=Py_None) { 4387 Py_DECREF(rep); 4388 break; 4389 } 4390 Py_DECREF(rep); 4391 ++collendpos; 4392 } 4393 /* cache callback name lookup 4394 * (if not done yet, i.e. it's the first error) */ 4395 if (*known_errorHandler==-1) { 4396 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4397 *known_errorHandler = 1; 4398 else if (!strcmp(errors, "replace")) 4399 *known_errorHandler = 2; 4400 else if (!strcmp(errors, "ignore")) 4401 *known_errorHandler = 3; 4402 else if (!strcmp(errors, "xmlcharrefreplace")) 4403 *known_errorHandler = 4; 4404 else 4405 *known_errorHandler = 0; 4406 } 4407 switch (*known_errorHandler) { 4408 case 1: /* strict */ 4409 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4410 return -1; 4411 case 2: /* replace */ 4412 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4413 x = charmapencode_output('?', mapping, res, respos); 4414 if (x==enc_EXCEPTION) { 4415 return -1; 4416 } 4417 else if (x==enc_FAILED) { 4418 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4419 return -1; 4420 } 4421 } 4422 /* fall through */ 4423 case 3: /* ignore */ 4424 *inpos = collendpos; 4425 break; 4426 case 4: /* xmlcharrefreplace */ 4427 /* generate replacement (temporarily (mis)uses p) */ 4428 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4429 char buffer[2+29+1+1]; 4430 char *cp; 4431 sprintf(buffer, "&#%d;", (int)p[collpos]); 4432 for (cp = buffer; *cp; ++cp) { 4433 x = charmapencode_output(*cp, mapping, res, respos); 4434 if (x==enc_EXCEPTION) 4435 return -1; 4436 else if (x==enc_FAILED) { 4437 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4438 return -1; 4439 } 4440 } 4441 } 4442 *inpos = collendpos; 4443 break; 4444 default: 4445 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4446 encoding, reason, p, size, exceptionObject, 4447 collstartpos, collendpos, &newpos); 4448 if (repunicode == NULL) 4449 return -1; 4450 /* generate replacement */ 4451 repsize = PyUnicode_GET_SIZE(repunicode); 4452 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4453 x = charmapencode_output(*uni2, mapping, res, respos); 4454 if (x==enc_EXCEPTION) { 4455 return -1; 4456 } 4457 else if (x==enc_FAILED) { 4458 Py_DECREF(repunicode); 4459 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4460 return -1; 4461 } 4462 } 4463 *inpos = newpos; 4464 Py_DECREF(repunicode); 4465 } 4466 return 0; 4467} 4468 4469PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4470 Py_ssize_t size, 4471 PyObject *mapping, 4472 const char *errors) 4473{ 4474 /* output object */ 4475 PyObject *res = NULL; 4476 /* current input position */ 4477 Py_ssize_t inpos = 0; 4478 /* current output position */ 4479 Py_ssize_t respos = 0; 4480 PyObject *errorHandler = NULL; 4481 PyObject *exc = NULL; 4482 /* the following variable is used for caching string comparisons 4483 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4484 * 3=ignore, 4=xmlcharrefreplace */ 4485 int known_errorHandler = -1; 4486 4487 /* Default to Latin-1 */ 4488 if (mapping == NULL) 4489 return PyUnicode_EncodeLatin1(p, size, errors); 4490 4491 /* allocate enough for a simple encoding without 4492 replacements, if we need more, we'll resize */ 4493 res = PyBytes_FromStringAndSize(NULL, size); 4494 if (res == NULL) 4495 goto onError; 4496 if (size == 0) 4497 return res; 4498 4499 while (inpos<size) { 4500 /* try to encode it */ 4501 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos); 4502 if (x==enc_EXCEPTION) /* error */ 4503 goto onError; 4504 if (x==enc_FAILED) { /* unencodable character */ 4505 if (charmap_encoding_error(p, size, &inpos, mapping, 4506 &exc, 4507 &known_errorHandler, &errorHandler, errors, 4508 res, &respos)) { 4509 goto onError; 4510 } 4511 } 4512 else 4513 /* done with this character => adjust input position */ 4514 ++inpos; 4515 } 4516 4517 /* Resize if we allocated to much */ 4518 if (respos<PyBytes_GET_SIZE(res)) { 4519 if (PyBytes_Resize(res, respos)) 4520 goto onError; 4521 } 4522 Py_XDECREF(exc); 4523 Py_XDECREF(errorHandler); 4524 return res; 4525 4526 onError: 4527 Py_XDECREF(res); 4528 Py_XDECREF(exc); 4529 Py_XDECREF(errorHandler); 4530 return NULL; 4531} 4532 4533PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4534 PyObject *mapping) 4535{ 4536 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4537 PyErr_BadArgument(); 4538 return NULL; 4539 } 4540 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4541 PyUnicode_GET_SIZE(unicode), 4542 mapping, 4543 NULL); 4544} 4545 4546/* create or adjust a UnicodeTranslateError */ 4547static void make_translate_exception(PyObject **exceptionObject, 4548 const Py_UNICODE *unicode, Py_ssize_t size, 4549 Py_ssize_t startpos, Py_ssize_t endpos, 4550 const char *reason) 4551{ 4552 if (*exceptionObject == NULL) { 4553 *exceptionObject = PyUnicodeTranslateError_Create( 4554 unicode, size, startpos, endpos, reason); 4555 } 4556 else { 4557 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4558 goto onError; 4559 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4560 goto onError; 4561 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4562 goto onError; 4563 return; 4564 onError: 4565 Py_DECREF(*exceptionObject); 4566 *exceptionObject = NULL; 4567 } 4568} 4569 4570/* raises a UnicodeTranslateError */ 4571static void raise_translate_exception(PyObject **exceptionObject, 4572 const Py_UNICODE *unicode, Py_ssize_t size, 4573 Py_ssize_t startpos, Py_ssize_t endpos, 4574 const char *reason) 4575{ 4576 make_translate_exception(exceptionObject, 4577 unicode, size, startpos, endpos, reason); 4578 if (*exceptionObject != NULL) 4579 PyCodec_StrictErrors(*exceptionObject); 4580} 4581 4582/* error handling callback helper: 4583 build arguments, call the callback and check the arguments, 4584 put the result into newpos and return the replacement string, which 4585 has to be freed by the caller */ 4586static PyObject *unicode_translate_call_errorhandler(const char *errors, 4587 PyObject **errorHandler, 4588 const char *reason, 4589 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4590 Py_ssize_t startpos, Py_ssize_t endpos, 4591 Py_ssize_t *newpos) 4592{ 4593 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4594 4595 Py_ssize_t i_newpos; 4596 PyObject *restuple; 4597 PyObject *resunicode; 4598 4599 if (*errorHandler == NULL) { 4600 *errorHandler = PyCodec_LookupError(errors); 4601 if (*errorHandler == NULL) 4602 return NULL; 4603 } 4604 4605 make_translate_exception(exceptionObject, 4606 unicode, size, startpos, endpos, reason); 4607 if (*exceptionObject == NULL) 4608 return NULL; 4609 4610 restuple = PyObject_CallFunctionObjArgs( 4611 *errorHandler, *exceptionObject, NULL); 4612 if (restuple == NULL) 4613 return NULL; 4614 if (!PyTuple_Check(restuple)) { 4615 PyErr_Format(PyExc_TypeError, &argparse[4]); 4616 Py_DECREF(restuple); 4617 return NULL; 4618 } 4619 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4620 &resunicode, &i_newpos)) { 4621 Py_DECREF(restuple); 4622 return NULL; 4623 } 4624 if (i_newpos<0) 4625 *newpos = size+i_newpos; 4626 else 4627 *newpos = i_newpos; 4628 if (*newpos<0 || *newpos>size) { 4629 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4630 Py_DECREF(restuple); 4631 return NULL; 4632 } 4633 Py_INCREF(resunicode); 4634 Py_DECREF(restuple); 4635 return resunicode; 4636} 4637 4638/* Lookup the character ch in the mapping and put the result in result, 4639 which must be decrefed by the caller. 4640 Return 0 on success, -1 on error */ 4641static 4642int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4643{ 4644 PyObject *w = PyInt_FromLong((long)c); 4645 PyObject *x; 4646 4647 if (w == NULL) 4648 return -1; 4649 x = PyObject_GetItem(mapping, w); 4650 Py_DECREF(w); 4651 if (x == NULL) { 4652 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4653 /* No mapping found means: use 1:1 mapping. */ 4654 PyErr_Clear(); 4655 *result = NULL; 4656 return 0; 4657 } else 4658 return -1; 4659 } 4660 else if (x == Py_None) { 4661 *result = x; 4662 return 0; 4663 } 4664 else if (PyInt_Check(x)) { 4665 long value = PyInt_AS_LONG(x); 4666 long max = PyUnicode_GetMax(); 4667 if (value < 0 || value > max) { 4668 PyErr_Format(PyExc_TypeError, 4669 "character mapping must be in range(0x%lx)", max+1); 4670 Py_DECREF(x); 4671 return -1; 4672 } 4673 *result = x; 4674 return 0; 4675 } 4676 else if (PyUnicode_Check(x)) { 4677 *result = x; 4678 return 0; 4679 } 4680 else { 4681 /* wrong return value */ 4682 PyErr_SetString(PyExc_TypeError, 4683 "character mapping must return integer, None or unicode"); 4684 Py_DECREF(x); 4685 return -1; 4686 } 4687} 4688/* ensure that *outobj is at least requiredsize characters long, 4689if not reallocate and adjust various state variables. 4690Return 0 on success, -1 on error */ 4691static 4692int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4693 Py_ssize_t requiredsize) 4694{ 4695 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4696 if (requiredsize > oldsize) { 4697 /* remember old output position */ 4698 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4699 /* exponentially overallocate to minimize reallocations */ 4700 if (requiredsize < 2 * oldsize) 4701 requiredsize = 2 * oldsize; 4702 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 4703 return -1; 4704 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4705 } 4706 return 0; 4707} 4708/* lookup the character, put the result in the output string and adjust 4709 various state variables. Return a new reference to the object that 4710 was put in the output buffer in *result, or Py_None, if the mapping was 4711 undefined (in which case no character was written). 4712 The called must decref result. 4713 Return 0 on success, -1 on error. */ 4714static 4715int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 4716 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 4717 PyObject **res) 4718{ 4719 if (charmaptranslate_lookup(*curinp, mapping, res)) 4720 return -1; 4721 if (*res==NULL) { 4722 /* not found => default to 1:1 mapping */ 4723 *(*outp)++ = *curinp; 4724 } 4725 else if (*res==Py_None) 4726 ; 4727 else if (PyInt_Check(*res)) { 4728 /* no overflow check, because we know that the space is enough */ 4729 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 4730 } 4731 else if (PyUnicode_Check(*res)) { 4732 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 4733 if (repsize==1) { 4734 /* no overflow check, because we know that the space is enough */ 4735 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 4736 } 4737 else if (repsize!=0) { 4738 /* more than one character */ 4739 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 4740 (insize - (curinp-startinp)) + 4741 repsize - 1; 4742 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 4743 return -1; 4744 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 4745 *outp += repsize; 4746 } 4747 } 4748 else 4749 return -1; 4750 return 0; 4751} 4752 4753PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 4754 Py_ssize_t size, 4755 PyObject *mapping, 4756 const char *errors) 4757{ 4758 /* output object */ 4759 PyObject *res = NULL; 4760 /* pointers to the beginning and end+1 of input */ 4761 const Py_UNICODE *startp = p; 4762 const Py_UNICODE *endp = p + size; 4763 /* pointer into the output */ 4764 Py_UNICODE *str; 4765 /* current output position */ 4766 Py_ssize_t respos = 0; 4767 char *reason = "character maps to <undefined>"; 4768 PyObject *errorHandler = NULL; 4769 PyObject *exc = NULL; 4770 /* the following variable is used for caching string comparisons 4771 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4772 * 3=ignore, 4=xmlcharrefreplace */ 4773 int known_errorHandler = -1; 4774 4775 if (mapping == NULL) { 4776 PyErr_BadArgument(); 4777 return NULL; 4778 } 4779 4780 /* allocate enough for a simple 1:1 translation without 4781 replacements, if we need more, we'll resize */ 4782 res = PyUnicode_FromUnicode(NULL, size); 4783 if (res == NULL) 4784 goto onError; 4785 if (size == 0) 4786 return res; 4787 str = PyUnicode_AS_UNICODE(res); 4788 4789 while (p<endp) { 4790 /* try to encode it */ 4791 PyObject *x = NULL; 4792 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 4793 Py_XDECREF(x); 4794 goto onError; 4795 } 4796 Py_XDECREF(x); 4797 if (x!=Py_None) /* it worked => adjust input pointer */ 4798 ++p; 4799 else { /* untranslatable character */ 4800 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4801 Py_ssize_t repsize; 4802 Py_ssize_t newpos; 4803 Py_UNICODE *uni2; 4804 /* startpos for collecting untranslatable chars */ 4805 const Py_UNICODE *collstart = p; 4806 const Py_UNICODE *collend = p+1; 4807 const Py_UNICODE *coll; 4808 4809 /* find all untranslatable characters */ 4810 while (collend < endp) { 4811 if (charmaptranslate_lookup(*collend, mapping, &x)) 4812 goto onError; 4813 Py_XDECREF(x); 4814 if (x!=Py_None) 4815 break; 4816 ++collend; 4817 } 4818 /* cache callback name lookup 4819 * (if not done yet, i.e. it's the first error) */ 4820 if (known_errorHandler==-1) { 4821 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4822 known_errorHandler = 1; 4823 else if (!strcmp(errors, "replace")) 4824 known_errorHandler = 2; 4825 else if (!strcmp(errors, "ignore")) 4826 known_errorHandler = 3; 4827 else if (!strcmp(errors, "xmlcharrefreplace")) 4828 known_errorHandler = 4; 4829 else 4830 known_errorHandler = 0; 4831 } 4832 switch (known_errorHandler) { 4833 case 1: /* strict */ 4834 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 4835 goto onError; 4836 case 2: /* replace */ 4837 /* No need to check for space, this is a 1:1 replacement */ 4838 for (coll = collstart; coll<collend; ++coll) 4839 *str++ = '?'; 4840 /* fall through */ 4841 case 3: /* ignore */ 4842 p = collend; 4843 break; 4844 case 4: /* xmlcharrefreplace */ 4845 /* generate replacement (temporarily (mis)uses p) */ 4846 for (p = collstart; p < collend; ++p) { 4847 char buffer[2+29+1+1]; 4848 char *cp; 4849 sprintf(buffer, "&#%d;", (int)*p); 4850 if (charmaptranslate_makespace(&res, &str, 4851 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 4852 goto onError; 4853 for (cp = buffer; *cp; ++cp) 4854 *str++ = *cp; 4855 } 4856 p = collend; 4857 break; 4858 default: 4859 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 4860 reason, startp, size, &exc, 4861 collstart-startp, collend-startp, &newpos); 4862 if (repunicode == NULL) 4863 goto onError; 4864 /* generate replacement */ 4865 repsize = PyUnicode_GET_SIZE(repunicode); 4866 if (charmaptranslate_makespace(&res, &str, 4867 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 4868 Py_DECREF(repunicode); 4869 goto onError; 4870 } 4871 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 4872 *str++ = *uni2; 4873 p = startp + newpos; 4874 Py_DECREF(repunicode); 4875 } 4876 } 4877 } 4878 /* Resize if we allocated to much */ 4879 respos = str-PyUnicode_AS_UNICODE(res); 4880 if (respos<PyUnicode_GET_SIZE(res)) { 4881 if (_PyUnicode_Resize(&res, respos) < 0) 4882 goto onError; 4883 } 4884 Py_XDECREF(exc); 4885 Py_XDECREF(errorHandler); 4886 return res; 4887 4888 onError: 4889 Py_XDECREF(res); 4890 Py_XDECREF(exc); 4891 Py_XDECREF(errorHandler); 4892 return NULL; 4893} 4894 4895PyObject *PyUnicode_Translate(PyObject *str, 4896 PyObject *mapping, 4897 const char *errors) 4898{ 4899 PyObject *result; 4900 4901 str = PyUnicode_FromObject(str); 4902 if (str == NULL) 4903 goto onError; 4904 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 4905 PyUnicode_GET_SIZE(str), 4906 mapping, 4907 errors); 4908 Py_DECREF(str); 4909 return result; 4910 4911 onError: 4912 Py_XDECREF(str); 4913 return NULL; 4914} 4915 4916/* --- Decimal Encoder ---------------------------------------------------- */ 4917 4918int PyUnicode_EncodeDecimal(Py_UNICODE *s, 4919 Py_ssize_t length, 4920 char *output, 4921 const char *errors) 4922{ 4923 Py_UNICODE *p, *end; 4924 PyObject *errorHandler = NULL; 4925 PyObject *exc = NULL; 4926 const char *encoding = "decimal"; 4927 const char *reason = "invalid decimal Unicode string"; 4928 /* the following variable is used for caching string comparisons 4929 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4930 int known_errorHandler = -1; 4931 4932 if (output == NULL) { 4933 PyErr_BadArgument(); 4934 return -1; 4935 } 4936 4937 p = s; 4938 end = s + length; 4939 while (p < end) { 4940 register Py_UNICODE ch = *p; 4941 int decimal; 4942 PyObject *repunicode; 4943 Py_ssize_t repsize; 4944 Py_ssize_t newpos; 4945 Py_UNICODE *uni2; 4946 Py_UNICODE *collstart; 4947 Py_UNICODE *collend; 4948 4949 if (Py_UNICODE_ISSPACE(ch)) { 4950 *output++ = ' '; 4951 ++p; 4952 continue; 4953 } 4954 decimal = Py_UNICODE_TODECIMAL(ch); 4955 if (decimal >= 0) { 4956 *output++ = '0' + decimal; 4957 ++p; 4958 continue; 4959 } 4960 if (0 < ch && ch < 256) { 4961 *output++ = (char)ch; 4962 ++p; 4963 continue; 4964 } 4965 /* All other characters are considered unencodable */ 4966 collstart = p; 4967 collend = p+1; 4968 while (collend < end) { 4969 if ((0 < *collend && *collend < 256) || 4970 !Py_UNICODE_ISSPACE(*collend) || 4971 Py_UNICODE_TODECIMAL(*collend)) 4972 break; 4973 } 4974 /* cache callback name lookup 4975 * (if not done yet, i.e. it's the first error) */ 4976 if (known_errorHandler==-1) { 4977 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4978 known_errorHandler = 1; 4979 else if (!strcmp(errors, "replace")) 4980 known_errorHandler = 2; 4981 else if (!strcmp(errors, "ignore")) 4982 known_errorHandler = 3; 4983 else if (!strcmp(errors, "xmlcharrefreplace")) 4984 known_errorHandler = 4; 4985 else 4986 known_errorHandler = 0; 4987 } 4988 switch (known_errorHandler) { 4989 case 1: /* strict */ 4990 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 4991 goto onError; 4992 case 2: /* replace */ 4993 for (p = collstart; p < collend; ++p) 4994 *output++ = '?'; 4995 /* fall through */ 4996 case 3: /* ignore */ 4997 p = collend; 4998 break; 4999 case 4: /* xmlcharrefreplace */ 5000 /* generate replacement (temporarily (mis)uses p) */ 5001 for (p = collstart; p < collend; ++p) 5002 output += sprintf(output, "&#%d;", (int)*p); 5003 p = collend; 5004 break; 5005 default: 5006 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5007 encoding, reason, s, length, &exc, 5008 collstart-s, collend-s, &newpos); 5009 if (repunicode == NULL) 5010 goto onError; 5011 /* generate replacement */ 5012 repsize = PyUnicode_GET_SIZE(repunicode); 5013 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5014 Py_UNICODE ch = *uni2; 5015 if (Py_UNICODE_ISSPACE(ch)) 5016 *output++ = ' '; 5017 else { 5018 decimal = Py_UNICODE_TODECIMAL(ch); 5019 if (decimal >= 0) 5020 *output++ = '0' + decimal; 5021 else if (0 < ch && ch < 256) 5022 *output++ = (char)ch; 5023 else { 5024 Py_DECREF(repunicode); 5025 raise_encode_exception(&exc, encoding, 5026 s, length, collstart-s, collend-s, reason); 5027 goto onError; 5028 } 5029 } 5030 } 5031 p = s + newpos; 5032 Py_DECREF(repunicode); 5033 } 5034 } 5035 /* 0-terminate the output string */ 5036 *output++ = '\0'; 5037 Py_XDECREF(exc); 5038 Py_XDECREF(errorHandler); 5039 return 0; 5040 5041 onError: 5042 Py_XDECREF(exc); 5043 Py_XDECREF(errorHandler); 5044 return -1; 5045} 5046 5047/* --- Helpers ------------------------------------------------------------ */ 5048 5049#include "stringlib/unicodedefs.h" 5050 5051#include "stringlib/fastsearch.h" 5052 5053#include "stringlib/count.h" 5054#include "stringlib/find.h" 5055#include "stringlib/partition.h" 5056 5057/* helper macro to fixup start/end slice values */ 5058#define FIX_START_END(obj) \ 5059 if (start < 0) \ 5060 start += (obj)->length; \ 5061 if (start < 0) \ 5062 start = 0; \ 5063 if (end > (obj)->length) \ 5064 end = (obj)->length; \ 5065 if (end < 0) \ 5066 end += (obj)->length; \ 5067 if (end < 0) \ 5068 end = 0; 5069 5070Py_ssize_t PyUnicode_Count(PyObject *str, 5071 PyObject *substr, 5072 Py_ssize_t start, 5073 Py_ssize_t end) 5074{ 5075 Py_ssize_t result; 5076 PyUnicodeObject* str_obj; 5077 PyUnicodeObject* sub_obj; 5078 5079 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5080 if (!str_obj) 5081 return -1; 5082 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5083 if (!sub_obj) { 5084 Py_DECREF(str_obj); 5085 return -1; 5086 } 5087 5088 FIX_START_END(str_obj); 5089 5090 result = stringlib_count( 5091 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5092 ); 5093 5094 Py_DECREF(sub_obj); 5095 Py_DECREF(str_obj); 5096 5097 return result; 5098} 5099 5100Py_ssize_t PyUnicode_Find(PyObject *str, 5101 PyObject *sub, 5102 Py_ssize_t start, 5103 Py_ssize_t end, 5104 int direction) 5105{ 5106 Py_ssize_t result; 5107 5108 str = PyUnicode_FromObject(str); 5109 if (!str) 5110 return -2; 5111 sub = PyUnicode_FromObject(sub); 5112 if (!sub) { 5113 Py_DECREF(str); 5114 return -2; 5115 } 5116 5117 if (direction > 0) 5118 result = stringlib_find_slice( 5119 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5120 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5121 start, end 5122 ); 5123 else 5124 result = stringlib_rfind_slice( 5125 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5126 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5127 start, end 5128 ); 5129 5130 Py_DECREF(str); 5131 Py_DECREF(sub); 5132 5133 return result; 5134} 5135 5136static 5137int tailmatch(PyUnicodeObject *self, 5138 PyUnicodeObject *substring, 5139 Py_ssize_t start, 5140 Py_ssize_t end, 5141 int direction) 5142{ 5143 if (substring->length == 0) 5144 return 1; 5145 5146 FIX_START_END(self); 5147 5148 end -= substring->length; 5149 if (end < start) 5150 return 0; 5151 5152 if (direction > 0) { 5153 if (Py_UNICODE_MATCH(self, end, substring)) 5154 return 1; 5155 } else { 5156 if (Py_UNICODE_MATCH(self, start, substring)) 5157 return 1; 5158 } 5159 5160 return 0; 5161} 5162 5163Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5164 PyObject *substr, 5165 Py_ssize_t start, 5166 Py_ssize_t end, 5167 int direction) 5168{ 5169 Py_ssize_t result; 5170 5171 str = PyUnicode_FromObject(str); 5172 if (str == NULL) 5173 return -1; 5174 substr = PyUnicode_FromObject(substr); 5175 if (substr == NULL) { 5176 Py_DECREF(str); 5177 return -1; 5178 } 5179 5180 result = tailmatch((PyUnicodeObject *)str, 5181 (PyUnicodeObject *)substr, 5182 start, end, direction); 5183 Py_DECREF(str); 5184 Py_DECREF(substr); 5185 return result; 5186} 5187 5188/* Apply fixfct filter to the Unicode object self and return a 5189 reference to the modified object */ 5190 5191static 5192PyObject *fixup(PyUnicodeObject *self, 5193 int (*fixfct)(PyUnicodeObject *s)) 5194{ 5195 5196 PyUnicodeObject *u; 5197 5198 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5199 if (u == NULL) 5200 return NULL; 5201 5202 Py_UNICODE_COPY(u->str, self->str, self->length); 5203 5204 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5205 /* fixfct should return TRUE if it modified the buffer. If 5206 FALSE, return a reference to the original buffer instead 5207 (to save space, not time) */ 5208 Py_INCREF(self); 5209 Py_DECREF(u); 5210 return (PyObject*) self; 5211 } 5212 return (PyObject*) u; 5213} 5214 5215static 5216int fixupper(PyUnicodeObject *self) 5217{ 5218 Py_ssize_t len = self->length; 5219 Py_UNICODE *s = self->str; 5220 int status = 0; 5221 5222 while (len-- > 0) { 5223 register Py_UNICODE ch; 5224 5225 ch = Py_UNICODE_TOUPPER(*s); 5226 if (ch != *s) { 5227 status = 1; 5228 *s = ch; 5229 } 5230 s++; 5231 } 5232 5233 return status; 5234} 5235 5236static 5237int fixlower(PyUnicodeObject *self) 5238{ 5239 Py_ssize_t len = self->length; 5240 Py_UNICODE *s = self->str; 5241 int status = 0; 5242 5243 while (len-- > 0) { 5244 register Py_UNICODE ch; 5245 5246 ch = Py_UNICODE_TOLOWER(*s); 5247 if (ch != *s) { 5248 status = 1; 5249 *s = ch; 5250 } 5251 s++; 5252 } 5253 5254 return status; 5255} 5256 5257static 5258int fixswapcase(PyUnicodeObject *self) 5259{ 5260 Py_ssize_t len = self->length; 5261 Py_UNICODE *s = self->str; 5262 int status = 0; 5263 5264 while (len-- > 0) { 5265 if (Py_UNICODE_ISUPPER(*s)) { 5266 *s = Py_UNICODE_TOLOWER(*s); 5267 status = 1; 5268 } else if (Py_UNICODE_ISLOWER(*s)) { 5269 *s = Py_UNICODE_TOUPPER(*s); 5270 status = 1; 5271 } 5272 s++; 5273 } 5274 5275 return status; 5276} 5277 5278static 5279int fixcapitalize(PyUnicodeObject *self) 5280{ 5281 Py_ssize_t len = self->length; 5282 Py_UNICODE *s = self->str; 5283 int status = 0; 5284 5285 if (len == 0) 5286 return 0; 5287 if (Py_UNICODE_ISLOWER(*s)) { 5288 *s = Py_UNICODE_TOUPPER(*s); 5289 status = 1; 5290 } 5291 s++; 5292 while (--len > 0) { 5293 if (Py_UNICODE_ISUPPER(*s)) { 5294 *s = Py_UNICODE_TOLOWER(*s); 5295 status = 1; 5296 } 5297 s++; 5298 } 5299 return status; 5300} 5301 5302static 5303int fixtitle(PyUnicodeObject *self) 5304{ 5305 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5306 register Py_UNICODE *e; 5307 int previous_is_cased; 5308 5309 /* Shortcut for single character strings */ 5310 if (PyUnicode_GET_SIZE(self) == 1) { 5311 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5312 if (*p != ch) { 5313 *p = ch; 5314 return 1; 5315 } 5316 else 5317 return 0; 5318 } 5319 5320 e = p + PyUnicode_GET_SIZE(self); 5321 previous_is_cased = 0; 5322 for (; p < e; p++) { 5323 register const Py_UNICODE ch = *p; 5324 5325 if (previous_is_cased) 5326 *p = Py_UNICODE_TOLOWER(ch); 5327 else 5328 *p = Py_UNICODE_TOTITLE(ch); 5329 5330 if (Py_UNICODE_ISLOWER(ch) || 5331 Py_UNICODE_ISUPPER(ch) || 5332 Py_UNICODE_ISTITLE(ch)) 5333 previous_is_cased = 1; 5334 else 5335 previous_is_cased = 0; 5336 } 5337 return 1; 5338} 5339 5340PyObject * 5341PyUnicode_Join(PyObject *separator, PyObject *seq) 5342{ 5343 PyObject *internal_separator = NULL; 5344 const Py_UNICODE blank = ' '; 5345 const Py_UNICODE *sep = ␣ 5346 Py_ssize_t seplen = 1; 5347 PyUnicodeObject *res = NULL; /* the result */ 5348 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5349 Py_ssize_t res_used; /* # used bytes */ 5350 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5351 PyObject *fseq; /* PySequence_Fast(seq) */ 5352 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5353 PyObject *item; 5354 Py_ssize_t i; 5355 5356 fseq = PySequence_Fast(seq, ""); 5357 if (fseq == NULL) { 5358 return NULL; 5359 } 5360 5361 /* Grrrr. A codec may be invoked to convert str objects to 5362 * Unicode, and so it's possible to call back into Python code 5363 * during PyUnicode_FromObject(), and so it's possible for a sick 5364 * codec to change the size of fseq (if seq is a list). Therefore 5365 * we have to keep refetching the size -- can't assume seqlen 5366 * is invariant. 5367 */ 5368 seqlen = PySequence_Fast_GET_SIZE(fseq); 5369 /* If empty sequence, return u"". */ 5370 if (seqlen == 0) { 5371 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5372 goto Done; 5373 } 5374 /* If singleton sequence with an exact Unicode, return that. */ 5375 if (seqlen == 1) { 5376 item = PySequence_Fast_GET_ITEM(fseq, 0); 5377 if (PyUnicode_CheckExact(item)) { 5378 Py_INCREF(item); 5379 res = (PyUnicodeObject *)item; 5380 goto Done; 5381 } 5382 } 5383 5384 /* At least two items to join, or one that isn't exact Unicode. */ 5385 if (seqlen > 1) { 5386 /* Set up sep and seplen -- they're needed. */ 5387 if (separator == NULL) { 5388 sep = ␣ 5389 seplen = 1; 5390 } 5391 else { 5392 internal_separator = PyUnicode_FromObject(separator); 5393 if (internal_separator == NULL) 5394 goto onError; 5395 sep = PyUnicode_AS_UNICODE(internal_separator); 5396 seplen = PyUnicode_GET_SIZE(internal_separator); 5397 /* In case PyUnicode_FromObject() mutated seq. */ 5398 seqlen = PySequence_Fast_GET_SIZE(fseq); 5399 } 5400 } 5401 5402 /* Get space. */ 5403 res = _PyUnicode_New(res_alloc); 5404 if (res == NULL) 5405 goto onError; 5406 res_p = PyUnicode_AS_UNICODE(res); 5407 res_used = 0; 5408 5409 for (i = 0; i < seqlen; ++i) { 5410 Py_ssize_t itemlen; 5411 Py_ssize_t new_res_used; 5412 5413 item = PySequence_Fast_GET_ITEM(fseq, i); 5414 /* Convert item to Unicode. */ 5415 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { 5416 PyErr_Format(PyExc_TypeError, 5417 "sequence item %zd: expected string or Unicode," 5418 " %.80s found", 5419 i, Py_Type(item)->tp_name); 5420 goto onError; 5421 } 5422 item = PyUnicode_FromObject(item); 5423 if (item == NULL) 5424 goto onError; 5425 /* We own a reference to item from here on. */ 5426 5427 /* In case PyUnicode_FromObject() mutated seq. */ 5428 seqlen = PySequence_Fast_GET_SIZE(fseq); 5429 5430 /* Make sure we have enough space for the separator and the item. */ 5431 itemlen = PyUnicode_GET_SIZE(item); 5432 new_res_used = res_used + itemlen; 5433 if (new_res_used < 0) 5434 goto Overflow; 5435 if (i < seqlen - 1) { 5436 new_res_used += seplen; 5437 if (new_res_used < 0) 5438 goto Overflow; 5439 } 5440 if (new_res_used > res_alloc) { 5441 /* double allocated size until it's big enough */ 5442 do { 5443 res_alloc += res_alloc; 5444 if (res_alloc <= 0) 5445 goto Overflow; 5446 } while (new_res_used > res_alloc); 5447 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5448 Py_DECREF(item); 5449 goto onError; 5450 } 5451 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5452 } 5453 5454 /* Copy item, and maybe the separator. */ 5455 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5456 res_p += itemlen; 5457 if (i < seqlen - 1) { 5458 Py_UNICODE_COPY(res_p, sep, seplen); 5459 res_p += seplen; 5460 } 5461 Py_DECREF(item); 5462 res_used = new_res_used; 5463 } 5464 5465 /* Shrink res to match the used area; this probably can't fail, 5466 * but it's cheap to check. 5467 */ 5468 if (_PyUnicode_Resize(&res, res_used) < 0) 5469 goto onError; 5470 5471 Done: 5472 Py_XDECREF(internal_separator); 5473 Py_DECREF(fseq); 5474 return (PyObject *)res; 5475 5476 Overflow: 5477 PyErr_SetString(PyExc_OverflowError, 5478 "join() result is too long for a Python string"); 5479 Py_DECREF(item); 5480 /* fall through */ 5481 5482 onError: 5483 Py_XDECREF(internal_separator); 5484 Py_DECREF(fseq); 5485 Py_XDECREF(res); 5486 return NULL; 5487} 5488 5489static 5490PyUnicodeObject *pad(PyUnicodeObject *self, 5491 Py_ssize_t left, 5492 Py_ssize_t right, 5493 Py_UNICODE fill) 5494{ 5495 PyUnicodeObject *u; 5496 5497 if (left < 0) 5498 left = 0; 5499 if (right < 0) 5500 right = 0; 5501 5502 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5503 Py_INCREF(self); 5504 return self; 5505 } 5506 5507 u = _PyUnicode_New(left + self->length + right); 5508 if (u) { 5509 if (left) 5510 Py_UNICODE_FILL(u->str, fill, left); 5511 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5512 if (right) 5513 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5514 } 5515 5516 return u; 5517} 5518 5519#define SPLIT_APPEND(data, left, right) \ 5520 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5521 if (!str) \ 5522 goto onError; \ 5523 if (PyList_Append(list, str)) { \ 5524 Py_DECREF(str); \ 5525 goto onError; \ 5526 } \ 5527 else \ 5528 Py_DECREF(str); 5529 5530static 5531PyObject *split_whitespace(PyUnicodeObject *self, 5532 PyObject *list, 5533 Py_ssize_t maxcount) 5534{ 5535 register Py_ssize_t i; 5536 register Py_ssize_t j; 5537 Py_ssize_t len = self->length; 5538 PyObject *str; 5539 5540 for (i = j = 0; i < len; ) { 5541 /* find a token */ 5542 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 5543 i++; 5544 j = i; 5545 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 5546 i++; 5547 if (j < i) { 5548 if (maxcount-- <= 0) 5549 break; 5550 SPLIT_APPEND(self->str, j, i); 5551 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 5552 i++; 5553 j = i; 5554 } 5555 } 5556 if (j < len) { 5557 SPLIT_APPEND(self->str, j, len); 5558 } 5559 return list; 5560 5561 onError: 5562 Py_DECREF(list); 5563 return NULL; 5564} 5565 5566PyObject *PyUnicode_Splitlines(PyObject *string, 5567 int keepends) 5568{ 5569 register Py_ssize_t i; 5570 register Py_ssize_t j; 5571 Py_ssize_t len; 5572 PyObject *list; 5573 PyObject *str; 5574 Py_UNICODE *data; 5575 5576 string = PyUnicode_FromObject(string); 5577 if (string == NULL) 5578 return NULL; 5579 data = PyUnicode_AS_UNICODE(string); 5580 len = PyUnicode_GET_SIZE(string); 5581 5582 list = PyList_New(0); 5583 if (!list) 5584 goto onError; 5585 5586 for (i = j = 0; i < len; ) { 5587 Py_ssize_t eol; 5588 5589 /* Find a line and append it */ 5590 while (i < len && !BLOOM_LINEBREAK(data[i])) 5591 i++; 5592 5593 /* Skip the line break reading CRLF as one line break */ 5594 eol = i; 5595 if (i < len) { 5596 if (data[i] == '\r' && i + 1 < len && 5597 data[i+1] == '\n') 5598 i += 2; 5599 else 5600 i++; 5601 if (keepends) 5602 eol = i; 5603 } 5604 SPLIT_APPEND(data, j, eol); 5605 j = i; 5606 } 5607 if (j < len) { 5608 SPLIT_APPEND(data, j, len); 5609 } 5610 5611 Py_DECREF(string); 5612 return list; 5613 5614 onError: 5615 Py_XDECREF(list); 5616 Py_DECREF(string); 5617 return NULL; 5618} 5619 5620static 5621PyObject *split_char(PyUnicodeObject *self, 5622 PyObject *list, 5623 Py_UNICODE ch, 5624 Py_ssize_t maxcount) 5625{ 5626 register Py_ssize_t i; 5627 register Py_ssize_t j; 5628 Py_ssize_t len = self->length; 5629 PyObject *str; 5630 5631 for (i = j = 0; i < len; ) { 5632 if (self->str[i] == ch) { 5633 if (maxcount-- <= 0) 5634 break; 5635 SPLIT_APPEND(self->str, j, i); 5636 i = j = i + 1; 5637 } else 5638 i++; 5639 } 5640 if (j <= len) { 5641 SPLIT_APPEND(self->str, j, len); 5642 } 5643 return list; 5644 5645 onError: 5646 Py_DECREF(list); 5647 return NULL; 5648} 5649 5650static 5651PyObject *split_substring(PyUnicodeObject *self, 5652 PyObject *list, 5653 PyUnicodeObject *substring, 5654 Py_ssize_t maxcount) 5655{ 5656 register Py_ssize_t i; 5657 register Py_ssize_t j; 5658 Py_ssize_t len = self->length; 5659 Py_ssize_t sublen = substring->length; 5660 PyObject *str; 5661 5662 for (i = j = 0; i <= len - sublen; ) { 5663 if (Py_UNICODE_MATCH(self, i, substring)) { 5664 if (maxcount-- <= 0) 5665 break; 5666 SPLIT_APPEND(self->str, j, i); 5667 i = j = i + sublen; 5668 } else 5669 i++; 5670 } 5671 if (j <= len) { 5672 SPLIT_APPEND(self->str, j, len); 5673 } 5674 return list; 5675 5676 onError: 5677 Py_DECREF(list); 5678 return NULL; 5679} 5680 5681static 5682PyObject *rsplit_whitespace(PyUnicodeObject *self, 5683 PyObject *list, 5684 Py_ssize_t maxcount) 5685{ 5686 register Py_ssize_t i; 5687 register Py_ssize_t j; 5688 Py_ssize_t len = self->length; 5689 PyObject *str; 5690 5691 for (i = j = len - 1; i >= 0; ) { 5692 /* find a token */ 5693 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 5694 i--; 5695 j = i; 5696 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) 5697 i--; 5698 if (j > i) { 5699 if (maxcount-- <= 0) 5700 break; 5701 SPLIT_APPEND(self->str, i + 1, j + 1); 5702 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 5703 i--; 5704 j = i; 5705 } 5706 } 5707 if (j >= 0) { 5708 SPLIT_APPEND(self->str, 0, j + 1); 5709 } 5710 if (PyList_Reverse(list) < 0) 5711 goto onError; 5712 return list; 5713 5714 onError: 5715 Py_DECREF(list); 5716 return NULL; 5717} 5718 5719static 5720PyObject *rsplit_char(PyUnicodeObject *self, 5721 PyObject *list, 5722 Py_UNICODE ch, 5723 Py_ssize_t maxcount) 5724{ 5725 register Py_ssize_t i; 5726 register Py_ssize_t j; 5727 Py_ssize_t len = self->length; 5728 PyObject *str; 5729 5730 for (i = j = len - 1; i >= 0; ) { 5731 if (self->str[i] == ch) { 5732 if (maxcount-- <= 0) 5733 break; 5734 SPLIT_APPEND(self->str, i + 1, j + 1); 5735 j = i = i - 1; 5736 } else 5737 i--; 5738 } 5739 if (j >= -1) { 5740 SPLIT_APPEND(self->str, 0, j + 1); 5741 } 5742 if (PyList_Reverse(list) < 0) 5743 goto onError; 5744 return list; 5745 5746 onError: 5747 Py_DECREF(list); 5748 return NULL; 5749} 5750 5751static 5752PyObject *rsplit_substring(PyUnicodeObject *self, 5753 PyObject *list, 5754 PyUnicodeObject *substring, 5755 Py_ssize_t maxcount) 5756{ 5757 register Py_ssize_t i; 5758 register Py_ssize_t j; 5759 Py_ssize_t len = self->length; 5760 Py_ssize_t sublen = substring->length; 5761 PyObject *str; 5762 5763 for (i = len - sublen, j = len; i >= 0; ) { 5764 if (Py_UNICODE_MATCH(self, i, substring)) { 5765 if (maxcount-- <= 0) 5766 break; 5767 SPLIT_APPEND(self->str, i + sublen, j); 5768 j = i; 5769 i -= sublen; 5770 } else 5771 i--; 5772 } 5773 if (j >= 0) { 5774 SPLIT_APPEND(self->str, 0, j); 5775 } 5776 if (PyList_Reverse(list) < 0) 5777 goto onError; 5778 return list; 5779 5780 onError: 5781 Py_DECREF(list); 5782 return NULL; 5783} 5784 5785#undef SPLIT_APPEND 5786 5787static 5788PyObject *split(PyUnicodeObject *self, 5789 PyUnicodeObject *substring, 5790 Py_ssize_t maxcount) 5791{ 5792 PyObject *list; 5793 5794 if (maxcount < 0) 5795 maxcount = PY_SSIZE_T_MAX; 5796 5797 list = PyList_New(0); 5798 if (!list) 5799 return NULL; 5800 5801 if (substring == NULL) 5802 return split_whitespace(self,list,maxcount); 5803 5804 else if (substring->length == 1) 5805 return split_char(self,list,substring->str[0],maxcount); 5806 5807 else if (substring->length == 0) { 5808 Py_DECREF(list); 5809 PyErr_SetString(PyExc_ValueError, "empty separator"); 5810 return NULL; 5811 } 5812 else 5813 return split_substring(self,list,substring,maxcount); 5814} 5815 5816static 5817PyObject *rsplit(PyUnicodeObject *self, 5818 PyUnicodeObject *substring, 5819 Py_ssize_t maxcount) 5820{ 5821 PyObject *list; 5822 5823 if (maxcount < 0) 5824 maxcount = PY_SSIZE_T_MAX; 5825 5826 list = PyList_New(0); 5827 if (!list) 5828 return NULL; 5829 5830 if (substring == NULL) 5831 return rsplit_whitespace(self,list,maxcount); 5832 5833 else if (substring->length == 1) 5834 return rsplit_char(self,list,substring->str[0],maxcount); 5835 5836 else if (substring->length == 0) { 5837 Py_DECREF(list); 5838 PyErr_SetString(PyExc_ValueError, "empty separator"); 5839 return NULL; 5840 } 5841 else 5842 return rsplit_substring(self,list,substring,maxcount); 5843} 5844 5845static 5846PyObject *replace(PyUnicodeObject *self, 5847 PyUnicodeObject *str1, 5848 PyUnicodeObject *str2, 5849 Py_ssize_t maxcount) 5850{ 5851 PyUnicodeObject *u; 5852 5853 if (maxcount < 0) 5854 maxcount = PY_SSIZE_T_MAX; 5855 5856 if (str1->length == str2->length) { 5857 /* same length */ 5858 Py_ssize_t i; 5859 if (str1->length == 1) { 5860 /* replace characters */ 5861 Py_UNICODE u1, u2; 5862 if (!findchar(self->str, self->length, str1->str[0])) 5863 goto nothing; 5864 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5865 if (!u) 5866 return NULL; 5867 Py_UNICODE_COPY(u->str, self->str, self->length); 5868 u1 = str1->str[0]; 5869 u2 = str2->str[0]; 5870 for (i = 0; i < u->length; i++) 5871 if (u->str[i] == u1) { 5872 if (--maxcount < 0) 5873 break; 5874 u->str[i] = u2; 5875 } 5876 } else { 5877 i = fastsearch( 5878 self->str, self->length, str1->str, str1->length, FAST_SEARCH 5879 ); 5880 if (i < 0) 5881 goto nothing; 5882 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5883 if (!u) 5884 return NULL; 5885 Py_UNICODE_COPY(u->str, self->str, self->length); 5886 while (i <= self->length - str1->length) 5887 if (Py_UNICODE_MATCH(self, i, str1)) { 5888 if (--maxcount < 0) 5889 break; 5890 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5891 i += str1->length; 5892 } else 5893 i++; 5894 } 5895 } else { 5896 5897 Py_ssize_t n, i, j, e; 5898 Py_ssize_t product, new_size, delta; 5899 Py_UNICODE *p; 5900 5901 /* replace strings */ 5902 n = stringlib_count(self->str, self->length, str1->str, str1->length); 5903 if (n > maxcount) 5904 n = maxcount; 5905 if (n == 0) 5906 goto nothing; 5907 /* new_size = self->length + n * (str2->length - str1->length)); */ 5908 delta = (str2->length - str1->length); 5909 if (delta == 0) { 5910 new_size = self->length; 5911 } else { 5912 product = n * (str2->length - str1->length); 5913 if ((product / (str2->length - str1->length)) != n) { 5914 PyErr_SetString(PyExc_OverflowError, 5915 "replace string is too long"); 5916 return NULL; 5917 } 5918 new_size = self->length + product; 5919 if (new_size < 0) { 5920 PyErr_SetString(PyExc_OverflowError, 5921 "replace string is too long"); 5922 return NULL; 5923 } 5924 } 5925 u = _PyUnicode_New(new_size); 5926 if (!u) 5927 return NULL; 5928 i = 0; 5929 p = u->str; 5930 e = self->length - str1->length; 5931 if (str1->length > 0) { 5932 while (n-- > 0) { 5933 /* look for next match */ 5934 j = i; 5935 while (j <= e) { 5936 if (Py_UNICODE_MATCH(self, j, str1)) 5937 break; 5938 j++; 5939 } 5940 if (j > i) { 5941 if (j > e) 5942 break; 5943 /* copy unchanged part [i:j] */ 5944 Py_UNICODE_COPY(p, self->str+i, j-i); 5945 p += j - i; 5946 } 5947 /* copy substitution string */ 5948 if (str2->length > 0) { 5949 Py_UNICODE_COPY(p, str2->str, str2->length); 5950 p += str2->length; 5951 } 5952 i = j + str1->length; 5953 } 5954 if (i < self->length) 5955 /* copy tail [i:] */ 5956 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5957 } else { 5958 /* interleave */ 5959 while (n > 0) { 5960 Py_UNICODE_COPY(p, str2->str, str2->length); 5961 p += str2->length; 5962 if (--n <= 0) 5963 break; 5964 *p++ = self->str[i++]; 5965 } 5966 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5967 } 5968 } 5969 return (PyObject *) u; 5970 5971nothing: 5972 /* nothing to replace; return original string (when possible) */ 5973 if (PyUnicode_CheckExact(self)) { 5974 Py_INCREF(self); 5975 return (PyObject *) self; 5976 } 5977 return PyUnicode_FromUnicode(self->str, self->length); 5978} 5979 5980/* --- Unicode Object Methods --------------------------------------------- */ 5981 5982PyDoc_STRVAR(title__doc__, 5983"S.title() -> unicode\n\ 5984\n\ 5985Return a titlecased version of S, i.e. words start with title case\n\ 5986characters, all remaining cased characters have lower case."); 5987 5988static PyObject* 5989unicode_title(PyUnicodeObject *self) 5990{ 5991 return fixup(self, fixtitle); 5992} 5993 5994PyDoc_STRVAR(capitalize__doc__, 5995"S.capitalize() -> unicode\n\ 5996\n\ 5997Return a capitalized version of S, i.e. make the first character\n\ 5998have upper case."); 5999 6000static PyObject* 6001unicode_capitalize(PyUnicodeObject *self) 6002{ 6003 return fixup(self, fixcapitalize); 6004} 6005 6006#if 0 6007PyDoc_STRVAR(capwords__doc__, 6008"S.capwords() -> unicode\n\ 6009\n\ 6010Apply .capitalize() to all words in S and return the result with\n\ 6011normalized whitespace (all whitespace strings are replaced by ' ')."); 6012 6013static PyObject* 6014unicode_capwords(PyUnicodeObject *self) 6015{ 6016 PyObject *list; 6017 PyObject *item; 6018 Py_ssize_t i; 6019 6020 /* Split into words */ 6021 list = split(self, NULL, -1); 6022 if (!list) 6023 return NULL; 6024 6025 /* Capitalize each word */ 6026 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6027 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6028 fixcapitalize); 6029 if (item == NULL) 6030 goto onError; 6031 Py_DECREF(PyList_GET_ITEM(list, i)); 6032 PyList_SET_ITEM(list, i, item); 6033 } 6034 6035 /* Join the words to form a new string */ 6036 item = PyUnicode_Join(NULL, list); 6037 6038onError: 6039 Py_DECREF(list); 6040 return (PyObject *)item; 6041} 6042#endif 6043 6044/* Argument converter. Coerces to a single unicode character */ 6045 6046static int 6047convert_uc(PyObject *obj, void *addr) 6048{ 6049 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6050 PyObject *uniobj; 6051 Py_UNICODE *unistr; 6052 6053 uniobj = PyUnicode_FromObject(obj); 6054 if (uniobj == NULL) { 6055 PyErr_SetString(PyExc_TypeError, 6056 "The fill character cannot be converted to Unicode"); 6057 return 0; 6058 } 6059 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6060 PyErr_SetString(PyExc_TypeError, 6061 "The fill character must be exactly one character long"); 6062 Py_DECREF(uniobj); 6063 return 0; 6064 } 6065 unistr = PyUnicode_AS_UNICODE(uniobj); 6066 *fillcharloc = unistr[0]; 6067 Py_DECREF(uniobj); 6068 return 1; 6069} 6070 6071PyDoc_STRVAR(center__doc__, 6072"S.center(width[, fillchar]) -> unicode\n\ 6073\n\ 6074Return S centered in a Unicode string of length width. Padding is\n\ 6075done using the specified fill character (default is a space)"); 6076 6077static PyObject * 6078unicode_center(PyUnicodeObject *self, PyObject *args) 6079{ 6080 Py_ssize_t marg, left; 6081 Py_ssize_t width; 6082 Py_UNICODE fillchar = ' '; 6083 6084 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6085 return NULL; 6086 6087 if (self->length >= width && PyUnicode_CheckExact(self)) { 6088 Py_INCREF(self); 6089 return (PyObject*) self; 6090 } 6091 6092 marg = width - self->length; 6093 left = marg / 2 + (marg & width & 1); 6094 6095 return (PyObject*) pad(self, left, marg - left, fillchar); 6096} 6097 6098#if 0 6099 6100/* This code should go into some future Unicode collation support 6101 module. The basic comparison should compare ordinals on a naive 6102 basis (this is what Java does and thus JPython too). */ 6103 6104/* speedy UTF-16 code point order comparison */ 6105/* gleaned from: */ 6106/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6107 6108static short utf16Fixup[32] = 6109{ 6110 0, 0, 0, 0, 0, 0, 0, 0, 6111 0, 0, 0, 0, 0, 0, 0, 0, 6112 0, 0, 0, 0, 0, 0, 0, 0, 6113 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6114}; 6115 6116static int 6117unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6118{ 6119 Py_ssize_t len1, len2; 6120 6121 Py_UNICODE *s1 = str1->str; 6122 Py_UNICODE *s2 = str2->str; 6123 6124 len1 = str1->length; 6125 len2 = str2->length; 6126 6127 while (len1 > 0 && len2 > 0) { 6128 Py_UNICODE c1, c2; 6129 6130 c1 = *s1++; 6131 c2 = *s2++; 6132 6133 if (c1 > (1<<11) * 26) 6134 c1 += utf16Fixup[c1>>11]; 6135 if (c2 > (1<<11) * 26) 6136 c2 += utf16Fixup[c2>>11]; 6137 /* now c1 and c2 are in UTF-32-compatible order */ 6138 6139 if (c1 != c2) 6140 return (c1 < c2) ? -1 : 1; 6141 6142 len1--; len2--; 6143 } 6144 6145 return (len1 < len2) ? -1 : (len1 != len2); 6146} 6147 6148#else 6149 6150static int 6151unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6152{ 6153 register Py_ssize_t len1, len2; 6154 6155 Py_UNICODE *s1 = str1->str; 6156 Py_UNICODE *s2 = str2->str; 6157 6158 len1 = str1->length; 6159 len2 = str2->length; 6160 6161 while (len1 > 0 && len2 > 0) { 6162 Py_UNICODE c1, c2; 6163 6164 c1 = *s1++; 6165 c2 = *s2++; 6166 6167 if (c1 != c2) 6168 return (c1 < c2) ? -1 : 1; 6169 6170 len1--; len2--; 6171 } 6172 6173 return (len1 < len2) ? -1 : (len1 != len2); 6174} 6175 6176#endif 6177 6178int PyUnicode_Compare(PyObject *left, 6179 PyObject *right) 6180{ 6181 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6182 return unicode_compare((PyUnicodeObject *)left, 6183 (PyUnicodeObject *)right); 6184 if ((PyString_Check(left) && PyUnicode_Check(right)) || 6185 (PyUnicode_Check(left) && PyString_Check(right))) { 6186 if (PyUnicode_Check(left)) 6187 left = _PyUnicode_AsDefaultEncodedString(left, NULL); 6188 if (PyUnicode_Check(right)) 6189 right = _PyUnicode_AsDefaultEncodedString(right, NULL); 6190 assert(PyString_Check(left)); 6191 assert(PyString_Check(right)); 6192 return PyObject_Compare(left, right); 6193 } 6194 PyErr_Format(PyExc_TypeError, 6195 "Can't compare %.100s and %.100s", 6196 left->ob_type->tp_name, 6197 right->ob_type->tp_name); 6198 return -1; 6199} 6200 6201int 6202PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6203{ 6204 int i; 6205 Py_UNICODE *id; 6206 assert(PyUnicode_Check(uni)); 6207 id = PyUnicode_AS_UNICODE(uni); 6208 /* Compare Unicode string and source character set string */ 6209 for (i = 0; id[i] && str[i]; i++) 6210 if (id[i] != str[i]) 6211 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6212 if (id[i]) 6213 return 1; /* uni is longer */ 6214 if (str[i]) 6215 return -1; /* str is longer */ 6216 return 0; 6217} 6218 6219PyObject *PyUnicode_RichCompare(PyObject *left, 6220 PyObject *right, 6221 int op) 6222{ 6223 int result; 6224 6225 result = PyUnicode_Compare(left, right); 6226 if (result == -1 && PyErr_Occurred()) 6227 goto onError; 6228 6229 /* Convert the return value to a Boolean */ 6230 switch (op) { 6231 case Py_EQ: 6232 result = (result == 0); 6233 break; 6234 case Py_NE: 6235 result = (result != 0); 6236 break; 6237 case Py_LE: 6238 result = (result <= 0); 6239 break; 6240 case Py_GE: 6241 result = (result >= 0); 6242 break; 6243 case Py_LT: 6244 result = (result == -1); 6245 break; 6246 case Py_GT: 6247 result = (result == 1); 6248 break; 6249 } 6250 return PyBool_FromLong(result); 6251 6252 onError: 6253 6254 /* Standard case 6255 6256 Type errors mean that PyUnicode_FromObject() could not convert 6257 one of the arguments (usually the right hand side) to Unicode, 6258 ie. we can't handle the comparison request. However, it is 6259 possible that the other object knows a comparison method, which 6260 is why we return Py_NotImplemented to give the other object a 6261 chance. 6262 6263 */ 6264 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6265 PyErr_Clear(); 6266 Py_INCREF(Py_NotImplemented); 6267 return Py_NotImplemented; 6268 } 6269 if (op != Py_EQ && op != Py_NE) 6270 return NULL; 6271 6272 /* Equality comparison. 6273 6274 This is a special case: we silence any PyExc_UnicodeDecodeError 6275 and instead turn it into a PyErr_UnicodeWarning. 6276 6277 */ 6278 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6279 return NULL; 6280 PyErr_Clear(); 6281 if (PyErr_WarnEx(PyExc_UnicodeWarning, 6282 (op == Py_EQ) ? 6283 "Unicode equal comparison " 6284 "failed to convert both arguments to Unicode - " 6285 "interpreting them as being unequal" 6286 : 6287 "Unicode unequal comparison " 6288 "failed to convert both arguments to Unicode - " 6289 "interpreting them as being unequal", 6290 1) < 0) 6291 return NULL; 6292 result = (op == Py_NE); 6293 return PyBool_FromLong(result); 6294} 6295 6296int PyUnicode_Contains(PyObject *container, 6297 PyObject *element) 6298{ 6299 PyObject *str, *sub; 6300 int result; 6301 6302 /* Coerce the two arguments */ 6303 sub = PyUnicode_FromObject(element); 6304 if (!sub) { 6305 PyErr_Format(PyExc_TypeError, 6306 "'in <string>' requires string as left operand, not %s", 6307 element->ob_type->tp_name); 6308 return -1; 6309 } 6310 6311 str = PyUnicode_FromObject(container); 6312 if (!str) { 6313 Py_DECREF(sub); 6314 return -1; 6315 } 6316 6317 result = stringlib_contains_obj(str, sub); 6318 6319 Py_DECREF(str); 6320 Py_DECREF(sub); 6321 6322 return result; 6323} 6324 6325/* Concat to string or Unicode object giving a new Unicode object. */ 6326 6327PyObject *PyUnicode_Concat(PyObject *left, 6328 PyObject *right) 6329{ 6330 PyUnicodeObject *u = NULL, *v = NULL, *w; 6331 6332 if (PyBytes_Check(left) || PyBytes_Check(right)) 6333 return PyBytes_Concat(left, right); 6334 6335 /* Coerce the two arguments */ 6336 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6337 if (u == NULL) 6338 goto onError; 6339 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6340 if (v == NULL) 6341 goto onError; 6342 6343 /* Shortcuts */ 6344 if (v == unicode_empty) { 6345 Py_DECREF(v); 6346 return (PyObject *)u; 6347 } 6348 if (u == unicode_empty) { 6349 Py_DECREF(u); 6350 return (PyObject *)v; 6351 } 6352 6353 /* Concat the two Unicode strings */ 6354 w = _PyUnicode_New(u->length + v->length); 6355 if (w == NULL) 6356 goto onError; 6357 Py_UNICODE_COPY(w->str, u->str, u->length); 6358 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6359 6360 Py_DECREF(u); 6361 Py_DECREF(v); 6362 return (PyObject *)w; 6363 6364onError: 6365 Py_XDECREF(u); 6366 Py_XDECREF(v); 6367 return NULL; 6368} 6369 6370void 6371PyUnicode_Append(PyObject **pleft, PyObject *right) 6372{ 6373 PyObject *new; 6374 if (*pleft == NULL) 6375 return; 6376 if (right == NULL || !PyUnicode_Check(*pleft)) { 6377 Py_DECREF(*pleft); 6378 *pleft = NULL; 6379 return; 6380 } 6381 new = PyUnicode_Concat(*pleft, right); 6382 Py_DECREF(*pleft); 6383 *pleft = new; 6384} 6385 6386void 6387PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6388{ 6389 PyUnicode_Append(pleft, right); 6390 Py_XDECREF(right); 6391} 6392 6393PyDoc_STRVAR(count__doc__, 6394"S.count(sub[, start[, end]]) -> int\n\ 6395\n\ 6396Return the number of non-overlapping occurrences of substring sub in\n\ 6397Unicode string S[start:end]. Optional arguments start and end are\n\ 6398interpreted as in slice notation."); 6399 6400static PyObject * 6401unicode_count(PyUnicodeObject *self, PyObject *args) 6402{ 6403 PyUnicodeObject *substring; 6404 Py_ssize_t start = 0; 6405 Py_ssize_t end = PY_SSIZE_T_MAX; 6406 PyObject *result; 6407 6408 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6409 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6410 return NULL; 6411 6412 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6413 (PyObject *)substring); 6414 if (substring == NULL) 6415 return NULL; 6416 6417 FIX_START_END(self); 6418 6419 result = PyInt_FromSsize_t( 6420 stringlib_count(self->str + start, end - start, 6421 substring->str, substring->length) 6422 ); 6423 6424 Py_DECREF(substring); 6425 6426 return result; 6427} 6428 6429PyDoc_STRVAR(encode__doc__, 6430"S.encode([encoding[,errors]]) -> string or unicode\n\ 6431\n\ 6432Encodes S using the codec registered for encoding. encoding defaults\n\ 6433to the default encoding. errors may be given to set a different error\n\ 6434handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6435a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 6436'xmlcharrefreplace' as well as any other name registered with\n\ 6437codecs.register_error that can handle UnicodeEncodeErrors."); 6438 6439static PyObject * 6440unicode_encode(PyUnicodeObject *self, PyObject *args) 6441{ 6442 char *encoding = NULL; 6443 char *errors = NULL; 6444 PyObject *v; 6445 6446 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6447 return NULL; 6448 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 6449 if (v == NULL) 6450 goto onError; 6451 if (!PyBytes_Check(v)) { 6452 PyErr_Format(PyExc_TypeError, 6453 "encoder did not return a bytes object " 6454 "(type=%.400s)", 6455 Py_Type(v)->tp_name); 6456 Py_DECREF(v); 6457 return NULL; 6458 } 6459 return v; 6460 6461 onError: 6462 return NULL; 6463} 6464 6465PyDoc_STRVAR(expandtabs__doc__, 6466"S.expandtabs([tabsize]) -> unicode\n\ 6467\n\ 6468Return a copy of S where all tab characters are expanded using spaces.\n\ 6469If tabsize is not given, a tab size of 8 characters is assumed."); 6470 6471static PyObject* 6472unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6473{ 6474 Py_UNICODE *e; 6475 Py_UNICODE *p; 6476 Py_UNICODE *q; 6477 Py_ssize_t i, j, old_j; 6478 PyUnicodeObject *u; 6479 int tabsize = 8; 6480 6481 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6482 return NULL; 6483 6484 /* First pass: determine size of output string */ 6485 i = j = old_j = 0; 6486 e = self->str + self->length; 6487 for (p = self->str; p < e; p++) 6488 if (*p == '\t') { 6489 if (tabsize > 0) { 6490 j += tabsize - (j % tabsize); 6491 if (old_j > j) { 6492 PyErr_SetString(PyExc_OverflowError, 6493 "new string is too long"); 6494 return NULL; 6495 } 6496 old_j = j; 6497 } 6498 } 6499 else { 6500 j++; 6501 if (*p == '\n' || *p == '\r') { 6502 i += j; 6503 old_j = j = 0; 6504 if (i < 0) { 6505 PyErr_SetString(PyExc_OverflowError, 6506 "new string is too long"); 6507 return NULL; 6508 } 6509 } 6510 } 6511 6512 if ((i + j) < 0) { 6513 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6514 return NULL; 6515 } 6516 6517 /* Second pass: create output string and fill it */ 6518 u = _PyUnicode_New(i + j); 6519 if (!u) 6520 return NULL; 6521 6522 j = 0; 6523 q = u->str; 6524 6525 for (p = self->str; p < e; p++) 6526 if (*p == '\t') { 6527 if (tabsize > 0) { 6528 i = tabsize - (j % tabsize); 6529 j += i; 6530 while (i--) 6531 *q++ = ' '; 6532 } 6533 } 6534 else { 6535 j++; 6536 *q++ = *p; 6537 if (*p == '\n' || *p == '\r') 6538 j = 0; 6539 } 6540 6541 return (PyObject*) u; 6542} 6543 6544PyDoc_STRVAR(find__doc__, 6545"S.find(sub [,start [,end]]) -> int\n\ 6546\n\ 6547Return the lowest index in S where substring sub is found,\n\ 6548such that sub is contained within s[start:end]. Optional\n\ 6549arguments start and end are interpreted as in slice notation.\n\ 6550\n\ 6551Return -1 on failure."); 6552 6553static PyObject * 6554unicode_find(PyUnicodeObject *self, PyObject *args) 6555{ 6556 PyObject *substring; 6557 Py_ssize_t start = 0; 6558 Py_ssize_t end = PY_SSIZE_T_MAX; 6559 Py_ssize_t result; 6560 6561 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 6562 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6563 return NULL; 6564 substring = PyUnicode_FromObject(substring); 6565 if (!substring) 6566 return NULL; 6567 6568 result = stringlib_find_slice( 6569 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6570 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6571 start, end 6572 ); 6573 6574 Py_DECREF(substring); 6575 6576 return PyInt_FromSsize_t(result); 6577} 6578 6579static PyObject * 6580unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6581{ 6582 if (index < 0 || index >= self->length) { 6583 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6584 return NULL; 6585 } 6586 6587 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6588} 6589 6590/* Believe it or not, this produces the same value for ASCII strings 6591 as string_hash(). */ 6592static long 6593unicode_hash(PyUnicodeObject *self) 6594{ 6595 Py_ssize_t len; 6596 Py_UNICODE *p; 6597 long x; 6598 6599 if (self->hash != -1) 6600 return self->hash; 6601 len = Py_Size(self); 6602 p = self->str; 6603 x = *p << 7; 6604 while (--len >= 0) 6605 x = (1000003*x) ^ *p++; 6606 x ^= Py_Size(self); 6607 if (x == -1) 6608 x = -2; 6609 self->hash = x; 6610 return x; 6611} 6612 6613PyDoc_STRVAR(index__doc__, 6614"S.index(sub [,start [,end]]) -> int\n\ 6615\n\ 6616Like S.find() but raise ValueError when the substring is not found."); 6617 6618static PyObject * 6619unicode_index(PyUnicodeObject *self, PyObject *args) 6620{ 6621 Py_ssize_t result; 6622 PyObject *substring; 6623 Py_ssize_t start = 0; 6624 Py_ssize_t end = PY_SSIZE_T_MAX; 6625 6626 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 6627 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6628 return NULL; 6629 substring = PyUnicode_FromObject(substring); 6630 if (!substring) 6631 return NULL; 6632 6633 result = stringlib_find_slice( 6634 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6635 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6636 start, end 6637 ); 6638 6639 Py_DECREF(substring); 6640 6641 if (result < 0) { 6642 PyErr_SetString(PyExc_ValueError, "substring not found"); 6643 return NULL; 6644 } 6645 6646 return PyInt_FromSsize_t(result); 6647} 6648 6649PyDoc_STRVAR(islower__doc__, 6650"S.islower() -> bool\n\ 6651\n\ 6652Return True if all cased characters in S are lowercase and there is\n\ 6653at least one cased character in S, False otherwise."); 6654 6655static PyObject* 6656unicode_islower(PyUnicodeObject *self) 6657{ 6658 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6659 register const Py_UNICODE *e; 6660 int cased; 6661 6662 /* Shortcut for single character strings */ 6663 if (PyUnicode_GET_SIZE(self) == 1) 6664 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6665 6666 /* Special case for empty strings */ 6667 if (PyUnicode_GET_SIZE(self) == 0) 6668 return PyBool_FromLong(0); 6669 6670 e = p + PyUnicode_GET_SIZE(self); 6671 cased = 0; 6672 for (; p < e; p++) { 6673 register const Py_UNICODE ch = *p; 6674 6675 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6676 return PyBool_FromLong(0); 6677 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6678 cased = 1; 6679 } 6680 return PyBool_FromLong(cased); 6681} 6682 6683PyDoc_STRVAR(isupper__doc__, 6684"S.isupper() -> bool\n\ 6685\n\ 6686Return True if all cased characters in S are uppercase and there is\n\ 6687at least one cased character in S, False otherwise."); 6688 6689static PyObject* 6690unicode_isupper(PyUnicodeObject *self) 6691{ 6692 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6693 register const Py_UNICODE *e; 6694 int cased; 6695 6696 /* Shortcut for single character strings */ 6697 if (PyUnicode_GET_SIZE(self) == 1) 6698 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6699 6700 /* Special case for empty strings */ 6701 if (PyUnicode_GET_SIZE(self) == 0) 6702 return PyBool_FromLong(0); 6703 6704 e = p + PyUnicode_GET_SIZE(self); 6705 cased = 0; 6706 for (; p < e; p++) { 6707 register const Py_UNICODE ch = *p; 6708 6709 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6710 return PyBool_FromLong(0); 6711 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6712 cased = 1; 6713 } 6714 return PyBool_FromLong(cased); 6715} 6716 6717PyDoc_STRVAR(istitle__doc__, 6718"S.istitle() -> bool\n\ 6719\n\ 6720Return True if S is a titlecased string and there is at least one\n\ 6721character in S, i.e. upper- and titlecase characters may only\n\ 6722follow uncased characters and lowercase characters only cased ones.\n\ 6723Return False otherwise."); 6724 6725static PyObject* 6726unicode_istitle(PyUnicodeObject *self) 6727{ 6728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6729 register const Py_UNICODE *e; 6730 int cased, previous_is_cased; 6731 6732 /* Shortcut for single character strings */ 6733 if (PyUnicode_GET_SIZE(self) == 1) 6734 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6735 (Py_UNICODE_ISUPPER(*p) != 0)); 6736 6737 /* Special case for empty strings */ 6738 if (PyUnicode_GET_SIZE(self) == 0) 6739 return PyBool_FromLong(0); 6740 6741 e = p + PyUnicode_GET_SIZE(self); 6742 cased = 0; 6743 previous_is_cased = 0; 6744 for (; p < e; p++) { 6745 register const Py_UNICODE ch = *p; 6746 6747 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6748 if (previous_is_cased) 6749 return PyBool_FromLong(0); 6750 previous_is_cased = 1; 6751 cased = 1; 6752 } 6753 else if (Py_UNICODE_ISLOWER(ch)) { 6754 if (!previous_is_cased) 6755 return PyBool_FromLong(0); 6756 previous_is_cased = 1; 6757 cased = 1; 6758 } 6759 else 6760 previous_is_cased = 0; 6761 } 6762 return PyBool_FromLong(cased); 6763} 6764 6765PyDoc_STRVAR(isspace__doc__, 6766"S.isspace() -> bool\n\ 6767\n\ 6768Return True if all characters in S are whitespace\n\ 6769and there is at least one character in S, False otherwise."); 6770 6771static PyObject* 6772unicode_isspace(PyUnicodeObject *self) 6773{ 6774 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6775 register const Py_UNICODE *e; 6776 6777 /* Shortcut for single character strings */ 6778 if (PyUnicode_GET_SIZE(self) == 1 && 6779 Py_UNICODE_ISSPACE(*p)) 6780 return PyBool_FromLong(1); 6781 6782 /* Special case for empty strings */ 6783 if (PyUnicode_GET_SIZE(self) == 0) 6784 return PyBool_FromLong(0); 6785 6786 e = p + PyUnicode_GET_SIZE(self); 6787 for (; p < e; p++) { 6788 if (!Py_UNICODE_ISSPACE(*p)) 6789 return PyBool_FromLong(0); 6790 } 6791 return PyBool_FromLong(1); 6792} 6793 6794PyDoc_STRVAR(isalpha__doc__, 6795"S.isalpha() -> bool\n\ 6796\n\ 6797Return True if all characters in S are alphabetic\n\ 6798and there is at least one character in S, False otherwise."); 6799 6800static PyObject* 6801unicode_isalpha(PyUnicodeObject *self) 6802{ 6803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6804 register const Py_UNICODE *e; 6805 6806 /* Shortcut for single character strings */ 6807 if (PyUnicode_GET_SIZE(self) == 1 && 6808 Py_UNICODE_ISALPHA(*p)) 6809 return PyBool_FromLong(1); 6810 6811 /* Special case for empty strings */ 6812 if (PyUnicode_GET_SIZE(self) == 0) 6813 return PyBool_FromLong(0); 6814 6815 e = p + PyUnicode_GET_SIZE(self); 6816 for (; p < e; p++) { 6817 if (!Py_UNICODE_ISALPHA(*p)) 6818 return PyBool_FromLong(0); 6819 } 6820 return PyBool_FromLong(1); 6821} 6822 6823PyDoc_STRVAR(isalnum__doc__, 6824"S.isalnum() -> bool\n\ 6825\n\ 6826Return True if all characters in S are alphanumeric\n\ 6827and there is at least one character in S, False otherwise."); 6828 6829static PyObject* 6830unicode_isalnum(PyUnicodeObject *self) 6831{ 6832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6833 register const Py_UNICODE *e; 6834 6835 /* Shortcut for single character strings */ 6836 if (PyUnicode_GET_SIZE(self) == 1 && 6837 Py_UNICODE_ISALNUM(*p)) 6838 return PyBool_FromLong(1); 6839 6840 /* Special case for empty strings */ 6841 if (PyUnicode_GET_SIZE(self) == 0) 6842 return PyBool_FromLong(0); 6843 6844 e = p + PyUnicode_GET_SIZE(self); 6845 for (; p < e; p++) { 6846 if (!Py_UNICODE_ISALNUM(*p)) 6847 return PyBool_FromLong(0); 6848 } 6849 return PyBool_FromLong(1); 6850} 6851 6852PyDoc_STRVAR(isdecimal__doc__, 6853"S.isdecimal() -> bool\n\ 6854\n\ 6855Return True if there are only decimal characters in S,\n\ 6856False otherwise."); 6857 6858static PyObject* 6859unicode_isdecimal(PyUnicodeObject *self) 6860{ 6861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6862 register const Py_UNICODE *e; 6863 6864 /* Shortcut for single character strings */ 6865 if (PyUnicode_GET_SIZE(self) == 1 && 6866 Py_UNICODE_ISDECIMAL(*p)) 6867 return PyBool_FromLong(1); 6868 6869 /* Special case for empty strings */ 6870 if (PyUnicode_GET_SIZE(self) == 0) 6871 return PyBool_FromLong(0); 6872 6873 e = p + PyUnicode_GET_SIZE(self); 6874 for (; p < e; p++) { 6875 if (!Py_UNICODE_ISDECIMAL(*p)) 6876 return PyBool_FromLong(0); 6877 } 6878 return PyBool_FromLong(1); 6879} 6880 6881PyDoc_STRVAR(isdigit__doc__, 6882"S.isdigit() -> bool\n\ 6883\n\ 6884Return True if all characters in S are digits\n\ 6885and there is at least one character in S, False otherwise."); 6886 6887static PyObject* 6888unicode_isdigit(PyUnicodeObject *self) 6889{ 6890 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6891 register const Py_UNICODE *e; 6892 6893 /* Shortcut for single character strings */ 6894 if (PyUnicode_GET_SIZE(self) == 1 && 6895 Py_UNICODE_ISDIGIT(*p)) 6896 return PyBool_FromLong(1); 6897 6898 /* Special case for empty strings */ 6899 if (PyUnicode_GET_SIZE(self) == 0) 6900 return PyBool_FromLong(0); 6901 6902 e = p + PyUnicode_GET_SIZE(self); 6903 for (; p < e; p++) { 6904 if (!Py_UNICODE_ISDIGIT(*p)) 6905 return PyBool_FromLong(0); 6906 } 6907 return PyBool_FromLong(1); 6908} 6909 6910PyDoc_STRVAR(isnumeric__doc__, 6911"S.isnumeric() -> bool\n\ 6912\n\ 6913Return True if there are only numeric characters in S,\n\ 6914False otherwise."); 6915 6916static PyObject* 6917unicode_isnumeric(PyUnicodeObject *self) 6918{ 6919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6920 register const Py_UNICODE *e; 6921 6922 /* Shortcut for single character strings */ 6923 if (PyUnicode_GET_SIZE(self) == 1 && 6924 Py_UNICODE_ISNUMERIC(*p)) 6925 return PyBool_FromLong(1); 6926 6927 /* Special case for empty strings */ 6928 if (PyUnicode_GET_SIZE(self) == 0) 6929 return PyBool_FromLong(0); 6930 6931 e = p + PyUnicode_GET_SIZE(self); 6932 for (; p < e; p++) { 6933 if (!Py_UNICODE_ISNUMERIC(*p)) 6934 return PyBool_FromLong(0); 6935 } 6936 return PyBool_FromLong(1); 6937} 6938 6939int 6940PyUnicode_IsIdentifier(PyObject *self) 6941{ 6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 6943 register const Py_UNICODE *e; 6944 6945 /* Special case for empty strings */ 6946 if (PyUnicode_GET_SIZE(self) == 0) 6947 return 0; 6948 6949 /* PEP 3131 says that the first character must be in 6950 XID_Start and subsequent characters in XID_Continue, 6951 and for the ASCII range, the 2.x rules apply (i.e 6952 start with letters and underscore, continue with 6953 letters, digits, underscore). However, given the current 6954 definition of XID_Start and XID_Continue, it is sufficient 6955 to check just for these, except that _ must be allowed 6956 as starting an identifier. */ 6957 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 6958 return 0; 6959 6960 e = p + PyUnicode_GET_SIZE(self); 6961 for (p++; p < e; p++) { 6962 if (!_PyUnicode_IsXidContinue(*p)) 6963 return 0; 6964 } 6965 return 1; 6966} 6967 6968PyDoc_STRVAR(isidentifier__doc__, 6969"S.isidentifier() -> bool\n\ 6970\n\ 6971Return True if S is a valid identifier according\n\ 6972to the language definition."); 6973 6974static PyObject* 6975unicode_isidentifier(PyObject *self) 6976{ 6977 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 6978} 6979 6980PyDoc_STRVAR(join__doc__, 6981"S.join(sequence) -> unicode\n\ 6982\n\ 6983Return a string which is the concatenation of the strings in the\n\ 6984sequence. The separator between elements is S."); 6985 6986static PyObject* 6987unicode_join(PyObject *self, PyObject *data) 6988{ 6989 return PyUnicode_Join(self, data); 6990} 6991 6992static Py_ssize_t 6993unicode_length(PyUnicodeObject *self) 6994{ 6995 return self->length; 6996} 6997 6998PyDoc_STRVAR(ljust__doc__, 6999"S.ljust(width[, fillchar]) -> int\n\ 7000\n\ 7001Return S left justified in a Unicode string of length width. Padding is\n\ 7002done using the specified fill character (default is a space)."); 7003 7004static PyObject * 7005unicode_ljust(PyUnicodeObject *self, PyObject *args) 7006{ 7007 Py_ssize_t width; 7008 Py_UNICODE fillchar = ' '; 7009 7010 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7011 return NULL; 7012 7013 if (self->length >= width && PyUnicode_CheckExact(self)) { 7014 Py_INCREF(self); 7015 return (PyObject*) self; 7016 } 7017 7018 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7019} 7020 7021PyDoc_STRVAR(lower__doc__, 7022"S.lower() -> unicode\n\ 7023\n\ 7024Return a copy of the string S converted to lowercase."); 7025 7026static PyObject* 7027unicode_lower(PyUnicodeObject *self) 7028{ 7029 return fixup(self, fixlower); 7030} 7031 7032#define LEFTSTRIP 0 7033#define RIGHTSTRIP 1 7034#define BOTHSTRIP 2 7035 7036/* Arrays indexed by above */ 7037static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7038 7039#define STRIPNAME(i) (stripformat[i]+3) 7040 7041/* externally visible for str.strip(unicode) */ 7042PyObject * 7043_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7044{ 7045 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7046 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7047 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7048 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7049 Py_ssize_t i, j; 7050 7051 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7052 7053 i = 0; 7054 if (striptype != RIGHTSTRIP) { 7055 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7056 i++; 7057 } 7058 } 7059 7060 j = len; 7061 if (striptype != LEFTSTRIP) { 7062 do { 7063 j--; 7064 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7065 j++; 7066 } 7067 7068 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7069 Py_INCREF(self); 7070 return (PyObject*)self; 7071 } 7072 else 7073 return PyUnicode_FromUnicode(s+i, j-i); 7074} 7075 7076 7077static PyObject * 7078do_strip(PyUnicodeObject *self, int striptype) 7079{ 7080 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7081 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7082 7083 i = 0; 7084 if (striptype != RIGHTSTRIP) { 7085 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7086 i++; 7087 } 7088 } 7089 7090 j = len; 7091 if (striptype != LEFTSTRIP) { 7092 do { 7093 j--; 7094 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7095 j++; 7096 } 7097 7098 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7099 Py_INCREF(self); 7100 return (PyObject*)self; 7101 } 7102 else 7103 return PyUnicode_FromUnicode(s+i, j-i); 7104} 7105 7106 7107static PyObject * 7108do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7109{ 7110 PyObject *sep = NULL; 7111 7112 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7113 return NULL; 7114 7115 if (sep != NULL && sep != Py_None) { 7116 if (PyUnicode_Check(sep)) 7117 return _PyUnicode_XStrip(self, striptype, sep); 7118 else if (PyString_Check(sep)) { 7119 PyObject *res; 7120 sep = PyUnicode_FromObject(sep); 7121 if (sep==NULL) 7122 return NULL; 7123 res = _PyUnicode_XStrip(self, striptype, sep); 7124 Py_DECREF(sep); 7125 return res; 7126 } 7127 else { 7128 PyErr_Format(PyExc_TypeError, 7129 "%s arg must be None, unicode or str", 7130 STRIPNAME(striptype)); 7131 return NULL; 7132 } 7133 } 7134 7135 return do_strip(self, striptype); 7136} 7137 7138 7139PyDoc_STRVAR(strip__doc__, 7140"S.strip([chars]) -> unicode\n\ 7141\n\ 7142Return a copy of the string S with leading and trailing\n\ 7143whitespace removed.\n\ 7144If chars is given and not None, remove characters in chars instead.\n\ 7145If chars is a str, it will be converted to unicode before stripping"); 7146 7147static PyObject * 7148unicode_strip(PyUnicodeObject *self, PyObject *args) 7149{ 7150 if (PyTuple_GET_SIZE(args) == 0) 7151 return do_strip(self, BOTHSTRIP); /* Common case */ 7152 else 7153 return do_argstrip(self, BOTHSTRIP, args); 7154} 7155 7156 7157PyDoc_STRVAR(lstrip__doc__, 7158"S.lstrip([chars]) -> unicode\n\ 7159\n\ 7160Return a copy of the string S with leading whitespace removed.\n\ 7161If chars is given and not None, remove characters in chars instead.\n\ 7162If chars is a str, it will be converted to unicode before stripping"); 7163 7164static PyObject * 7165unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7166{ 7167 if (PyTuple_GET_SIZE(args) == 0) 7168 return do_strip(self, LEFTSTRIP); /* Common case */ 7169 else 7170 return do_argstrip(self, LEFTSTRIP, args); 7171} 7172 7173 7174PyDoc_STRVAR(rstrip__doc__, 7175"S.rstrip([chars]) -> unicode\n\ 7176\n\ 7177Return a copy of the string S with trailing whitespace removed.\n\ 7178If chars is given and not None, remove characters in chars instead.\n\ 7179If chars is a str, it will be converted to unicode before stripping"); 7180 7181static PyObject * 7182unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7183{ 7184 if (PyTuple_GET_SIZE(args) == 0) 7185 return do_strip(self, RIGHTSTRIP); /* Common case */ 7186 else 7187 return do_argstrip(self, RIGHTSTRIP, args); 7188} 7189 7190 7191static PyObject* 7192unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7193{ 7194 PyUnicodeObject *u; 7195 Py_UNICODE *p; 7196 Py_ssize_t nchars; 7197 size_t nbytes; 7198 7199 if (len < 0) 7200 len = 0; 7201 7202 if (len == 1 && PyUnicode_CheckExact(str)) { 7203 /* no repeat, return original string */ 7204 Py_INCREF(str); 7205 return (PyObject*) str; 7206 } 7207 7208 /* ensure # of chars needed doesn't overflow int and # of bytes 7209 * needed doesn't overflow size_t 7210 */ 7211 nchars = len * str->length; 7212 if (len && nchars / len != str->length) { 7213 PyErr_SetString(PyExc_OverflowError, 7214 "repeated string is too long"); 7215 return NULL; 7216 } 7217 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7218 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7219 PyErr_SetString(PyExc_OverflowError, 7220 "repeated string is too long"); 7221 return NULL; 7222 } 7223 u = _PyUnicode_New(nchars); 7224 if (!u) 7225 return NULL; 7226 7227 p = u->str; 7228 7229 if (str->length == 1 && len > 0) { 7230 Py_UNICODE_FILL(p, str->str[0], len); 7231 } else { 7232 Py_ssize_t done = 0; /* number of characters copied this far */ 7233 if (done < nchars) { 7234 Py_UNICODE_COPY(p, str->str, str->length); 7235 done = str->length; 7236 } 7237 while (done < nchars) { 7238 int n = (done <= nchars-done) ? done : nchars-done; 7239 Py_UNICODE_COPY(p+done, p, n); 7240 done += n; 7241 } 7242 } 7243 7244 return (PyObject*) u; 7245} 7246 7247PyObject *PyUnicode_Replace(PyObject *obj, 7248 PyObject *subobj, 7249 PyObject *replobj, 7250 Py_ssize_t maxcount) 7251{ 7252 PyObject *self; 7253 PyObject *str1; 7254 PyObject *str2; 7255 PyObject *result; 7256 7257 self = PyUnicode_FromObject(obj); 7258 if (self == NULL) 7259 return NULL; 7260 str1 = PyUnicode_FromObject(subobj); 7261 if (str1 == NULL) { 7262 Py_DECREF(self); 7263 return NULL; 7264 } 7265 str2 = PyUnicode_FromObject(replobj); 7266 if (str2 == NULL) { 7267 Py_DECREF(self); 7268 Py_DECREF(str1); 7269 return NULL; 7270 } 7271 result = replace((PyUnicodeObject *)self, 7272 (PyUnicodeObject *)str1, 7273 (PyUnicodeObject *)str2, 7274 maxcount); 7275 Py_DECREF(self); 7276 Py_DECREF(str1); 7277 Py_DECREF(str2); 7278 return result; 7279} 7280 7281PyDoc_STRVAR(replace__doc__, 7282"S.replace (old, new[, maxsplit]) -> unicode\n\ 7283\n\ 7284Return a copy of S with all occurrences of substring\n\ 7285old replaced by new. If the optional argument maxsplit is\n\ 7286given, only the first maxsplit occurrences are replaced."); 7287 7288static PyObject* 7289unicode_replace(PyUnicodeObject *self, PyObject *args) 7290{ 7291 PyUnicodeObject *str1; 7292 PyUnicodeObject *str2; 7293 Py_ssize_t maxcount = -1; 7294 PyObject *result; 7295 7296 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7297 return NULL; 7298 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7299 if (str1 == NULL) 7300 return NULL; 7301 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7302 if (str2 == NULL) { 7303 Py_DECREF(str1); 7304 return NULL; 7305 } 7306 7307 result = replace(self, str1, str2, maxcount); 7308 7309 Py_DECREF(str1); 7310 Py_DECREF(str2); 7311 return result; 7312} 7313 7314static 7315PyObject *unicode_repr(PyObject *unicode) 7316{ 7317 PyObject *repr; 7318 Py_UNICODE *p; 7319 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7320 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7321 7322 /* XXX(nnorwitz): rather than over-allocating, it would be 7323 better to choose a different scheme. Perhaps scan the 7324 first N-chars of the string and allocate based on that size. 7325 */ 7326 /* Initial allocation is based on the longest-possible unichr 7327 escape. 7328 7329 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7330 unichr, so in this case it's the longest unichr escape. In 7331 narrow (UTF-16) builds this is five chars per source unichr 7332 since there are two unichrs in the surrogate pair, so in narrow 7333 (UTF-16) builds it's not the longest unichr escape. 7334 7335 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7336 so in the narrow (UTF-16) build case it's the longest unichr 7337 escape. 7338 */ 7339 7340 repr = PyUnicode_FromUnicode(NULL, 7341 2 /* quotes */ 7342#ifdef Py_UNICODE_WIDE 7343 + 10*size 7344#else 7345 + 6*size 7346#endif 7347 + 1); 7348 if (repr == NULL) 7349 return NULL; 7350 7351 p = PyUnicode_AS_UNICODE(repr); 7352 7353 /* Add quote */ 7354 *p++ = (findchar(s, size, '\'') && 7355 !findchar(s, size, '"')) ? '"' : '\''; 7356 while (size-- > 0) { 7357 Py_UNICODE ch = *s++; 7358 7359 /* Escape quotes and backslashes */ 7360 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7361 *p++ = '\\'; 7362 *p++ = ch; 7363 continue; 7364 } 7365 7366#ifdef Py_UNICODE_WIDE 7367 /* Map 21-bit characters to '\U00xxxxxx' */ 7368 else if (ch >= 0x10000) { 7369 *p++ = '\\'; 7370 *p++ = 'U'; 7371 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 7372 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 7373 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 7374 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 7375 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 7376 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 7377 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 7378 *p++ = hexdigits[ch & 0x0000000F]; 7379 continue; 7380 } 7381#else 7382 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 7383 else if (ch >= 0xD800 && ch < 0xDC00) { 7384 Py_UNICODE ch2; 7385 Py_UCS4 ucs; 7386 7387 ch2 = *s++; 7388 size--; 7389 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 7390 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 7391 *p++ = '\\'; 7392 *p++ = 'U'; 7393 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7394 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7395 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7396 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7397 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7398 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7399 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7400 *p++ = hexdigits[ucs & 0x0000000F]; 7401 continue; 7402 } 7403 /* Fall through: isolated surrogates are copied as-is */ 7404 s--; 7405 size++; 7406 } 7407#endif 7408 7409 /* Map 16-bit characters to '\uxxxx' */ 7410 if (ch >= 256) { 7411 *p++ = '\\'; 7412 *p++ = 'u'; 7413 *p++ = hexdigits[(ch >> 12) & 0x000F]; 7414 *p++ = hexdigits[(ch >> 8) & 0x000F]; 7415 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7416 *p++ = hexdigits[ch & 0x000F]; 7417 } 7418 7419 /* Map special whitespace to '\t', \n', '\r' */ 7420 else if (ch == '\t') { 7421 *p++ = '\\'; 7422 *p++ = 't'; 7423 } 7424 else if (ch == '\n') { 7425 *p++ = '\\'; 7426 *p++ = 'n'; 7427 } 7428 else if (ch == '\r') { 7429 *p++ = '\\'; 7430 *p++ = 'r'; 7431 } 7432 7433 /* Map non-printable US ASCII to '\xhh' */ 7434 else if (ch < ' ' || ch >= 0x7F) { 7435 *p++ = '\\'; 7436 *p++ = 'x'; 7437 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7438 *p++ = hexdigits[ch & 0x000F]; 7439 } 7440 7441 /* Copy everything else as-is */ 7442 else 7443 *p++ = (char) ch; 7444 } 7445 /* Add quote */ 7446 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7447 7448 *p = '\0'; 7449 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7450 return repr; 7451} 7452 7453PyDoc_STRVAR(rfind__doc__, 7454"S.rfind(sub [,start [,end]]) -> int\n\ 7455\n\ 7456Return the highest index in S where substring sub is found,\n\ 7457such that sub is contained within s[start:end]. Optional\n\ 7458arguments start and end are interpreted as in slice notation.\n\ 7459\n\ 7460Return -1 on failure."); 7461 7462static PyObject * 7463unicode_rfind(PyUnicodeObject *self, PyObject *args) 7464{ 7465 PyObject *substring; 7466 Py_ssize_t start = 0; 7467 Py_ssize_t end = PY_SSIZE_T_MAX; 7468 Py_ssize_t result; 7469 7470 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 7471 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7472 return NULL; 7473 substring = PyUnicode_FromObject(substring); 7474 if (!substring) 7475 return NULL; 7476 7477 result = stringlib_rfind_slice( 7478 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7479 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7480 start, end 7481 ); 7482 7483 Py_DECREF(substring); 7484 7485 return PyInt_FromSsize_t(result); 7486} 7487 7488PyDoc_STRVAR(rindex__doc__, 7489"S.rindex(sub [,start [,end]]) -> int\n\ 7490\n\ 7491Like S.rfind() but raise ValueError when the substring is not found."); 7492 7493static PyObject * 7494unicode_rindex(PyUnicodeObject *self, PyObject *args) 7495{ 7496 PyObject *substring; 7497 Py_ssize_t start = 0; 7498 Py_ssize_t end = PY_SSIZE_T_MAX; 7499 Py_ssize_t result; 7500 7501 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 7502 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7503 return NULL; 7504 substring = PyUnicode_FromObject(substring); 7505 if (!substring) 7506 return NULL; 7507 7508 result = stringlib_rfind_slice( 7509 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7510 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7511 start, end 7512 ); 7513 7514 Py_DECREF(substring); 7515 7516 if (result < 0) { 7517 PyErr_SetString(PyExc_ValueError, "substring not found"); 7518 return NULL; 7519 } 7520 return PyInt_FromSsize_t(result); 7521} 7522 7523PyDoc_STRVAR(rjust__doc__, 7524"S.rjust(width[, fillchar]) -> unicode\n\ 7525\n\ 7526Return S right justified in a Unicode string of length width. Padding is\n\ 7527done using the specified fill character (default is a space)."); 7528 7529static PyObject * 7530unicode_rjust(PyUnicodeObject *self, PyObject *args) 7531{ 7532 Py_ssize_t width; 7533 Py_UNICODE fillchar = ' '; 7534 7535 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7536 return NULL; 7537 7538 if (self->length >= width && PyUnicode_CheckExact(self)) { 7539 Py_INCREF(self); 7540 return (PyObject*) self; 7541 } 7542 7543 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7544} 7545 7546PyObject *PyUnicode_Split(PyObject *s, 7547 PyObject *sep, 7548 Py_ssize_t maxsplit) 7549{ 7550 PyObject *result; 7551 7552 s = PyUnicode_FromObject(s); 7553 if (s == NULL) 7554 return NULL; 7555 if (sep != NULL) { 7556 sep = PyUnicode_FromObject(sep); 7557 if (sep == NULL) { 7558 Py_DECREF(s); 7559 return NULL; 7560 } 7561 } 7562 7563 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7564 7565 Py_DECREF(s); 7566 Py_XDECREF(sep); 7567 return result; 7568} 7569 7570PyDoc_STRVAR(split__doc__, 7571"S.split([sep [,maxsplit]]) -> list of strings\n\ 7572\n\ 7573Return a list of the words in S, using sep as the\n\ 7574delimiter string. If maxsplit is given, at most maxsplit\n\ 7575splits are done. If sep is not specified or is None,\n\ 7576any whitespace string is a separator."); 7577 7578static PyObject* 7579unicode_split(PyUnicodeObject *self, PyObject *args) 7580{ 7581 PyObject *substring = Py_None; 7582 Py_ssize_t maxcount = -1; 7583 7584 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7585 return NULL; 7586 7587 if (substring == Py_None) 7588 return split(self, NULL, maxcount); 7589 else if (PyUnicode_Check(substring)) 7590 return split(self, (PyUnicodeObject *)substring, maxcount); 7591 else 7592 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7593} 7594 7595PyObject * 7596PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7597{ 7598 PyObject* str_obj; 7599 PyObject* sep_obj; 7600 PyObject* out; 7601 7602 str_obj = PyUnicode_FromObject(str_in); 7603 if (!str_obj) 7604 return NULL; 7605 sep_obj = PyUnicode_FromObject(sep_in); 7606 if (!sep_obj) { 7607 Py_DECREF(str_obj); 7608 return NULL; 7609 } 7610 7611 out = stringlib_partition( 7612 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7613 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7614 ); 7615 7616 Py_DECREF(sep_obj); 7617 Py_DECREF(str_obj); 7618 7619 return out; 7620} 7621 7622 7623PyObject * 7624PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7625{ 7626 PyObject* str_obj; 7627 PyObject* sep_obj; 7628 PyObject* out; 7629 7630 str_obj = PyUnicode_FromObject(str_in); 7631 if (!str_obj) 7632 return NULL; 7633 sep_obj = PyUnicode_FromObject(sep_in); 7634 if (!sep_obj) { 7635 Py_DECREF(str_obj); 7636 return NULL; 7637 } 7638 7639 out = stringlib_rpartition( 7640 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7641 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7642 ); 7643 7644 Py_DECREF(sep_obj); 7645 Py_DECREF(str_obj); 7646 7647 return out; 7648} 7649 7650PyDoc_STRVAR(partition__doc__, 7651"S.partition(sep) -> (head, sep, tail)\n\ 7652\n\ 7653Searches for the separator sep in S, and returns the part before it,\n\ 7654the separator itself, and the part after it. If the separator is not\n\ 7655found, returns S and two empty strings."); 7656 7657static PyObject* 7658unicode_partition(PyUnicodeObject *self, PyObject *separator) 7659{ 7660 return PyUnicode_Partition((PyObject *)self, separator); 7661} 7662 7663PyDoc_STRVAR(rpartition__doc__, 7664"S.rpartition(sep) -> (tail, sep, head)\n\ 7665\n\ 7666Searches for the separator sep in S, starting at the end of S, and returns\n\ 7667the part before it, the separator itself, and the part after it. If the\n\ 7668separator is not found, returns two empty strings and S."); 7669 7670static PyObject* 7671unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7672{ 7673 return PyUnicode_RPartition((PyObject *)self, separator); 7674} 7675 7676PyObject *PyUnicode_RSplit(PyObject *s, 7677 PyObject *sep, 7678 Py_ssize_t maxsplit) 7679{ 7680 PyObject *result; 7681 7682 s = PyUnicode_FromObject(s); 7683 if (s == NULL) 7684 return NULL; 7685 if (sep != NULL) { 7686 sep = PyUnicode_FromObject(sep); 7687 if (sep == NULL) { 7688 Py_DECREF(s); 7689 return NULL; 7690 } 7691 } 7692 7693 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7694 7695 Py_DECREF(s); 7696 Py_XDECREF(sep); 7697 return result; 7698} 7699 7700PyDoc_STRVAR(rsplit__doc__, 7701"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 7702\n\ 7703Return a list of the words in S, using sep as the\n\ 7704delimiter string, starting at the end of the string and\n\ 7705working to the front. If maxsplit is given, at most maxsplit\n\ 7706splits are done. If sep is not specified, any whitespace string\n\ 7707is a separator."); 7708 7709static PyObject* 7710unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7711{ 7712 PyObject *substring = Py_None; 7713 Py_ssize_t maxcount = -1; 7714 7715 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7716 return NULL; 7717 7718 if (substring == Py_None) 7719 return rsplit(self, NULL, maxcount); 7720 else if (PyUnicode_Check(substring)) 7721 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7722 else 7723 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7724} 7725 7726PyDoc_STRVAR(splitlines__doc__, 7727"S.splitlines([keepends]]) -> list of strings\n\ 7728\n\ 7729Return a list of the lines in S, breaking at line boundaries.\n\ 7730Line breaks are not included in the resulting list unless keepends\n\ 7731is given and true."); 7732 7733static PyObject* 7734unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7735{ 7736 int keepends = 0; 7737 7738 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7739 return NULL; 7740 7741 return PyUnicode_Splitlines((PyObject *)self, keepends); 7742} 7743 7744static 7745PyObject *unicode_str(PyObject *self) 7746{ 7747 if (PyUnicode_CheckExact(self)) { 7748 Py_INCREF(self); 7749 return self; 7750 } else 7751 /* Subtype -- return genuine unicode string with the same value. */ 7752 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 7753 PyUnicode_GET_SIZE(self)); 7754} 7755 7756PyDoc_STRVAR(swapcase__doc__, 7757"S.swapcase() -> unicode\n\ 7758\n\ 7759Return a copy of S with uppercase characters converted to lowercase\n\ 7760and vice versa."); 7761 7762static PyObject* 7763unicode_swapcase(PyUnicodeObject *self) 7764{ 7765 return fixup(self, fixswapcase); 7766} 7767 7768PyDoc_STRVAR(translate__doc__, 7769"S.translate(table) -> unicode\n\ 7770\n\ 7771Return a copy of the string S, where all characters have been mapped\n\ 7772through the given translation table, which must be a mapping of\n\ 7773Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 7774Unmapped characters are left untouched. Characters mapped to None\n\ 7775are deleted."); 7776 7777static PyObject* 7778unicode_translate(PyUnicodeObject *self, PyObject *table) 7779{ 7780 return PyUnicode_TranslateCharmap(self->str, 7781 self->length, 7782 table, 7783 "ignore"); 7784} 7785 7786PyDoc_STRVAR(upper__doc__, 7787"S.upper() -> unicode\n\ 7788\n\ 7789Return a copy of S converted to uppercase."); 7790 7791static PyObject* 7792unicode_upper(PyUnicodeObject *self) 7793{ 7794 return fixup(self, fixupper); 7795} 7796 7797PyDoc_STRVAR(zfill__doc__, 7798"S.zfill(width) -> unicode\n\ 7799\n\ 7800Pad a numeric string x with zeros on the left, to fill a field\n\ 7801of the specified width. The string x is never truncated."); 7802 7803static PyObject * 7804unicode_zfill(PyUnicodeObject *self, PyObject *args) 7805{ 7806 Py_ssize_t fill; 7807 PyUnicodeObject *u; 7808 7809 Py_ssize_t width; 7810 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 7811 return NULL; 7812 7813 if (self->length >= width) { 7814 if (PyUnicode_CheckExact(self)) { 7815 Py_INCREF(self); 7816 return (PyObject*) self; 7817 } 7818 else 7819 return PyUnicode_FromUnicode( 7820 PyUnicode_AS_UNICODE(self), 7821 PyUnicode_GET_SIZE(self) 7822 ); 7823 } 7824 7825 fill = width - self->length; 7826 7827 u = pad(self, fill, 0, '0'); 7828 7829 if (u == NULL) 7830 return NULL; 7831 7832 if (u->str[fill] == '+' || u->str[fill] == '-') { 7833 /* move sign to beginning of string */ 7834 u->str[0] = u->str[fill]; 7835 u->str[fill] = '0'; 7836 } 7837 7838 return (PyObject*) u; 7839} 7840 7841#if 0 7842static PyObject* 7843unicode_freelistsize(PyUnicodeObject *self) 7844{ 7845 return PyInt_FromLong(unicode_freelist_size); 7846} 7847#endif 7848 7849PyDoc_STRVAR(startswith__doc__, 7850"S.startswith(prefix[, start[, end]]) -> bool\n\ 7851\n\ 7852Return True if S starts with the specified prefix, False otherwise.\n\ 7853With optional start, test S beginning at that position.\n\ 7854With optional end, stop comparing S at that position.\n\ 7855prefix can also be a tuple of strings to try."); 7856 7857static PyObject * 7858unicode_startswith(PyUnicodeObject *self, 7859 PyObject *args) 7860{ 7861 PyObject *subobj; 7862 PyUnicodeObject *substring; 7863 Py_ssize_t start = 0; 7864 Py_ssize_t end = PY_SSIZE_T_MAX; 7865 int result; 7866 7867 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 7868 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7869 return NULL; 7870 if (PyTuple_Check(subobj)) { 7871 Py_ssize_t i; 7872 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7873 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7874 PyTuple_GET_ITEM(subobj, i)); 7875 if (substring == NULL) 7876 return NULL; 7877 result = tailmatch(self, substring, start, end, -1); 7878 Py_DECREF(substring); 7879 if (result) { 7880 Py_RETURN_TRUE; 7881 } 7882 } 7883 /* nothing matched */ 7884 Py_RETURN_FALSE; 7885 } 7886 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7887 if (substring == NULL) 7888 return NULL; 7889 result = tailmatch(self, substring, start, end, -1); 7890 Py_DECREF(substring); 7891 return PyBool_FromLong(result); 7892} 7893 7894 7895PyDoc_STRVAR(endswith__doc__, 7896"S.endswith(suffix[, start[, end]]) -> bool\n\ 7897\n\ 7898Return True if S ends with the specified suffix, False otherwise.\n\ 7899With optional start, test S beginning at that position.\n\ 7900With optional end, stop comparing S at that position.\n\ 7901suffix can also be a tuple of strings to try."); 7902 7903static PyObject * 7904unicode_endswith(PyUnicodeObject *self, 7905 PyObject *args) 7906{ 7907 PyObject *subobj; 7908 PyUnicodeObject *substring; 7909 Py_ssize_t start = 0; 7910 Py_ssize_t end = PY_SSIZE_T_MAX; 7911 int result; 7912 7913 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 7914 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7915 return NULL; 7916 if (PyTuple_Check(subobj)) { 7917 Py_ssize_t i; 7918 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7919 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7920 PyTuple_GET_ITEM(subobj, i)); 7921 if (substring == NULL) 7922 return NULL; 7923 result = tailmatch(self, substring, start, end, +1); 7924 Py_DECREF(substring); 7925 if (result) { 7926 Py_RETURN_TRUE; 7927 } 7928 } 7929 Py_RETURN_FALSE; 7930 } 7931 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7932 if (substring == NULL) 7933 return NULL; 7934 7935 result = tailmatch(self, substring, start, end, +1); 7936 Py_DECREF(substring); 7937 return PyBool_FromLong(result); 7938} 7939 7940#include "stringlib/string_format.h" 7941 7942PyDoc_STRVAR(format__doc__, 7943"S.format(*args, **kwargs) -> unicode\n\ 7944\n\ 7945"); 7946 7947PyDoc_STRVAR(p_format__doc__, 7948"S.__format__(format_spec) -> unicode\n\ 7949\n\ 7950"); 7951 7952static PyObject * 7953unicode_getnewargs(PyUnicodeObject *v) 7954{ 7955 return Py_BuildValue("(u#)", v->str, v->length); 7956} 7957 7958 7959static PyMethodDef unicode_methods[] = { 7960 7961 /* Order is according to common usage: often used methods should 7962 appear first, since lookup is done sequentially. */ 7963 7964 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 7965 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7966 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 7967 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 7968 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 7969 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 7970 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 7971 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 7972 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 7973 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 7974 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 7975 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 7976 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 7977 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 7978 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7979 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 7980 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 7981 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 7982 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 7983 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 7984 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 7985 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 7986 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 7987 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 7988 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 7989 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 7990 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 7991 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 7992 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 7993 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 7994 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 7995 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 7996 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 7997 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 7998 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 7999 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8000 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8001 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8002 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8003 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8004 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__}, 8005 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8006 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8007#if 0 8008 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8009#endif 8010 8011#if 0 8012 /* This one is just used for debugging the implementation. */ 8013 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8014#endif 8015 8016 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8017 {NULL, NULL} 8018}; 8019 8020static PyObject * 8021unicode_mod(PyObject *v, PyObject *w) 8022{ 8023 if (!PyUnicode_Check(v)) { 8024 Py_INCREF(Py_NotImplemented); 8025 return Py_NotImplemented; 8026 } 8027 return PyUnicode_Format(v, w); 8028} 8029 8030static PyNumberMethods unicode_as_number = { 8031 0, /*nb_add*/ 8032 0, /*nb_subtract*/ 8033 0, /*nb_multiply*/ 8034 unicode_mod, /*nb_remainder*/ 8035}; 8036 8037static PySequenceMethods unicode_as_sequence = { 8038 (lenfunc) unicode_length, /* sq_length */ 8039 PyUnicode_Concat, /* sq_concat */ 8040 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8041 (ssizeargfunc) unicode_getitem, /* sq_item */ 8042 0, /* sq_slice */ 8043 0, /* sq_ass_item */ 8044 0, /* sq_ass_slice */ 8045 PyUnicode_Contains, /* sq_contains */ 8046}; 8047 8048static PyObject* 8049unicode_subscript(PyUnicodeObject* self, PyObject* item) 8050{ 8051 if (PyIndex_Check(item)) { 8052 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8053 if (i == -1 && PyErr_Occurred()) 8054 return NULL; 8055 if (i < 0) 8056 i += PyUnicode_GET_SIZE(self); 8057 return unicode_getitem(self, i); 8058 } else if (PySlice_Check(item)) { 8059 Py_ssize_t start, stop, step, slicelength, cur, i; 8060 Py_UNICODE* source_buf; 8061 Py_UNICODE* result_buf; 8062 PyObject* result; 8063 8064 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8065 &start, &stop, &step, &slicelength) < 0) { 8066 return NULL; 8067 } 8068 8069 if (slicelength <= 0) { 8070 return PyUnicode_FromUnicode(NULL, 0); 8071 } else if (start == 0 && step == 1 && slicelength == self->length && 8072 PyUnicode_CheckExact(self)) { 8073 Py_INCREF(self); 8074 return (PyObject *)self; 8075 } else if (step == 1) { 8076 return PyUnicode_FromUnicode(self->str + start, slicelength); 8077 } else { 8078 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8079 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* 8080 sizeof(Py_UNICODE)); 8081 8082 if (result_buf == NULL) 8083 return PyErr_NoMemory(); 8084 8085 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8086 result_buf[i] = source_buf[cur]; 8087 } 8088 8089 result = PyUnicode_FromUnicode(result_buf, slicelength); 8090 PyMem_FREE(result_buf); 8091 return result; 8092 } 8093 } else { 8094 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8095 return NULL; 8096 } 8097} 8098 8099static PyMappingMethods unicode_as_mapping = { 8100 (lenfunc)unicode_length, /* mp_length */ 8101 (binaryfunc)unicode_subscript, /* mp_subscript */ 8102 (objobjargproc)0, /* mp_ass_subscript */ 8103}; 8104 8105 8106static int 8107unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags) 8108{ 8109 8110 if (flags & PyBUF_CHARACTER) { 8111 PyErr_SetString(PyExc_SystemError, "can't use str as char buffer"); 8112 return -1; 8113 } 8114 return PyBuffer_FillInfo(view, (void *)self->str, 8115 PyUnicode_GET_DATA_SIZE(self), 1, flags); 8116} 8117 8118 8119/* Helpers for PyUnicode_Format() */ 8120 8121static PyObject * 8122getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8123{ 8124 Py_ssize_t argidx = *p_argidx; 8125 if (argidx < arglen) { 8126 (*p_argidx)++; 8127 if (arglen < 0) 8128 return args; 8129 else 8130 return PyTuple_GetItem(args, argidx); 8131 } 8132 PyErr_SetString(PyExc_TypeError, 8133 "not enough arguments for format string"); 8134 return NULL; 8135} 8136 8137#define F_LJUST (1<<0) 8138#define F_SIGN (1<<1) 8139#define F_BLANK (1<<2) 8140#define F_ALT (1<<3) 8141#define F_ZERO (1<<4) 8142 8143static Py_ssize_t 8144strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8145{ 8146 register Py_ssize_t i; 8147 Py_ssize_t len = strlen(charbuffer); 8148 for (i = len - 1; i >= 0; i--) 8149 buffer[i] = (Py_UNICODE) charbuffer[i]; 8150 8151 return len; 8152} 8153 8154static int 8155doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8156{ 8157 Py_ssize_t result; 8158 8159 PyOS_ascii_formatd((char *)buffer, len, format, x); 8160 result = strtounicode(buffer, (char *)buffer); 8161 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8162} 8163 8164static int 8165longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8166{ 8167 Py_ssize_t result; 8168 8169 PyOS_snprintf((char *)buffer, len, format, x); 8170 result = strtounicode(buffer, (char *)buffer); 8171 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8172} 8173 8174/* XXX To save some code duplication, formatfloat/long/int could have been 8175 shared with stringobject.c, converting from 8-bit to Unicode after the 8176 formatting is done. */ 8177 8178static int 8179formatfloat(Py_UNICODE *buf, 8180 size_t buflen, 8181 int flags, 8182 int prec, 8183 int type, 8184 PyObject *v) 8185{ 8186 /* fmt = '%#.' + `prec` + `type` 8187 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8188 char fmt[20]; 8189 double x; 8190 8191 x = PyFloat_AsDouble(v); 8192 if (x == -1.0 && PyErr_Occurred()) 8193 return -1; 8194 if (prec < 0) 8195 prec = 6; 8196 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8197 type = 'g'; 8198 /* Worst case length calc to ensure no buffer overrun: 8199 8200 'g' formats: 8201 fmt = %#.<prec>g 8202 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8203 for any double rep.) 8204 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8205 8206 'f' formats: 8207 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8208 len = 1 + 50 + 1 + prec = 52 + prec 8209 8210 If prec=0 the effective precision is 1 (the leading digit is 8211 always given), therefore increase the length by one. 8212 8213 */ 8214 if (((type == 'g' || type == 'G') && 8215 buflen <= (size_t)10 + (size_t)prec) || 8216 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8217 PyErr_SetString(PyExc_OverflowError, 8218 "formatted float is too long (precision too large?)"); 8219 return -1; 8220 } 8221 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8222 (flags&F_ALT) ? "#" : "", 8223 prec, type); 8224 return doubletounicode(buf, buflen, fmt, x); 8225} 8226 8227static PyObject* 8228formatlong(PyObject *val, int flags, int prec, int type) 8229{ 8230 char *buf; 8231 int len; 8232 PyObject *str; /* temporary string object. */ 8233 PyObject *result; 8234 8235 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 8236 if (!str) 8237 return NULL; 8238 result = PyUnicode_FromStringAndSize(buf, len); 8239 Py_DECREF(str); 8240 return result; 8241} 8242 8243static int 8244formatint(Py_UNICODE *buf, 8245 size_t buflen, 8246 int flags, 8247 int prec, 8248 int type, 8249 PyObject *v) 8250{ 8251 /* fmt = '%#.' + `prec` + 'l' + `type` 8252 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8253 * + 1 + 1 8254 * = 24 8255 */ 8256 char fmt[64]; /* plenty big enough! */ 8257 char *sign; 8258 long x; 8259 8260 x = PyInt_AsLong(v); 8261 if (x == -1 && PyErr_Occurred()) 8262 return -1; 8263 if (x < 0 && type == 'u') { 8264 type = 'd'; 8265 } 8266 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8267 sign = "-"; 8268 else 8269 sign = ""; 8270 if (prec < 0) 8271 prec = 1; 8272 8273 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8274 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8275 */ 8276 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8277 PyErr_SetString(PyExc_OverflowError, 8278 "formatted integer is too long (precision too large?)"); 8279 return -1; 8280 } 8281 8282 if ((flags & F_ALT) && 8283 (type == 'x' || type == 'X' || type == 'o')) { 8284 /* When converting under %#o, %#x or %#X, there are a number 8285 * of issues that cause pain: 8286 * - for %#o, we want a different base marker than C 8287 * - when 0 is being converted, the C standard leaves off 8288 * the '0x' or '0X', which is inconsistent with other 8289 * %#x/%#X conversions and inconsistent with Python's 8290 * hex() function 8291 * - there are platforms that violate the standard and 8292 * convert 0 with the '0x' or '0X' 8293 * (Metrowerks, Compaq Tru64) 8294 * - there are platforms that give '0x' when converting 8295 * under %#X, but convert 0 in accordance with the 8296 * standard (OS/2 EMX) 8297 * 8298 * We can achieve the desired consistency by inserting our 8299 * own '0x' or '0X' prefix, and substituting %x/%X in place 8300 * of %#x/%#X. 8301 * 8302 * Note that this is the same approach as used in 8303 * formatint() in stringobject.c 8304 */ 8305 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8306 sign, type, prec, type); 8307 } 8308 else { 8309 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8310 sign, (flags&F_ALT) ? "#" : "", 8311 prec, type); 8312 } 8313 if (sign[0]) 8314 return longtounicode(buf, buflen, fmt, -x); 8315 else 8316 return longtounicode(buf, buflen, fmt, x); 8317} 8318 8319static int 8320formatchar(Py_UNICODE *buf, 8321 size_t buflen, 8322 PyObject *v) 8323{ 8324 /* presume that the buffer is at least 2 characters long */ 8325 if (PyUnicode_Check(v)) { 8326 if (PyUnicode_GET_SIZE(v) != 1) 8327 goto onError; 8328 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8329 } 8330 8331 else if (PyString_Check(v)) { 8332 if (PyString_GET_SIZE(v) != 1) 8333 goto onError; 8334 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 8335 } 8336 8337 else { 8338 /* Integer input truncated to a character */ 8339 long x; 8340 x = PyInt_AsLong(v); 8341 if (x == -1 && PyErr_Occurred()) 8342 goto onError; 8343#ifdef Py_UNICODE_WIDE 8344 if (x < 0 || x > 0x10ffff) { 8345 PyErr_SetString(PyExc_OverflowError, 8346 "%c arg not in range(0x110000) " 8347 "(wide Python build)"); 8348 return -1; 8349 } 8350#else 8351 if (x < 0 || x > 0xffff) { 8352 PyErr_SetString(PyExc_OverflowError, 8353 "%c arg not in range(0x10000) " 8354 "(narrow Python build)"); 8355 return -1; 8356 } 8357#endif 8358 buf[0] = (Py_UNICODE) x; 8359 } 8360 buf[1] = '\0'; 8361 return 1; 8362 8363 onError: 8364 PyErr_SetString(PyExc_TypeError, 8365 "%c requires int or char"); 8366 return -1; 8367} 8368 8369/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8370 8371 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8372 chars are formatted. XXX This is a magic number. Each formatting 8373 routine does bounds checking to ensure no overflow, but a better 8374 solution may be to malloc a buffer of appropriate size for each 8375 format. For now, the current solution is sufficient. 8376*/ 8377#define FORMATBUFLEN (size_t)120 8378 8379PyObject *PyUnicode_Format(PyObject *format, 8380 PyObject *args) 8381{ 8382 Py_UNICODE *fmt, *res; 8383 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8384 int args_owned = 0; 8385 PyUnicodeObject *result = NULL; 8386 PyObject *dict = NULL; 8387 PyObject *uformat; 8388 8389 if (format == NULL || args == NULL) { 8390 PyErr_BadInternalCall(); 8391 return NULL; 8392 } 8393 uformat = PyUnicode_FromObject(format); 8394 if (uformat == NULL) 8395 return NULL; 8396 fmt = PyUnicode_AS_UNICODE(uformat); 8397 fmtcnt = PyUnicode_GET_SIZE(uformat); 8398 8399 reslen = rescnt = fmtcnt + 100; 8400 result = _PyUnicode_New(reslen); 8401 if (result == NULL) 8402 goto onError; 8403 res = PyUnicode_AS_UNICODE(result); 8404 8405 if (PyTuple_Check(args)) { 8406 arglen = PyTuple_Size(args); 8407 argidx = 0; 8408 } 8409 else { 8410 arglen = -1; 8411 argidx = -2; 8412 } 8413 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) && 8414 !PyObject_TypeCheck(args, &PyBaseString_Type)) 8415 dict = args; 8416 8417 while (--fmtcnt >= 0) { 8418 if (*fmt != '%') { 8419 if (--rescnt < 0) { 8420 rescnt = fmtcnt + 100; 8421 reslen += rescnt; 8422 if (_PyUnicode_Resize(&result, reslen) < 0) 8423 goto onError; 8424 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8425 --rescnt; 8426 } 8427 *res++ = *fmt++; 8428 } 8429 else { 8430 /* Got a format specifier */ 8431 int flags = 0; 8432 Py_ssize_t width = -1; 8433 int prec = -1; 8434 Py_UNICODE c = '\0'; 8435 Py_UNICODE fill; 8436 PyObject *v = NULL; 8437 PyObject *temp = NULL; 8438 Py_UNICODE *pbuf; 8439 Py_UNICODE sign; 8440 Py_ssize_t len; 8441 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8442 8443 fmt++; 8444 if (*fmt == '(') { 8445 Py_UNICODE *keystart; 8446 Py_ssize_t keylen; 8447 PyObject *key; 8448 int pcount = 1; 8449 8450 if (dict == NULL) { 8451 PyErr_SetString(PyExc_TypeError, 8452 "format requires a mapping"); 8453 goto onError; 8454 } 8455 ++fmt; 8456 --fmtcnt; 8457 keystart = fmt; 8458 /* Skip over balanced parentheses */ 8459 while (pcount > 0 && --fmtcnt >= 0) { 8460 if (*fmt == ')') 8461 --pcount; 8462 else if (*fmt == '(') 8463 ++pcount; 8464 fmt++; 8465 } 8466 keylen = fmt - keystart - 1; 8467 if (fmtcnt < 0 || pcount > 0) { 8468 PyErr_SetString(PyExc_ValueError, 8469 "incomplete format key"); 8470 goto onError; 8471 } 8472#if 0 8473 /* keys are converted to strings using UTF-8 and 8474 then looked up since Python uses strings to hold 8475 variables names etc. in its namespaces and we 8476 wouldn't want to break common idioms. */ 8477 key = PyUnicode_EncodeUTF8(keystart, 8478 keylen, 8479 NULL); 8480#else 8481 key = PyUnicode_FromUnicode(keystart, keylen); 8482#endif 8483 if (key == NULL) 8484 goto onError; 8485 if (args_owned) { 8486 Py_DECREF(args); 8487 args_owned = 0; 8488 } 8489 args = PyObject_GetItem(dict, key); 8490 Py_DECREF(key); 8491 if (args == NULL) { 8492 goto onError; 8493 } 8494 args_owned = 1; 8495 arglen = -1; 8496 argidx = -2; 8497 } 8498 while (--fmtcnt >= 0) { 8499 switch (c = *fmt++) { 8500 case '-': flags |= F_LJUST; continue; 8501 case '+': flags |= F_SIGN; continue; 8502 case ' ': flags |= F_BLANK; continue; 8503 case '#': flags |= F_ALT; continue; 8504 case '0': flags |= F_ZERO; continue; 8505 } 8506 break; 8507 } 8508 if (c == '*') { 8509 v = getnextarg(args, arglen, &argidx); 8510 if (v == NULL) 8511 goto onError; 8512 if (!PyInt_Check(v)) { 8513 PyErr_SetString(PyExc_TypeError, 8514 "* wants int"); 8515 goto onError; 8516 } 8517 width = PyInt_AsLong(v); 8518 if (width == -1 && PyErr_Occurred()) 8519 goto onError; 8520 if (width < 0) { 8521 flags |= F_LJUST; 8522 width = -width; 8523 } 8524 if (--fmtcnt >= 0) 8525 c = *fmt++; 8526 } 8527 else if (c >= '0' && c <= '9') { 8528 width = c - '0'; 8529 while (--fmtcnt >= 0) { 8530 c = *fmt++; 8531 if (c < '0' || c > '9') 8532 break; 8533 if ((width*10) / 10 != width) { 8534 PyErr_SetString(PyExc_ValueError, 8535 "width too big"); 8536 goto onError; 8537 } 8538 width = width*10 + (c - '0'); 8539 } 8540 } 8541 if (c == '.') { 8542 prec = 0; 8543 if (--fmtcnt >= 0) 8544 c = *fmt++; 8545 if (c == '*') { 8546 v = getnextarg(args, arglen, &argidx); 8547 if (v == NULL) 8548 goto onError; 8549 if (!PyInt_Check(v)) { 8550 PyErr_SetString(PyExc_TypeError, 8551 "* wants int"); 8552 goto onError; 8553 } 8554 prec = PyInt_AsLong(v); 8555 if (prec == -1 && PyErr_Occurred()) 8556 goto onError; 8557 if (prec < 0) 8558 prec = 0; 8559 if (--fmtcnt >= 0) 8560 c = *fmt++; 8561 } 8562 else if (c >= '0' && c <= '9') { 8563 prec = c - '0'; 8564 while (--fmtcnt >= 0) { 8565 c = Py_CHARMASK(*fmt++); 8566 if (c < '0' || c > '9') 8567 break; 8568 if ((prec*10) / 10 != prec) { 8569 PyErr_SetString(PyExc_ValueError, 8570 "prec too big"); 8571 goto onError; 8572 } 8573 prec = prec*10 + (c - '0'); 8574 } 8575 } 8576 } /* prec */ 8577 if (fmtcnt >= 0) { 8578 if (c == 'h' || c == 'l' || c == 'L') { 8579 if (--fmtcnt >= 0) 8580 c = *fmt++; 8581 } 8582 } 8583 if (fmtcnt < 0) { 8584 PyErr_SetString(PyExc_ValueError, 8585 "incomplete format"); 8586 goto onError; 8587 } 8588 if (c != '%') { 8589 v = getnextarg(args, arglen, &argidx); 8590 if (v == NULL) 8591 goto onError; 8592 } 8593 sign = 0; 8594 fill = ' '; 8595 switch (c) { 8596 8597 case '%': 8598 pbuf = formatbuf; 8599 /* presume that buffer length is at least 1 */ 8600 pbuf[0] = '%'; 8601 len = 1; 8602 break; 8603 8604 case 's': 8605 case 'r': 8606 if (PyUnicode_Check(v) && c == 's') { 8607 temp = v; 8608 Py_INCREF(temp); 8609 } 8610 else { 8611 PyObject *unicode; 8612 if (c == 's') 8613 temp = PyObject_Unicode(v); 8614 else 8615 temp = PyObject_Repr(v); 8616 if (temp == NULL) 8617 goto onError; 8618 if (PyUnicode_Check(temp)) 8619 /* nothing to do */; 8620 else if (PyString_Check(temp)) { 8621 /* convert to string to Unicode */ 8622 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 8623 PyString_GET_SIZE(temp), 8624 NULL, 8625 "strict"); 8626 Py_DECREF(temp); 8627 temp = unicode; 8628 if (temp == NULL) 8629 goto onError; 8630 } 8631 else { 8632 Py_DECREF(temp); 8633 PyErr_SetString(PyExc_TypeError, 8634 "%s argument has non-string str()"); 8635 goto onError; 8636 } 8637 } 8638 pbuf = PyUnicode_AS_UNICODE(temp); 8639 len = PyUnicode_GET_SIZE(temp); 8640 if (prec >= 0 && len > prec) 8641 len = prec; 8642 break; 8643 8644 case 'i': 8645 case 'd': 8646 case 'u': 8647 case 'o': 8648 case 'x': 8649 case 'X': 8650 if (c == 'i') 8651 c = 'd'; 8652 if (PyLong_Check(v)) { 8653 temp = formatlong(v, flags, prec, c); 8654 if (!temp) 8655 goto onError; 8656 pbuf = PyUnicode_AS_UNICODE(temp); 8657 len = PyUnicode_GET_SIZE(temp); 8658 sign = 1; 8659 } 8660 else { 8661 pbuf = formatbuf; 8662 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8663 flags, prec, c, v); 8664 if (len < 0) 8665 goto onError; 8666 sign = 1; 8667 } 8668 if (flags & F_ZERO) 8669 fill = '0'; 8670 break; 8671 8672 case 'e': 8673 case 'E': 8674 case 'f': 8675 case 'F': 8676 case 'g': 8677 case 'G': 8678 if (c == 'F') 8679 c = 'f'; 8680 pbuf = formatbuf; 8681 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8682 flags, prec, c, v); 8683 if (len < 0) 8684 goto onError; 8685 sign = 1; 8686 if (flags & F_ZERO) 8687 fill = '0'; 8688 break; 8689 8690 case 'c': 8691 pbuf = formatbuf; 8692 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 8693 if (len < 0) 8694 goto onError; 8695 break; 8696 8697 default: 8698 PyErr_Format(PyExc_ValueError, 8699 "unsupported format character '%c' (0x%x) " 8700 "at index %zd", 8701 (31<=c && c<=126) ? (char)c : '?', 8702 (int)c, 8703 (Py_ssize_t)(fmt - 1 - 8704 PyUnicode_AS_UNICODE(uformat))); 8705 goto onError; 8706 } 8707 if (sign) { 8708 if (*pbuf == '-' || *pbuf == '+') { 8709 sign = *pbuf++; 8710 len--; 8711 } 8712 else if (flags & F_SIGN) 8713 sign = '+'; 8714 else if (flags & F_BLANK) 8715 sign = ' '; 8716 else 8717 sign = 0; 8718 } 8719 if (width < len) 8720 width = len; 8721 if (rescnt - (sign != 0) < width) { 8722 reslen -= rescnt; 8723 rescnt = width + fmtcnt + 100; 8724 reslen += rescnt; 8725 if (reslen < 0) { 8726 Py_XDECREF(temp); 8727 PyErr_NoMemory(); 8728 goto onError; 8729 } 8730 if (_PyUnicode_Resize(&result, reslen) < 0) { 8731 Py_XDECREF(temp); 8732 goto onError; 8733 } 8734 res = PyUnicode_AS_UNICODE(result) 8735 + reslen - rescnt; 8736 } 8737 if (sign) { 8738 if (fill != ' ') 8739 *res++ = sign; 8740 rescnt--; 8741 if (width > len) 8742 width--; 8743 } 8744 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8745 assert(pbuf[0] == '0'); 8746 assert(pbuf[1] == c); 8747 if (fill != ' ') { 8748 *res++ = *pbuf++; 8749 *res++ = *pbuf++; 8750 } 8751 rescnt -= 2; 8752 width -= 2; 8753 if (width < 0) 8754 width = 0; 8755 len -= 2; 8756 } 8757 if (width > len && !(flags & F_LJUST)) { 8758 do { 8759 --rescnt; 8760 *res++ = fill; 8761 } while (--width > len); 8762 } 8763 if (fill == ' ') { 8764 if (sign) 8765 *res++ = sign; 8766 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8767 assert(pbuf[0] == '0'); 8768 assert(pbuf[1] == c); 8769 *res++ = *pbuf++; 8770 *res++ = *pbuf++; 8771 } 8772 } 8773 Py_UNICODE_COPY(res, pbuf, len); 8774 res += len; 8775 rescnt -= len; 8776 while (--width >= len) { 8777 --rescnt; 8778 *res++ = ' '; 8779 } 8780 if (dict && (argidx < arglen) && c != '%') { 8781 PyErr_SetString(PyExc_TypeError, 8782 "not all arguments converted during string formatting"); 8783 Py_XDECREF(temp); 8784 goto onError; 8785 } 8786 Py_XDECREF(temp); 8787 } /* '%' */ 8788 } /* until end */ 8789 if (argidx < arglen && !dict) { 8790 PyErr_SetString(PyExc_TypeError, 8791 "not all arguments converted during string formatting"); 8792 goto onError; 8793 } 8794 8795 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 8796 goto onError; 8797 if (args_owned) { 8798 Py_DECREF(args); 8799 } 8800 Py_DECREF(uformat); 8801 return (PyObject *)result; 8802 8803 onError: 8804 Py_XDECREF(result); 8805 Py_DECREF(uformat); 8806 if (args_owned) { 8807 Py_DECREF(args); 8808 } 8809 return NULL; 8810} 8811 8812static PyBufferProcs unicode_as_buffer = { 8813 (getbufferproc) unicode_buffer_getbuffer, 8814 NULL, 8815}; 8816 8817static PyObject * 8818unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 8819 8820static PyObject * 8821unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8822{ 8823 PyObject *x = NULL; 8824 static char *kwlist[] = {"object", "encoding", "errors", 0}; 8825 char *encoding = NULL; 8826 char *errors = NULL; 8827 8828 if (type != &PyUnicode_Type) 8829 return unicode_subtype_new(type, args, kwds); 8830 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 8831 kwlist, &x, &encoding, &errors)) 8832 return NULL; 8833 if (x == NULL) 8834 return (PyObject *)_PyUnicode_New(0); 8835 if (encoding == NULL && errors == NULL) 8836 return PyObject_Unicode(x); 8837 else 8838 return PyUnicode_FromEncodedObject(x, encoding, errors); 8839} 8840 8841static PyObject * 8842unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8843{ 8844 PyUnicodeObject *tmp, *pnew; 8845 Py_ssize_t n; 8846 8847 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 8848 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 8849 if (tmp == NULL) 8850 return NULL; 8851 assert(PyUnicode_Check(tmp)); 8852 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 8853 if (pnew == NULL) { 8854 Py_DECREF(tmp); 8855 return NULL; 8856 } 8857 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 8858 if (pnew->str == NULL) { 8859 _Py_ForgetReference((PyObject *)pnew); 8860 PyObject_Del(pnew); 8861 Py_DECREF(tmp); 8862 return PyErr_NoMemory(); 8863 } 8864 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 8865 pnew->length = n; 8866 pnew->hash = tmp->hash; 8867 Py_DECREF(tmp); 8868 return (PyObject *)pnew; 8869} 8870 8871PyDoc_STRVAR(unicode_doc, 8872"str(string [, encoding[, errors]]) -> object\n\ 8873\n\ 8874Create a new string object from the given encoded string.\n\ 8875encoding defaults to the current default string encoding.\n\ 8876errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 8877 8878static PyObject *unicode_iter(PyObject *seq); 8879 8880PyTypeObject PyUnicode_Type = { 8881 PyVarObject_HEAD_INIT(&PyType_Type, 0) 8882 "str", /* tp_name */ 8883 sizeof(PyUnicodeObject), /* tp_size */ 8884 0, /* tp_itemsize */ 8885 /* Slots */ 8886 (destructor)unicode_dealloc, /* tp_dealloc */ 8887 0, /* tp_print */ 8888 0, /* tp_getattr */ 8889 0, /* tp_setattr */ 8890 0, /* tp_compare */ 8891 unicode_repr, /* tp_repr */ 8892 &unicode_as_number, /* tp_as_number */ 8893 &unicode_as_sequence, /* tp_as_sequence */ 8894 &unicode_as_mapping, /* tp_as_mapping */ 8895 (hashfunc) unicode_hash, /* tp_hash*/ 8896 0, /* tp_call*/ 8897 (reprfunc) unicode_str, /* tp_str */ 8898 PyObject_GenericGetAttr, /* tp_getattro */ 8899 0, /* tp_setattro */ 8900 &unicode_as_buffer, /* tp_as_buffer */ 8901 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 8902 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 8903 unicode_doc, /* tp_doc */ 8904 0, /* tp_traverse */ 8905 0, /* tp_clear */ 8906 PyUnicode_RichCompare, /* tp_richcompare */ 8907 0, /* tp_weaklistoffset */ 8908 unicode_iter, /* tp_iter */ 8909 0, /* tp_iternext */ 8910 unicode_methods, /* tp_methods */ 8911 0, /* tp_members */ 8912 0, /* tp_getset */ 8913 &PyBaseString_Type, /* tp_base */ 8914 0, /* tp_dict */ 8915 0, /* tp_descr_get */ 8916 0, /* tp_descr_set */ 8917 0, /* tp_dictoffset */ 8918 0, /* tp_init */ 8919 0, /* tp_alloc */ 8920 unicode_new, /* tp_new */ 8921 PyObject_Del, /* tp_free */ 8922}; 8923 8924/* Initialize the Unicode implementation */ 8925 8926void _PyUnicode_Init(void) 8927{ 8928 int i; 8929 8930 /* XXX - move this array to unicodectype.c ? */ 8931 Py_UNICODE linebreak[] = { 8932 0x000A, /* LINE FEED */ 8933 0x000D, /* CARRIAGE RETURN */ 8934 0x001C, /* FILE SEPARATOR */ 8935 0x001D, /* GROUP SEPARATOR */ 8936 0x001E, /* RECORD SEPARATOR */ 8937 0x0085, /* NEXT LINE */ 8938 0x2028, /* LINE SEPARATOR */ 8939 0x2029, /* PARAGRAPH SEPARATOR */ 8940 }; 8941 8942 /* Init the implementation */ 8943 unicode_freelist = NULL; 8944 unicode_freelist_size = 0; 8945 unicode_empty = _PyUnicode_New(0); 8946 if (!unicode_empty) 8947 return; 8948 8949 for (i = 0; i < 256; i++) 8950 unicode_latin1[i] = NULL; 8951 if (PyType_Ready(&PyUnicode_Type) < 0) 8952 Py_FatalError("Can't initialize 'unicode'"); 8953 8954 /* initialize the linebreak bloom filter */ 8955 bloom_linebreak = make_bloom_mask( 8956 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 8957 ); 8958 8959 PyType_Ready(&EncodingMapType); 8960} 8961 8962/* Finalize the Unicode implementation */ 8963 8964void 8965_PyUnicode_Fini(void) 8966{ 8967 PyUnicodeObject *u; 8968 int i; 8969 8970 Py_XDECREF(unicode_empty); 8971 unicode_empty = NULL; 8972 8973 for (i = 0; i < 256; i++) { 8974 if (unicode_latin1[i]) { 8975 Py_DECREF(unicode_latin1[i]); 8976 unicode_latin1[i] = NULL; 8977 } 8978 } 8979 8980 for (u = unicode_freelist; u != NULL;) { 8981 PyUnicodeObject *v = u; 8982 u = *(PyUnicodeObject **)u; 8983 if (v->str) 8984 PyMem_DEL(v->str); 8985 Py_XDECREF(v->defenc); 8986 PyObject_Del(v); 8987 } 8988 unicode_freelist = NULL; 8989 unicode_freelist_size = 0; 8990} 8991 8992void 8993PyUnicode_InternInPlace(PyObject **p) 8994{ 8995 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 8996 PyObject *t; 8997 if (s == NULL || !PyUnicode_Check(s)) 8998 Py_FatalError( 8999 "PyUnicode_InternInPlace: unicode strings only please!"); 9000 /* If it's a subclass, we don't really know what putting 9001 it in the interned dict might do. */ 9002 if (!PyUnicode_CheckExact(s)) 9003 return; 9004 if (PyUnicode_CHECK_INTERNED(s)) 9005 return; 9006 if (interned == NULL) { 9007 interned = PyDict_New(); 9008 if (interned == NULL) { 9009 PyErr_Clear(); /* Don't leave an exception */ 9010 return; 9011 } 9012 } 9013 /* It might be that the GetItem call fails even 9014 though the key is present in the dictionary, 9015 namely when this happens during a stack overflow. */ 9016 Py_ALLOW_RECURSION 9017 t = PyDict_GetItem(interned, (PyObject *)s); 9018 Py_END_ALLOW_RECURSION 9019 9020 if (t) { 9021 Py_INCREF(t); 9022 Py_DECREF(*p); 9023 *p = t; 9024 return; 9025 } 9026 9027 PyThreadState_GET()->recursion_critical = 1; 9028 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9029 PyErr_Clear(); 9030 PyThreadState_GET()->recursion_critical = 0; 9031 return; 9032 } 9033 PyThreadState_GET()->recursion_critical = 0; 9034 /* The two references in interned are not counted by refcnt. 9035 The deallocator will take care of this */ 9036 Py_Refcnt(s) -= 2; 9037 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9038} 9039 9040void 9041PyUnicode_InternImmortal(PyObject **p) 9042{ 9043 PyUnicode_InternInPlace(p); 9044 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9045 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9046 Py_INCREF(*p); 9047 } 9048} 9049 9050PyObject * 9051PyUnicode_InternFromString(const char *cp) 9052{ 9053 PyObject *s = PyUnicode_FromString(cp); 9054 if (s == NULL) 9055 return NULL; 9056 PyUnicode_InternInPlace(&s); 9057 return s; 9058} 9059 9060void _Py_ReleaseInternedUnicodeStrings(void) 9061{ 9062 PyObject *keys; 9063 PyUnicodeObject *s; 9064 Py_ssize_t i, n; 9065 Py_ssize_t immortal_size = 0, mortal_size = 0; 9066 9067 if (interned == NULL || !PyDict_Check(interned)) 9068 return; 9069 keys = PyDict_Keys(interned); 9070 if (keys == NULL || !PyList_Check(keys)) { 9071 PyErr_Clear(); 9072 return; 9073 } 9074 9075 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9076 detector, interned unicode strings are not forcibly deallocated; 9077 rather, we give them their stolen references back, and then clear 9078 and DECREF the interned dict. */ 9079 9080 n = PyList_GET_SIZE(keys); 9081 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9082 n); 9083 for (i = 0; i < n; i++) { 9084 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9085 switch (s->state) { 9086 case SSTATE_NOT_INTERNED: 9087 /* XXX Shouldn't happen */ 9088 break; 9089 case SSTATE_INTERNED_IMMORTAL: 9090 Py_Refcnt(s) += 1; 9091 immortal_size += s->length; 9092 break; 9093 case SSTATE_INTERNED_MORTAL: 9094 Py_Refcnt(s) += 2; 9095 mortal_size += s->length; 9096 break; 9097 default: 9098 Py_FatalError("Inconsistent interned string state."); 9099 } 9100 s->state = SSTATE_NOT_INTERNED; 9101 } 9102 fprintf(stderr, "total size of all interned strings: " 9103 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9104 "mortal/immortal\n", mortal_size, immortal_size); 9105 Py_DECREF(keys); 9106 PyDict_Clear(interned); 9107 Py_DECREF(interned); 9108 interned = NULL; 9109} 9110 9111 9112/********************* Unicode Iterator **************************/ 9113 9114typedef struct { 9115 PyObject_HEAD 9116 Py_ssize_t it_index; 9117 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9118} unicodeiterobject; 9119 9120static void 9121unicodeiter_dealloc(unicodeiterobject *it) 9122{ 9123 _PyObject_GC_UNTRACK(it); 9124 Py_XDECREF(it->it_seq); 9125 PyObject_GC_Del(it); 9126} 9127 9128static int 9129unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9130{ 9131 Py_VISIT(it->it_seq); 9132 return 0; 9133} 9134 9135static PyObject * 9136unicodeiter_next(unicodeiterobject *it) 9137{ 9138 PyUnicodeObject *seq; 9139 PyObject *item; 9140 9141 assert(it != NULL); 9142 seq = it->it_seq; 9143 if (seq == NULL) 9144 return NULL; 9145 assert(PyUnicode_Check(seq)); 9146 9147 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9148 item = PyUnicode_FromUnicode( 9149 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9150 if (item != NULL) 9151 ++it->it_index; 9152 return item; 9153 } 9154 9155 Py_DECREF(seq); 9156 it->it_seq = NULL; 9157 return NULL; 9158} 9159 9160static PyObject * 9161unicodeiter_len(unicodeiterobject *it) 9162{ 9163 Py_ssize_t len = 0; 9164 if (it->it_seq) 9165 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9166 return PyInt_FromSsize_t(len); 9167} 9168 9169PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9170 9171static PyMethodDef unicodeiter_methods[] = { 9172 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9173 length_hint_doc}, 9174 {NULL, NULL} /* sentinel */ 9175}; 9176 9177PyTypeObject PyUnicodeIter_Type = { 9178 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9179 "unicodeiterator", /* tp_name */ 9180 sizeof(unicodeiterobject), /* tp_basicsize */ 9181 0, /* tp_itemsize */ 9182 /* methods */ 9183 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9184 0, /* tp_print */ 9185 0, /* tp_getattr */ 9186 0, /* tp_setattr */ 9187 0, /* tp_compare */ 9188 0, /* tp_repr */ 9189 0, /* tp_as_number */ 9190 0, /* tp_as_sequence */ 9191 0, /* tp_as_mapping */ 9192 0, /* tp_hash */ 9193 0, /* tp_call */ 9194 0, /* tp_str */ 9195 PyObject_GenericGetAttr, /* tp_getattro */ 9196 0, /* tp_setattro */ 9197 0, /* tp_as_buffer */ 9198 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9199 0, /* tp_doc */ 9200 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9201 0, /* tp_clear */ 9202 0, /* tp_richcompare */ 9203 0, /* tp_weaklistoffset */ 9204 PyObject_SelfIter, /* tp_iter */ 9205 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9206 unicodeiter_methods, /* tp_methods */ 9207 0, 9208}; 9209 9210static PyObject * 9211unicode_iter(PyObject *seq) 9212{ 9213 unicodeiterobject *it; 9214 9215 if (!PyUnicode_Check(seq)) { 9216 PyErr_BadInternalCall(); 9217 return NULL; 9218 } 9219 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9220 if (it == NULL) 9221 return NULL; 9222 it->it_index = 0; 9223 Py_INCREF(seq); 9224 it->it_seq = (PyUnicodeObject *)seq; 9225 _PyObject_GC_TRACK(it); 9226 return (PyObject *)it; 9227} 9228 9229size_t 9230Py_UNICODE_strlen(const Py_UNICODE *u) 9231{ 9232 int res = 0; 9233 while(*u++) 9234 res++; 9235 return res; 9236} 9237 9238Py_UNICODE* 9239Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9240{ 9241 Py_UNICODE *u = s1; 9242 while ((*u++ = *s2++)); 9243 return s1; 9244} 9245 9246Py_UNICODE* 9247Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9248{ 9249 Py_UNICODE *u = s1; 9250 while ((*u++ = *s2++)) 9251 if (n-- == 0) 9252 break; 9253 return s1; 9254} 9255 9256int 9257Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9258{ 9259 while (*s1 && *s2 && *s1 == *s2) 9260 s1++, s2++; 9261 if (*s1 && *s2) 9262 return (*s1 < *s2) ? -1 : +1; 9263 if (*s1) 9264 return 1; 9265 if (*s2) 9266 return -1; 9267 return 0; 9268} 9269 9270Py_UNICODE* 9271Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9272{ 9273 const Py_UNICODE *p; 9274 for (p = s; *p; p++) 9275 if (*p == c) 9276 return (Py_UNICODE*)p; 9277 return NULL; 9278} 9279 9280 9281#ifdef __cplusplus 9282} 9283#endif 9284 9285 9286/* 9287Local variables: 9288c-basic-offset: 4 9289indent-tabs-mode: nil 9290End: 9291*/ 9292