unicodeobject.c revision 70a237179f1213b0c180898b6e1f0b6c4e9cd11c
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44 45#include "unicodeobject.h" 46#include "ucnhash.h" 47 48#include "formatter_unicode.h" 49 50#ifdef MS_WINDOWS 51#include <windows.h> 52#endif 53 54/* Limit for the Unicode object free list */ 55 56#define MAX_UNICODE_FREELIST_SIZE 1024 57 58/* Limit for the Unicode object free list stay alive optimization. 59 60 The implementation will keep allocated Unicode memory intact for 61 all objects on the free list having a size less than this 62 limit. This reduces malloc() overhead for small Unicode objects. 63 64 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 65 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 66 malloc()-overhead) bytes of unused garbage. 67 68 Setting the limit to 0 effectively turns the feature off. 69 70 Note: This is an experimental feature ! If you get core dumps when 71 using Unicode objects, turn this feature off. 72 73*/ 74 75#define KEEPALIVE_SIZE_LIMIT 9 76 77/* Endianness switches; defaults to little endian */ 78 79#ifdef WORDS_BIGENDIAN 80# define BYTEORDER_IS_BIG_ENDIAN 81#else 82# define BYTEORDER_IS_LITTLE_ENDIAN 83#endif 84 85/* --- Globals ------------------------------------------------------------ 86 87 The globals are initialized by the _PyUnicode_Init() API and should 88 not be used before calling that API. 89 90*/ 91 92 93#ifdef __cplusplus 94extern "C" { 95#endif 96 97/* This dictionary holds all interned unicode strings. Note that references 98 to strings in this dictionary are *not* counted in the string's ob_refcnt. 99 When the interned string reaches a refcnt of 0 the string deallocation 100 function will delete the reference from this dictionary. 101 102 Another way to look at this is that to say that the actual reference 103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0) 104*/ 105static PyObject *interned; 106 107/* Free list for Unicode objects */ 108static PyUnicodeObject *unicode_freelist; 109static int unicode_freelist_size; 110 111/* The empty Unicode object is shared to improve performance. */ 112static PyUnicodeObject *unicode_empty; 113 114/* Single character Unicode strings in the Latin-1 range are being 115 shared as well. */ 116static PyUnicodeObject *unicode_latin1[256]; 117 118/* Default encoding to use and assume when NULL is passed as encoding 119 parameter; it is fixed to "utf-8". Always use the 120 PyUnicode_GetDefaultEncoding() API to access this global. */ 121static const char unicode_default_encoding[] = "utf-8"; 122 123Py_UNICODE 124PyUnicode_GetMax(void) 125{ 126#ifdef Py_UNICODE_WIDE 127 return 0x10FFFF; 128#else 129 /* This is actually an illegal character, so it should 130 not be passed to unichr. */ 131 return 0xFFFF; 132#endif 133} 134 135/* --- Bloom Filters ----------------------------------------------------- */ 136 137/* stuff to implement simple "bloom filters" for Unicode characters. 138 to keep things simple, we use a single bitmask, using the least 5 139 bits from each unicode characters as the bit index. */ 140 141/* the linebreak mask is set up by Unicode_Init below */ 142 143#define BLOOM_MASK unsigned long 144 145static BLOOM_MASK bloom_linebreak; 146 147#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 148 149#define BLOOM_LINEBREAK(ch)\ 150 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) 151 152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 153{ 154 /* calculate simple bloom-style bitmask for a given unicode string */ 155 156 long mask; 157 Py_ssize_t i; 158 159 mask = 0; 160 for (i = 0; i < len; i++) 161 mask |= (1 << (ptr[i] & 0x1F)); 162 163 return mask; 164} 165 166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 167{ 168 Py_ssize_t i; 169 170 for (i = 0; i < setlen; i++) 171 if (set[i] == chr) 172 return 1; 173 174 return 0; 175} 176 177#define BLOOM_MEMBER(mask, chr, set, setlen)\ 178 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 179 180/* --- Unicode Object ----------------------------------------------------- */ 181 182static 183int unicode_resize(register PyUnicodeObject *unicode, 184 Py_ssize_t length) 185{ 186 void *oldstr; 187 188 /* Shortcut if there's nothing much to do. */ 189 if (unicode->length == length) 190 goto reset; 191 192 /* Resizing shared object (unicode_empty or single character 193 objects) in-place is not allowed. Use PyUnicode_Resize() 194 instead ! */ 195 196 if (unicode == unicode_empty || 197 (unicode->length == 1 && 198 unicode->str[0] < 256U && 199 unicode_latin1[unicode->str[0]] == unicode)) { 200 PyErr_SetString(PyExc_SystemError, 201 "can't resize shared unicode objects"); 202 return -1; 203 } 204 205 /* We allocate one more byte to make sure the string is Ux0000 terminated. 206 The overallocation is also used by fastsearch, which assumes that it's 207 safe to look at str[length] (without making any assumptions about what 208 it contains). */ 209 210 oldstr = unicode->str; 211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 212 if (!unicode->str) { 213 unicode->str = (Py_UNICODE *)oldstr; 214 PyErr_NoMemory(); 215 return -1; 216 } 217 unicode->str[length] = 0; 218 unicode->length = length; 219 220 reset: 221 /* Reset the object caches */ 222 if (unicode->defenc) { 223 Py_DECREF(unicode->defenc); 224 unicode->defenc = NULL; 225 } 226 unicode->hash = -1; 227 228 return 0; 229} 230 231/* We allocate one more byte to make sure the string is 232 Ux0000 terminated; some code (e.g. new_identifier) 233 relies on that. 234 235 XXX This allocator could further be enhanced by assuring that the 236 free list never reduces its size below 1. 237 238*/ 239 240static 241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 242{ 243 register PyUnicodeObject *unicode; 244 245 /* Optimization for empty strings */ 246 if (length == 0 && unicode_empty != NULL) { 247 Py_INCREF(unicode_empty); 248 return unicode_empty; 249 } 250 251 /* Unicode freelist & memory allocation */ 252 if (unicode_freelist) { 253 unicode = unicode_freelist; 254 unicode_freelist = *(PyUnicodeObject **)unicode; 255 unicode_freelist_size--; 256 if (unicode->str) { 257 /* Keep-Alive optimization: we only upsize the buffer, 258 never downsize it. */ 259 if ((unicode->length < length) && 260 unicode_resize(unicode, length) < 0) { 261 PyMem_DEL(unicode->str); 262 goto onError; 263 } 264 } 265 else { 266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 267 } 268 PyObject_INIT(unicode, &PyUnicode_Type); 269 } 270 else { 271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 272 if (unicode == NULL) 273 return NULL; 274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 275 } 276 277 if (!unicode->str) { 278 PyErr_NoMemory(); 279 goto onError; 280 } 281 /* Initialize the first element to guard against cases where 282 * the caller fails before initializing str -- unicode_resize() 283 * reads str[0], and the Keep-Alive optimization can keep memory 284 * allocated for str alive across a call to unicode_dealloc(unicode). 285 * We don't want unicode_resize to read uninitialized memory in 286 * that case. 287 */ 288 unicode->str[0] = 0; 289 unicode->str[length] = 0; 290 unicode->length = length; 291 unicode->hash = -1; 292 unicode->state = 0; 293 unicode->defenc = NULL; 294 return unicode; 295 296 onError: 297 _Py_ForgetReference((PyObject *)unicode); 298 PyObject_Del(unicode); 299 return NULL; 300} 301 302static 303void unicode_dealloc(register PyUnicodeObject *unicode) 304{ 305 switch (PyUnicode_CHECK_INTERNED(unicode)) { 306 case SSTATE_NOT_INTERNED: 307 break; 308 309 case SSTATE_INTERNED_MORTAL: 310 /* revive dead object temporarily for DelItem */ 311 Py_Refcnt(unicode) = 3; 312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 313 Py_FatalError( 314 "deletion of interned unicode string failed"); 315 break; 316 317 case SSTATE_INTERNED_IMMORTAL: 318 Py_FatalError("Immortal interned unicode string died."); 319 320 default: 321 Py_FatalError("Inconsistent interned unicode string state."); 322 } 323 324 if (PyUnicode_CheckExact(unicode) && 325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 326 /* Keep-Alive optimization */ 327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 328 PyMem_DEL(unicode->str); 329 unicode->str = NULL; 330 unicode->length = 0; 331 } 332 if (unicode->defenc) { 333 Py_DECREF(unicode->defenc); 334 unicode->defenc = NULL; 335 } 336 /* Add to free list */ 337 *(PyUnicodeObject **)unicode = unicode_freelist; 338 unicode_freelist = unicode; 339 unicode_freelist_size++; 340 } 341 else { 342 PyMem_DEL(unicode->str); 343 Py_XDECREF(unicode->defenc); 344 Py_Type(unicode)->tp_free((PyObject *)unicode); 345 } 346} 347 348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 349{ 350 register PyUnicodeObject *v; 351 352 /* Argument checks */ 353 if (unicode == NULL) { 354 PyErr_BadInternalCall(); 355 return -1; 356 } 357 v = (PyUnicodeObject *)*unicode; 358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) { 359 PyErr_BadInternalCall(); 360 return -1; 361 } 362 363 /* Resizing unicode_empty and single character objects is not 364 possible since these are being shared. We simply return a fresh 365 copy with the same Unicode content. */ 366 if (v->length != length && 367 (v == unicode_empty || v->length == 1)) { 368 PyUnicodeObject *w = _PyUnicode_New(length); 369 if (w == NULL) 370 return -1; 371 Py_UNICODE_COPY(w->str, v->str, 372 length < v->length ? length : v->length); 373 Py_DECREF(*unicode); 374 *unicode = (PyObject *)w; 375 return 0; 376 } 377 378 /* Note that we don't have to modify *unicode for unshared Unicode 379 objects, since we can modify them in-place. */ 380 return unicode_resize(v, length); 381} 382 383/* Internal API for use in unicodeobject.c only ! */ 384#define _PyUnicode_Resize(unicodevar, length) \ 385 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 386 387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 388 Py_ssize_t size) 389{ 390 PyUnicodeObject *unicode; 391 392 /* If the Unicode data is known at construction time, we can apply 393 some optimizations which share commonly used objects. */ 394 if (u != NULL) { 395 396 /* Optimization for empty strings */ 397 if (size == 0 && unicode_empty != NULL) { 398 Py_INCREF(unicode_empty); 399 return (PyObject *)unicode_empty; 400 } 401 402 /* Single character Unicode objects in the Latin-1 range are 403 shared when using this constructor */ 404 if (size == 1 && *u < 256) { 405 unicode = unicode_latin1[*u]; 406 if (!unicode) { 407 unicode = _PyUnicode_New(1); 408 if (!unicode) 409 return NULL; 410 unicode->str[0] = *u; 411 unicode_latin1[*u] = unicode; 412 } 413 Py_INCREF(unicode); 414 return (PyObject *)unicode; 415 } 416 } 417 418 unicode = _PyUnicode_New(size); 419 if (!unicode) 420 return NULL; 421 422 /* Copy the Unicode data into the new object */ 423 if (u != NULL) 424 Py_UNICODE_COPY(unicode->str, u, size); 425 426 return (PyObject *)unicode; 427} 428 429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 430{ 431 PyUnicodeObject *unicode; 432 /* If the Unicode data is known at construction time, we can apply 433 some optimizations which share commonly used objects. 434 Also, this means the input must be UTF-8, so fall back to the 435 UTF-8 decoder at the end. */ 436 if (u != NULL) { 437 438 /* Optimization for empty strings */ 439 if (size == 0 && unicode_empty != NULL) { 440 Py_INCREF(unicode_empty); 441 return (PyObject *)unicode_empty; 442 } 443 444 /* Single characters are shared when using this constructor. 445 Restrict to ASCII, since the input must be UTF-8. */ 446 if (size == 1 && Py_CHARMASK(*u) < 128) { 447 unicode = unicode_latin1[Py_CHARMASK(*u)]; 448 if (!unicode) { 449 unicode = _PyUnicode_New(1); 450 if (!unicode) 451 return NULL; 452 unicode->str[0] = Py_CHARMASK(*u); 453 unicode_latin1[Py_CHARMASK(*u)] = unicode; 454 } 455 Py_INCREF(unicode); 456 return (PyObject *)unicode; 457 } 458 459 return PyUnicode_DecodeUTF8(u, size, NULL); 460 } 461 462 unicode = _PyUnicode_New(size); 463 if (!unicode) 464 return NULL; 465 466 return (PyObject *)unicode; 467} 468 469PyObject *PyUnicode_FromString(const char *u) 470{ 471 size_t size = strlen(u); 472 if (size > PY_SSIZE_T_MAX) { 473 PyErr_SetString(PyExc_OverflowError, "input too long"); 474 return NULL; 475 } 476 477 return PyUnicode_FromStringAndSize(u, size); 478} 479 480#ifdef HAVE_WCHAR_H 481 482PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 483 Py_ssize_t size) 484{ 485 PyUnicodeObject *unicode; 486 487 if (w == NULL) { 488 PyErr_BadInternalCall(); 489 return NULL; 490 } 491 492 unicode = _PyUnicode_New(size); 493 if (!unicode) 494 return NULL; 495 496 /* Copy the wchar_t data into the new object */ 497#ifdef HAVE_USABLE_WCHAR_T 498 memcpy(unicode->str, w, size * sizeof(wchar_t)); 499#else 500 { 501 register Py_UNICODE *u; 502 register Py_ssize_t i; 503 u = PyUnicode_AS_UNICODE(unicode); 504 for (i = size; i > 0; i--) 505 *u++ = *w++; 506 } 507#endif 508 509 return (PyObject *)unicode; 510} 511 512static void 513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 514{ 515 *fmt++ = '%'; 516 if (width) { 517 if (zeropad) 518 *fmt++ = '0'; 519 fmt += sprintf(fmt, "%d", width); 520 } 521 if (precision) 522 fmt += sprintf(fmt, ".%d", precision); 523 if (longflag) 524 *fmt++ = 'l'; 525 else if (size_tflag) { 526 char *f = PY_FORMAT_SIZE_T; 527 while (*f) 528 *fmt++ = *f++; 529 } 530 *fmt++ = c; 531 *fmt = '\0'; 532} 533 534#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 535 536PyObject * 537PyUnicode_FromFormatV(const char *format, va_list vargs) 538{ 539 va_list count; 540 Py_ssize_t callcount = 0; 541 PyObject **callresults = NULL; 542 PyObject **callresult = NULL; 543 Py_ssize_t n = 0; 544 int width = 0; 545 int precision = 0; 546 int zeropad; 547 const char* f; 548 Py_UNICODE *s; 549 PyObject *string; 550 /* used by sprintf */ 551 char buffer[21]; 552 /* use abuffer instead of buffer, if we need more space 553 * (which can happen if there's a format specifier with width). */ 554 char *abuffer = NULL; 555 char *realbuffer; 556 Py_ssize_t abuffersize = 0; 557 char fmt[60]; /* should be enough for %0width.precisionld */ 558 const char *copy; 559 560#ifdef VA_LIST_IS_ARRAY 561 Py_MEMCPY(count, vargs, sizeof(va_list)); 562#else 563#ifdef __va_copy 564 __va_copy(count, vargs); 565#else 566 count = vargs; 567#endif 568#endif 569 /* step 1: count the number of %S/%R format specifications 570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects 571 * once during step 3 and put the result in an array) */ 572 for (f = format; *f; f++) { 573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) 574 ++callcount; 575 } 576 /* step 2: allocate memory for the results of 577 * PyObject_Unicode()/PyObject_Repr() calls */ 578 if (callcount) { 579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount); 580 if (!callresults) { 581 PyErr_NoMemory(); 582 return NULL; 583 } 584 callresult = callresults; 585 } 586 /* step 3: figure out how large a buffer we need */ 587 for (f = format; *f; f++) { 588 if (*f == '%') { 589 const char* p = f; 590 width = 0; 591 while (isdigit(Py_CHARMASK(*f))) 592 width = (width*10) + *f++ - '0'; 593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) 594 ; 595 596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 597 * they don't affect the amount of space we reserve. 598 */ 599 if ((*f == 'l' || *f == 'z') && 600 (f[1] == 'd' || f[1] == 'u')) 601 ++f; 602 603 switch (*f) { 604 case 'c': 605 (void)va_arg(count, int); 606 /* fall through... */ 607 case '%': 608 n++; 609 break; 610 case 'd': case 'u': case 'i': case 'x': 611 (void) va_arg(count, int); 612 /* 20 bytes is enough to hold a 64-bit 613 integer. Decimal takes the most space. 614 This isn't enough for octal. 615 If a width is specified we need more 616 (which we allocate later). */ 617 if (width < 20) 618 width = 20; 619 n += width; 620 if (abuffersize < width) 621 abuffersize = width; 622 break; 623 case 's': 624 { 625 /* UTF-8 */ 626 unsigned char*s; 627 s = va_arg(count, unsigned char*); 628 while (*s) { 629 if (*s < 128) { 630 n++; s++; 631 } else if (*s < 0xc0) { 632 /* invalid UTF-8 */ 633 n++; s++; 634 } else if (*s < 0xc0) { 635 n++; 636 s++; if(!*s)break; 637 s++; 638 } else if (*s < 0xe0) { 639 n++; 640 s++; if(!*s)break; 641 s++; if(!*s)break; 642 s++; 643 } else { 644 #ifdef Py_UNICODE_WIDE 645 n++; 646 #else 647 n+=2; 648 #endif 649 s++; if(!*s)break; 650 s++; if(!*s)break; 651 s++; if(!*s)break; 652 s++; 653 } 654 } 655 break; 656 } 657 case 'U': 658 { 659 PyObject *obj = va_arg(count, PyObject *); 660 assert(obj && PyUnicode_Check(obj)); 661 n += PyUnicode_GET_SIZE(obj); 662 break; 663 } 664 case 'V': 665 { 666 PyObject *obj = va_arg(count, PyObject *); 667 const char *str = va_arg(count, const char *); 668 assert(obj || str); 669 assert(!obj || PyUnicode_Check(obj)); 670 if (obj) 671 n += PyUnicode_GET_SIZE(obj); 672 else 673 n += strlen(str); 674 break; 675 } 676 case 'S': 677 { 678 PyObject *obj = va_arg(count, PyObject *); 679 PyObject *str; 680 assert(obj); 681 str = PyObject_Unicode(obj); 682 if (!str) 683 goto fail; 684 n += PyUnicode_GET_SIZE(str); 685 /* Remember the str and switch to the next slot */ 686 *callresult++ = str; 687 break; 688 } 689 case 'R': 690 { 691 PyObject *obj = va_arg(count, PyObject *); 692 PyObject *repr; 693 assert(obj); 694 repr = PyObject_Repr(obj); 695 if (!repr) 696 goto fail; 697 n += PyUnicode_GET_SIZE(repr); 698 /* Remember the repr and switch to the next slot */ 699 *callresult++ = repr; 700 break; 701 } 702 case 'p': 703 (void) va_arg(count, int); 704 /* maximum 64-bit pointer representation: 705 * 0xffffffffffffffff 706 * so 19 characters is enough. 707 * XXX I count 18 -- what's the extra for? 708 */ 709 n += 19; 710 break; 711 default: 712 /* if we stumble upon an unknown 713 formatting code, copy the rest of 714 the format string to the output 715 string. (we cannot just skip the 716 code, since there's no way to know 717 what's in the argument list) */ 718 n += strlen(p); 719 goto expand; 720 } 721 } else 722 n++; 723 } 724 expand: 725 if (abuffersize > 20) { 726 abuffer = PyMem_Malloc(abuffersize); 727 if (!abuffer) { 728 PyErr_NoMemory(); 729 goto fail; 730 } 731 realbuffer = abuffer; 732 } 733 else 734 realbuffer = buffer; 735 /* step 4: fill the buffer */ 736 /* Since we've analyzed how much space we need for the worst case, 737 we don't have to resize the string. 738 There can be no errors beyond this point. */ 739 string = PyUnicode_FromUnicode(NULL, n); 740 if (!string) 741 goto fail; 742 743 s = PyUnicode_AS_UNICODE(string); 744 callresult = callresults; 745 746 for (f = format; *f; f++) { 747 if (*f == '%') { 748 const char* p = f++; 749 int longflag = 0; 750 int size_tflag = 0; 751 zeropad = (*f == '0'); 752 /* parse the width.precision part */ 753 width = 0; 754 while (isdigit(Py_CHARMASK(*f))) 755 width = (width*10) + *f++ - '0'; 756 precision = 0; 757 if (*f == '.') { 758 f++; 759 while (isdigit(Py_CHARMASK(*f))) 760 precision = (precision*10) + *f++ - '0'; 761 } 762 /* handle the long flag, but only for %ld and %lu. 763 others can be added when necessary. */ 764 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 765 longflag = 1; 766 ++f; 767 } 768 /* handle the size_t flag. */ 769 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 770 size_tflag = 1; 771 ++f; 772 } 773 774 switch (*f) { 775 case 'c': 776 *s++ = va_arg(vargs, int); 777 break; 778 case 'd': 779 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 780 if (longflag) 781 sprintf(realbuffer, fmt, va_arg(vargs, long)); 782 else if (size_tflag) 783 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 784 else 785 sprintf(realbuffer, fmt, va_arg(vargs, int)); 786 appendstring(realbuffer); 787 break; 788 case 'u': 789 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 790 if (longflag) 791 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 792 else if (size_tflag) 793 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 794 else 795 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 796 appendstring(realbuffer); 797 break; 798 case 'i': 799 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 800 sprintf(realbuffer, fmt, va_arg(vargs, int)); 801 appendstring(realbuffer); 802 break; 803 case 'x': 804 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 805 sprintf(realbuffer, fmt, va_arg(vargs, int)); 806 appendstring(realbuffer); 807 break; 808 case 's': 809 { 810 /* Parameter must be UTF-8 encoded. 811 In case of encoding errors, use 812 the replacement character. */ 813 PyObject *u; 814 p = va_arg(vargs, char*); 815 u = PyUnicode_DecodeUTF8(p, strlen(p), 816 "replace"); 817 if (!u) 818 goto fail; 819 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 820 PyUnicode_GET_SIZE(u)); 821 s += PyUnicode_GET_SIZE(u); 822 Py_DECREF(u); 823 break; 824 } 825 case 'U': 826 { 827 PyObject *obj = va_arg(vargs, PyObject *); 828 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 829 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 830 s += size; 831 break; 832 } 833 case 'V': 834 { 835 PyObject *obj = va_arg(vargs, PyObject *); 836 const char *str = va_arg(vargs, const char *); 837 if (obj) { 838 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 839 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 840 s += size; 841 } else { 842 appendstring(str); 843 } 844 break; 845 } 846 case 'S': 847 case 'R': 848 { 849 Py_UNICODE *ucopy; 850 Py_ssize_t usize; 851 Py_ssize_t upos; 852 /* unused, since we already have the result */ 853 (void) va_arg(vargs, PyObject *); 854 ucopy = PyUnicode_AS_UNICODE(*callresult); 855 usize = PyUnicode_GET_SIZE(*callresult); 856 for (upos = 0; upos<usize;) 857 *s++ = ucopy[upos++]; 858 /* We're done with the unicode()/repr() => forget it */ 859 Py_DECREF(*callresult); 860 /* switch to next unicode()/repr() result */ 861 ++callresult; 862 break; 863 } 864 case 'p': 865 sprintf(buffer, "%p", va_arg(vargs, void*)); 866 /* %p is ill-defined: ensure leading 0x. */ 867 if (buffer[1] == 'X') 868 buffer[1] = 'x'; 869 else if (buffer[1] != 'x') { 870 memmove(buffer+2, buffer, strlen(buffer)+1); 871 buffer[0] = '0'; 872 buffer[1] = 'x'; 873 } 874 appendstring(buffer); 875 break; 876 case '%': 877 *s++ = '%'; 878 break; 879 default: 880 appendstring(p); 881 goto end; 882 } 883 } else 884 *s++ = *f; 885 } 886 887 end: 888 if (callresults) 889 PyMem_Free(callresults); 890 if (abuffer) 891 PyMem_Free(abuffer); 892 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 893 return string; 894 fail: 895 if (callresults) { 896 PyObject **callresult2 = callresults; 897 while (callresult2 < callresult) { 898 Py_DECREF(*callresult2); 899 ++callresult2; 900 } 901 PyMem_Free(callresults); 902 } 903 if (abuffer) 904 PyMem_Free(abuffer); 905 return NULL; 906} 907 908#undef appendstring 909 910PyObject * 911PyUnicode_FromFormat(const char *format, ...) 912{ 913 PyObject* ret; 914 va_list vargs; 915 916#ifdef HAVE_STDARG_PROTOTYPES 917 va_start(vargs, format); 918#else 919 va_start(vargs); 920#endif 921 ret = PyUnicode_FromFormatV(format, vargs); 922 va_end(vargs); 923 return ret; 924} 925 926Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 927 wchar_t *w, 928 Py_ssize_t size) 929{ 930 if (unicode == NULL) { 931 PyErr_BadInternalCall(); 932 return -1; 933 } 934 935 /* If possible, try to copy the 0-termination as well */ 936 if (size > PyUnicode_GET_SIZE(unicode)) 937 size = PyUnicode_GET_SIZE(unicode) + 1; 938 939#ifdef HAVE_USABLE_WCHAR_T 940 memcpy(w, unicode->str, size * sizeof(wchar_t)); 941#else 942 { 943 register Py_UNICODE *u; 944 register Py_ssize_t i; 945 u = PyUnicode_AS_UNICODE(unicode); 946 for (i = size; i > 0; i--) 947 *w++ = *u++; 948 } 949#endif 950 951 if (size > PyUnicode_GET_SIZE(unicode)) 952 return PyUnicode_GET_SIZE(unicode); 953 else 954 return size; 955} 956 957#endif 958 959PyObject *PyUnicode_FromOrdinal(int ordinal) 960{ 961 Py_UNICODE s[2]; 962 963 if (ordinal < 0 || ordinal > 0x10ffff) { 964 PyErr_SetString(PyExc_ValueError, 965 "chr() arg not in range(0x110000)"); 966 return NULL; 967 } 968 969#ifndef Py_UNICODE_WIDE 970 if (ordinal > 0xffff) { 971 ordinal -= 0x10000; 972 s[0] = 0xD800 | (ordinal >> 10); 973 s[1] = 0xDC00 | (ordinal & 0x3FF); 974 return PyUnicode_FromUnicode(s, 2); 975 } 976#endif 977 978 s[0] = (Py_UNICODE)ordinal; 979 return PyUnicode_FromUnicode(s, 1); 980} 981 982PyObject *PyUnicode_FromObject(register PyObject *obj) 983{ 984 /* XXX Perhaps we should make this API an alias of 985 PyObject_Unicode() instead ?! */ 986 if (PyUnicode_CheckExact(obj)) { 987 Py_INCREF(obj); 988 return obj; 989 } 990 if (PyUnicode_Check(obj)) { 991 /* For a Unicode subtype that's not a Unicode object, 992 return a true Unicode object with the same data. */ 993 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 994 PyUnicode_GET_SIZE(obj)); 995 } 996 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 997} 998 999PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1000 const char *encoding, 1001 const char *errors) 1002{ 1003 const char *s = NULL; 1004 Py_ssize_t len; 1005 PyObject *v; 1006 1007 if (obj == NULL) { 1008 PyErr_BadInternalCall(); 1009 return NULL; 1010 } 1011 1012 if (PyUnicode_Check(obj)) { 1013 PyErr_SetString(PyExc_TypeError, 1014 "decoding Unicode is not supported"); 1015 return NULL; 1016 } 1017 1018 /* Coerce object */ 1019 if (PyString_Check(obj)) { 1020 s = PyString_AS_STRING(obj); 1021 len = PyString_GET_SIZE(obj); 1022 } 1023 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1024 /* Overwrite the error message with something more useful in 1025 case of a TypeError. */ 1026 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1027 PyErr_Format(PyExc_TypeError, 1028 "coercing to Unicode: need string or buffer, " 1029 "%.80s found", 1030 Py_Type(obj)->tp_name); 1031 goto onError; 1032 } 1033 1034 /* Convert to Unicode */ 1035 if (len == 0) { 1036 Py_INCREF(unicode_empty); 1037 v = (PyObject *)unicode_empty; 1038 } 1039 else 1040 v = PyUnicode_Decode(s, len, encoding, errors); 1041 1042 return v; 1043 1044 onError: 1045 return NULL; 1046} 1047 1048PyObject *PyUnicode_Decode(const char *s, 1049 Py_ssize_t size, 1050 const char *encoding, 1051 const char *errors) 1052{ 1053 PyObject *buffer = NULL, *unicode; 1054 Py_buffer info; 1055 1056 if (encoding == NULL) 1057 encoding = PyUnicode_GetDefaultEncoding(); 1058 1059 /* Shortcuts for common default encodings */ 1060 if (strcmp(encoding, "utf-8") == 0) 1061 return PyUnicode_DecodeUTF8(s, size, errors); 1062 else if (strcmp(encoding, "latin-1") == 0) 1063 return PyUnicode_DecodeLatin1(s, size, errors); 1064#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1065 else if (strcmp(encoding, "mbcs") == 0) 1066 return PyUnicode_DecodeMBCS(s, size, errors); 1067#endif 1068 else if (strcmp(encoding, "ascii") == 0) 1069 return PyUnicode_DecodeASCII(s, size, errors); 1070 1071 /* Decode via the codec registry */ 1072 buffer = NULL; 1073 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0) 1074 goto onError; 1075 buffer = PyMemoryView_FromMemory(&info); 1076 if (buffer == NULL) 1077 goto onError; 1078 unicode = PyCodec_Decode(buffer, encoding, errors); 1079 if (unicode == NULL) 1080 goto onError; 1081 if (!PyUnicode_Check(unicode)) { 1082 PyErr_Format(PyExc_TypeError, 1083 "decoder did not return an unicode object (type=%.400s)", 1084 Py_Type(unicode)->tp_name); 1085 Py_DECREF(unicode); 1086 goto onError; 1087 } 1088 Py_DECREF(buffer); 1089 return unicode; 1090 1091 onError: 1092 Py_XDECREF(buffer); 1093 return NULL; 1094} 1095 1096PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1097 const char *encoding, 1098 const char *errors) 1099{ 1100 PyObject *v; 1101 1102 if (!PyUnicode_Check(unicode)) { 1103 PyErr_BadArgument(); 1104 goto onError; 1105 } 1106 1107 if (encoding == NULL) 1108 encoding = PyUnicode_GetDefaultEncoding(); 1109 1110 /* Decode via the codec registry */ 1111 v = PyCodec_Decode(unicode, encoding, errors); 1112 if (v == NULL) 1113 goto onError; 1114 return v; 1115 1116 onError: 1117 return NULL; 1118} 1119 1120PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1121 Py_ssize_t size, 1122 const char *encoding, 1123 const char *errors) 1124{ 1125 PyObject *v, *unicode; 1126 1127 unicode = PyUnicode_FromUnicode(s, size); 1128 if (unicode == NULL) 1129 return NULL; 1130 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1131 Py_DECREF(unicode); 1132 return v; 1133} 1134 1135PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1136 const char *encoding, 1137 const char *errors) 1138{ 1139 PyObject *v; 1140 1141 if (!PyUnicode_Check(unicode)) { 1142 PyErr_BadArgument(); 1143 goto onError; 1144 } 1145 1146 if (encoding == NULL) 1147 encoding = PyUnicode_GetDefaultEncoding(); 1148 1149 /* Encode via the codec registry */ 1150 v = PyCodec_Encode(unicode, encoding, errors); 1151 if (v == NULL) 1152 goto onError; 1153 return v; 1154 1155 onError: 1156 return NULL; 1157} 1158 1159PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1160 const char *encoding, 1161 const char *errors) 1162{ 1163 PyObject *v; 1164 1165 if (!PyUnicode_Check(unicode)) { 1166 PyErr_BadArgument(); 1167 goto onError; 1168 } 1169 1170 if (encoding == NULL) 1171 encoding = PyUnicode_GetDefaultEncoding(); 1172 1173 /* Shortcuts for common default encodings */ 1174 if (errors == NULL) { 1175 if (strcmp(encoding, "utf-8") == 0) 1176 return PyUnicode_AsUTF8String(unicode); 1177 else if (strcmp(encoding, "latin-1") == 0) 1178 return PyUnicode_AsLatin1String(unicode); 1179#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1180 else if (strcmp(encoding, "mbcs") == 0) 1181 return PyUnicode_AsMBCSString(unicode); 1182#endif 1183 else if (strcmp(encoding, "ascii") == 0) 1184 return PyUnicode_AsASCIIString(unicode); 1185 } 1186 1187 /* Encode via the codec registry */ 1188 v = PyCodec_Encode(unicode, encoding, errors); 1189 if (v == NULL) 1190 goto onError; 1191 if (!PyBytes_Check(v)) { 1192 if (PyString_Check(v)) { 1193 /* Old codec, turn it into bytes */ 1194 PyObject *b = PyBytes_FromObject(v); 1195 Py_DECREF(v); 1196 return b; 1197 } 1198 PyErr_Format(PyExc_TypeError, 1199 "encoder did not return a bytes object " 1200 "(type=%.400s, encoding=%.20s, errors=%.20s)", 1201 v->ob_type->tp_name, 1202 encoding ? encoding : "NULL", 1203 errors ? errors : "NULL"); 1204 Py_DECREF(v); 1205 goto onError; 1206 } 1207 return v; 1208 1209 onError: 1210 return NULL; 1211} 1212 1213PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1214 const char *errors) 1215{ 1216 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1217 PyObject *b; 1218 if (v) 1219 return v; 1220 if (errors != NULL) 1221 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1222 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1223 PyUnicode_GET_SIZE(unicode), 1224 NULL); 1225 if (!b) 1226 return NULL; 1227 v = PyString_FromStringAndSize(PyBytes_AsString(b), 1228 PyBytes_Size(b)); 1229 Py_DECREF(b); 1230 ((PyUnicodeObject *)unicode)->defenc = v; 1231 return v; 1232} 1233 1234char* 1235PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1236{ 1237 PyObject *str8; 1238 if (!PyUnicode_Check(unicode)) { 1239 PyErr_BadArgument(); 1240 return NULL; 1241 } 1242 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1243 if (str8 == NULL) 1244 return NULL; 1245 if (psize != NULL) 1246 *psize = PyString_GET_SIZE(str8); 1247 return PyString_AS_STRING(str8); 1248} 1249 1250char* 1251PyUnicode_AsString(PyObject *unicode) 1252{ 1253 return PyUnicode_AsStringAndSize(unicode, NULL); 1254} 1255 1256Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1257{ 1258 if (!PyUnicode_Check(unicode)) { 1259 PyErr_BadArgument(); 1260 goto onError; 1261 } 1262 return PyUnicode_AS_UNICODE(unicode); 1263 1264 onError: 1265 return NULL; 1266} 1267 1268Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1269{ 1270 if (!PyUnicode_Check(unicode)) { 1271 PyErr_BadArgument(); 1272 goto onError; 1273 } 1274 return PyUnicode_GET_SIZE(unicode); 1275 1276 onError: 1277 return -1; 1278} 1279 1280const char *PyUnicode_GetDefaultEncoding(void) 1281{ 1282 return unicode_default_encoding; 1283} 1284 1285int PyUnicode_SetDefaultEncoding(const char *encoding) 1286{ 1287 if (strcmp(encoding, unicode_default_encoding) != 0) { 1288 PyErr_Format(PyExc_ValueError, 1289 "Can only set default encoding to %s", 1290 unicode_default_encoding); 1291 return -1; 1292 } 1293 return 0; 1294} 1295 1296/* error handling callback helper: 1297 build arguments, call the callback and check the arguments, 1298 if no exception occurred, copy the replacement to the output 1299 and adjust various state variables. 1300 return 0 on success, -1 on error 1301*/ 1302 1303static 1304int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1305 const char *encoding, const char *reason, 1306 const char **input, const char **inend, Py_ssize_t *startinpos, 1307 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1308 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1309{ 1310 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1311 1312 PyObject *restuple = NULL; 1313 PyObject *repunicode = NULL; 1314 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1315 Py_ssize_t insize; 1316 Py_ssize_t requiredsize; 1317 Py_ssize_t newpos; 1318 Py_UNICODE *repptr; 1319 PyObject *inputobj = NULL; 1320 Py_ssize_t repsize; 1321 int res = -1; 1322 1323 if (*errorHandler == NULL) { 1324 *errorHandler = PyCodec_LookupError(errors); 1325 if (*errorHandler == NULL) 1326 goto onError; 1327 } 1328 1329 if (*exceptionObject == NULL) { 1330 *exceptionObject = PyUnicodeDecodeError_Create( 1331 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1332 if (*exceptionObject == NULL) 1333 goto onError; 1334 } 1335 else { 1336 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1337 goto onError; 1338 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1339 goto onError; 1340 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1341 goto onError; 1342 } 1343 1344 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1345 if (restuple == NULL) 1346 goto onError; 1347 if (!PyTuple_Check(restuple)) { 1348 PyErr_Format(PyExc_TypeError, &argparse[4]); 1349 goto onError; 1350 } 1351 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1352 goto onError; 1353 1354 /* Copy back the bytes variables, which might have been modified by the 1355 callback */ 1356 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1357 if (!inputobj) 1358 goto onError; 1359 if (!PyBytes_Check(inputobj)) { 1360 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1361 } 1362 *input = PyBytes_AS_STRING(inputobj); 1363 insize = PyBytes_GET_SIZE(inputobj); 1364 *inend = *input + insize; 1365 /* we can DECREF safely, as the exception has another reference, 1366 so the object won't go away. */ 1367 Py_DECREF(inputobj); 1368 1369 if (newpos<0) 1370 newpos = insize+newpos; 1371 if (newpos<0 || newpos>insize) { 1372 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1373 goto onError; 1374 } 1375 1376 /* need more space? (at least enough for what we 1377 have+the replacement+the rest of the string (starting 1378 at the new input position), so we won't have to check space 1379 when there are no errors in the rest of the string) */ 1380 repptr = PyUnicode_AS_UNICODE(repunicode); 1381 repsize = PyUnicode_GET_SIZE(repunicode); 1382 requiredsize = *outpos + repsize + insize-newpos; 1383 if (requiredsize > outsize) { 1384 if (requiredsize<2*outsize) 1385 requiredsize = 2*outsize; 1386 if (PyUnicode_Resize(output, requiredsize) < 0) 1387 goto onError; 1388 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1389 } 1390 *endinpos = newpos; 1391 *inptr = *input + newpos; 1392 Py_UNICODE_COPY(*outptr, repptr, repsize); 1393 *outptr += repsize; 1394 *outpos += repsize; 1395 1396 /* we made it! */ 1397 res = 0; 1398 1399 onError: 1400 Py_XDECREF(restuple); 1401 return res; 1402} 1403 1404/* --- UTF-7 Codec -------------------------------------------------------- */ 1405 1406/* see RFC2152 for details */ 1407 1408static 1409char utf7_special[128] = { 1410 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1411 encoded: 1412 0 - not special 1413 1 - special 1414 2 - whitespace (optional) 1415 3 - RFC2152 Set O (optional) */ 1416 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1417 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1418 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1419 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1420 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1422 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1423 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1424 1425}; 1426 1427/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1428 warnings about the comparison always being false; since 1429 utf7_special[0] is 1, we can safely make that one comparison 1430 true */ 1431 1432#define SPECIAL(c, encodeO, encodeWS) \ 1433 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1434 (encodeWS && (utf7_special[(c)] == 2)) || \ 1435 (encodeO && (utf7_special[(c)] == 3))) 1436 1437#define B64(n) \ 1438 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1439#define B64CHAR(c) \ 1440 (isalnum(c) || (c) == '+' || (c) == '/') 1441#define UB64(c) \ 1442 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1443 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 1444 1445#define ENCODE(out, ch, bits) \ 1446 while (bits >= 6) { \ 1447 *out++ = B64(ch >> (bits-6)); \ 1448 bits -= 6; \ 1449 } 1450 1451#define DECODE(out, ch, bits, surrogate) \ 1452 while (bits >= 16) { \ 1453 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1454 bits -= 16; \ 1455 if (surrogate) { \ 1456 /* We have already generated an error for the high surrogate \ 1457 so let's not bother seeing if the low surrogate is correct or not */ \ 1458 surrogate = 0; \ 1459 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1460 /* This is a surrogate pair. Unfortunately we can't represent \ 1461 it in a 16-bit character */ \ 1462 surrogate = 1; \ 1463 errmsg = "code pairs are not supported"; \ 1464 goto utf7Error; \ 1465 } else { \ 1466 *out++ = outCh; \ 1467 } \ 1468 } 1469 1470PyObject *PyUnicode_DecodeUTF7(const char *s, 1471 Py_ssize_t size, 1472 const char *errors) 1473{ 1474 const char *starts = s; 1475 Py_ssize_t startinpos; 1476 Py_ssize_t endinpos; 1477 Py_ssize_t outpos; 1478 const char *e; 1479 PyUnicodeObject *unicode; 1480 Py_UNICODE *p; 1481 const char *errmsg = ""; 1482 int inShift = 0; 1483 unsigned int bitsleft = 0; 1484 unsigned long charsleft = 0; 1485 int surrogate = 0; 1486 PyObject *errorHandler = NULL; 1487 PyObject *exc = NULL; 1488 1489 unicode = _PyUnicode_New(size); 1490 if (!unicode) 1491 return NULL; 1492 if (size == 0) 1493 return (PyObject *)unicode; 1494 1495 p = unicode->str; 1496 e = s + size; 1497 1498 while (s < e) { 1499 Py_UNICODE ch; 1500 restart: 1501 ch = *s; 1502 1503 if (inShift) { 1504 if ((ch == '-') || !B64CHAR(ch)) { 1505 inShift = 0; 1506 s++; 1507 1508 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1509 if (bitsleft >= 6) { 1510 /* The shift sequence has a partial character in it. If 1511 bitsleft < 6 then we could just classify it as padding 1512 but that is not the case here */ 1513 1514 errmsg = "partial character in shift sequence"; 1515 goto utf7Error; 1516 } 1517 /* According to RFC2152 the remaining bits should be zero. We 1518 choose to signal an error/insert a replacement character 1519 here so indicate the potential of a misencoded character. */ 1520 1521 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1522 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1523 errmsg = "non-zero padding bits in shift sequence"; 1524 goto utf7Error; 1525 } 1526 1527 if (ch == '-') { 1528 if ((s < e) && (*(s) == '-')) { 1529 *p++ = '-'; 1530 inShift = 1; 1531 } 1532 } else if (SPECIAL(ch,0,0)) { 1533 errmsg = "unexpected special character"; 1534 goto utf7Error; 1535 } else { 1536 *p++ = ch; 1537 } 1538 } else { 1539 charsleft = (charsleft << 6) | UB64(ch); 1540 bitsleft += 6; 1541 s++; 1542 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1543 } 1544 } 1545 else if ( ch == '+' ) { 1546 startinpos = s-starts; 1547 s++; 1548 if (s < e && *s == '-') { 1549 s++; 1550 *p++ = '+'; 1551 } else 1552 { 1553 inShift = 1; 1554 bitsleft = 0; 1555 } 1556 } 1557 else if (SPECIAL(ch,0,0)) { 1558 startinpos = s-starts; 1559 errmsg = "unexpected special character"; 1560 s++; 1561 goto utf7Error; 1562 } 1563 else { 1564 *p++ = ch; 1565 s++; 1566 } 1567 continue; 1568 utf7Error: 1569 outpos = p-PyUnicode_AS_UNICODE(unicode); 1570 endinpos = s-starts; 1571 if (unicode_decode_call_errorhandler( 1572 errors, &errorHandler, 1573 "utf7", errmsg, 1574 &starts, &e, &startinpos, &endinpos, &exc, &s, 1575 (PyObject **)&unicode, &outpos, &p)) 1576 goto onError; 1577 } 1578 1579 if (inShift) { 1580 outpos = p-PyUnicode_AS_UNICODE(unicode); 1581 endinpos = size; 1582 if (unicode_decode_call_errorhandler( 1583 errors, &errorHandler, 1584 "utf7", "unterminated shift sequence", 1585 &starts, &e, &startinpos, &endinpos, &exc, &s, 1586 (PyObject **)&unicode, &outpos, &p)) 1587 goto onError; 1588 if (s < e) 1589 goto restart; 1590 } 1591 1592 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1593 goto onError; 1594 1595 Py_XDECREF(errorHandler); 1596 Py_XDECREF(exc); 1597 return (PyObject *)unicode; 1598 1599onError: 1600 Py_XDECREF(errorHandler); 1601 Py_XDECREF(exc); 1602 Py_DECREF(unicode); 1603 return NULL; 1604} 1605 1606 1607PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1608 Py_ssize_t size, 1609 int encodeSetO, 1610 int encodeWhiteSpace, 1611 const char *errors) 1612{ 1613 PyObject *v; 1614 /* It might be possible to tighten this worst case */ 1615 Py_ssize_t cbAllocated = 5 * size; 1616 int inShift = 0; 1617 Py_ssize_t i = 0; 1618 unsigned int bitsleft = 0; 1619 unsigned long charsleft = 0; 1620 char * out; 1621 char * start; 1622 1623 if (size == 0) 1624 return PyBytes_FromStringAndSize(NULL, 0); 1625 1626 v = PyBytes_FromStringAndSize(NULL, cbAllocated); 1627 if (v == NULL) 1628 return NULL; 1629 1630 start = out = PyBytes_AS_STRING(v); 1631 for (;i < size; ++i) { 1632 Py_UNICODE ch = s[i]; 1633 1634 if (!inShift) { 1635 if (ch == '+') { 1636 *out++ = '+'; 1637 *out++ = '-'; 1638 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1639 charsleft = ch; 1640 bitsleft = 16; 1641 *out++ = '+'; 1642 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1643 inShift = bitsleft > 0; 1644 } else { 1645 *out++ = (char) ch; 1646 } 1647 } else { 1648 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1649 *out++ = B64(charsleft << (6-bitsleft)); 1650 charsleft = 0; 1651 bitsleft = 0; 1652 /* Characters not in the BASE64 set implicitly unshift the sequence 1653 so no '-' is required, except if the character is itself a '-' */ 1654 if (B64CHAR(ch) || ch == '-') { 1655 *out++ = '-'; 1656 } 1657 inShift = 0; 1658 *out++ = (char) ch; 1659 } else { 1660 bitsleft += 16; 1661 charsleft = (charsleft << 16) | ch; 1662 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1663 1664 /* If the next character is special then we dont' need to terminate 1665 the shift sequence. If the next character is not a BASE64 character 1666 or '-' then the shift sequence will be terminated implicitly and we 1667 don't have to insert a '-'. */ 1668 1669 if (bitsleft == 0) { 1670 if (i + 1 < size) { 1671 Py_UNICODE ch2 = s[i+1]; 1672 1673 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1674 1675 } else if (B64CHAR(ch2) || ch2 == '-') { 1676 *out++ = '-'; 1677 inShift = 0; 1678 } else { 1679 inShift = 0; 1680 } 1681 1682 } 1683 else { 1684 *out++ = '-'; 1685 inShift = 0; 1686 } 1687 } 1688 } 1689 } 1690 } 1691 if (bitsleft) { 1692 *out++= B64(charsleft << (6-bitsleft) ); 1693 *out++ = '-'; 1694 } 1695 1696 if (PyBytes_Resize(v, out - start)) { 1697 Py_DECREF(v); 1698 return NULL; 1699 } 1700 return v; 1701} 1702 1703#undef SPECIAL 1704#undef B64 1705#undef B64CHAR 1706#undef UB64 1707#undef ENCODE 1708#undef DECODE 1709 1710/* --- UTF-8 Codec -------------------------------------------------------- */ 1711 1712static 1713char utf8_code_length[256] = { 1714 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1715 illegal prefix. see RFC 2279 for details */ 1716 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1717 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1724 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1725 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1726 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1728 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1729 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1730 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1731 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1732}; 1733 1734PyObject *PyUnicode_DecodeUTF8(const char *s, 1735 Py_ssize_t size, 1736 const char *errors) 1737{ 1738 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1739} 1740 1741PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1742 Py_ssize_t size, 1743 const char *errors, 1744 Py_ssize_t *consumed) 1745{ 1746 const char *starts = s; 1747 int n; 1748 Py_ssize_t startinpos; 1749 Py_ssize_t endinpos; 1750 Py_ssize_t outpos; 1751 const char *e; 1752 PyUnicodeObject *unicode; 1753 Py_UNICODE *p; 1754 const char *errmsg = ""; 1755 PyObject *errorHandler = NULL; 1756 PyObject *exc = NULL; 1757 1758 /* Note: size will always be longer than the resulting Unicode 1759 character count */ 1760 unicode = _PyUnicode_New(size); 1761 if (!unicode) 1762 return NULL; 1763 if (size == 0) { 1764 if (consumed) 1765 *consumed = 0; 1766 return (PyObject *)unicode; 1767 } 1768 1769 /* Unpack UTF-8 encoded data */ 1770 p = unicode->str; 1771 e = s + size; 1772 1773 while (s < e) { 1774 Py_UCS4 ch = (unsigned char)*s; 1775 1776 if (ch < 0x80) { 1777 *p++ = (Py_UNICODE)ch; 1778 s++; 1779 continue; 1780 } 1781 1782 n = utf8_code_length[ch]; 1783 1784 if (s + n > e) { 1785 if (consumed) 1786 break; 1787 else { 1788 errmsg = "unexpected end of data"; 1789 startinpos = s-starts; 1790 endinpos = size; 1791 goto utf8Error; 1792 } 1793 } 1794 1795 switch (n) { 1796 1797 case 0: 1798 errmsg = "unexpected code byte"; 1799 startinpos = s-starts; 1800 endinpos = startinpos+1; 1801 goto utf8Error; 1802 1803 case 1: 1804 errmsg = "internal error"; 1805 startinpos = s-starts; 1806 endinpos = startinpos+1; 1807 goto utf8Error; 1808 1809 case 2: 1810 if ((s[1] & 0xc0) != 0x80) { 1811 errmsg = "invalid data"; 1812 startinpos = s-starts; 1813 endinpos = startinpos+2; 1814 goto utf8Error; 1815 } 1816 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1817 if (ch < 0x80) { 1818 startinpos = s-starts; 1819 endinpos = startinpos+2; 1820 errmsg = "illegal encoding"; 1821 goto utf8Error; 1822 } 1823 else 1824 *p++ = (Py_UNICODE)ch; 1825 break; 1826 1827 case 3: 1828 if ((s[1] & 0xc0) != 0x80 || 1829 (s[2] & 0xc0) != 0x80) { 1830 errmsg = "invalid data"; 1831 startinpos = s-starts; 1832 endinpos = startinpos+3; 1833 goto utf8Error; 1834 } 1835 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1836 if (ch < 0x0800) { 1837 /* Note: UTF-8 encodings of surrogates are considered 1838 legal UTF-8 sequences; 1839 1840 XXX For wide builds (UCS-4) we should probably try 1841 to recombine the surrogates into a single code 1842 unit. 1843 */ 1844 errmsg = "illegal encoding"; 1845 startinpos = s-starts; 1846 endinpos = startinpos+3; 1847 goto utf8Error; 1848 } 1849 else 1850 *p++ = (Py_UNICODE)ch; 1851 break; 1852 1853 case 4: 1854 if ((s[1] & 0xc0) != 0x80 || 1855 (s[2] & 0xc0) != 0x80 || 1856 (s[3] & 0xc0) != 0x80) { 1857 errmsg = "invalid data"; 1858 startinpos = s-starts; 1859 endinpos = startinpos+4; 1860 goto utf8Error; 1861 } 1862 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1863 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1864 /* validate and convert to UTF-16 */ 1865 if ((ch < 0x10000) /* minimum value allowed for 4 1866 byte encoding */ 1867 || (ch > 0x10ffff)) /* maximum value allowed for 1868 UTF-16 */ 1869 { 1870 errmsg = "illegal encoding"; 1871 startinpos = s-starts; 1872 endinpos = startinpos+4; 1873 goto utf8Error; 1874 } 1875#ifdef Py_UNICODE_WIDE 1876 *p++ = (Py_UNICODE)ch; 1877#else 1878 /* compute and append the two surrogates: */ 1879 1880 /* translate from 10000..10FFFF to 0..FFFF */ 1881 ch -= 0x10000; 1882 1883 /* high surrogate = top 10 bits added to D800 */ 1884 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1885 1886 /* low surrogate = bottom 10 bits added to DC00 */ 1887 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1888#endif 1889 break; 1890 1891 default: 1892 /* Other sizes are only needed for UCS-4 */ 1893 errmsg = "unsupported Unicode code range"; 1894 startinpos = s-starts; 1895 endinpos = startinpos+n; 1896 goto utf8Error; 1897 } 1898 s += n; 1899 continue; 1900 1901 utf8Error: 1902 outpos = p-PyUnicode_AS_UNICODE(unicode); 1903 if (unicode_decode_call_errorhandler( 1904 errors, &errorHandler, 1905 "utf8", errmsg, 1906 &starts, &e, &startinpos, &endinpos, &exc, &s, 1907 (PyObject **)&unicode, &outpos, &p)) 1908 goto onError; 1909 } 1910 if (consumed) 1911 *consumed = s-starts; 1912 1913 /* Adjust length */ 1914 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1915 goto onError; 1916 1917 Py_XDECREF(errorHandler); 1918 Py_XDECREF(exc); 1919 return (PyObject *)unicode; 1920 1921onError: 1922 Py_XDECREF(errorHandler); 1923 Py_XDECREF(exc); 1924 Py_DECREF(unicode); 1925 return NULL; 1926} 1927 1928/* Allocation strategy: if the string is short, convert into a stack buffer 1929 and allocate exactly as much space needed at the end. Else allocate the 1930 maximum possible needed (4 result bytes per Unicode character), and return 1931 the excess memory at the end. 1932*/ 1933PyObject * 1934PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1935 Py_ssize_t size, 1936 const char *errors) 1937{ 1938#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1939 1940 Py_ssize_t i; /* index into s of next input byte */ 1941 PyObject *v; /* result string object */ 1942 char *p; /* next free byte in output buffer */ 1943 Py_ssize_t nallocated; /* number of result bytes allocated */ 1944 Py_ssize_t nneeded; /* number of result bytes needed */ 1945 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1946 1947 assert(s != NULL); 1948 assert(size >= 0); 1949 1950 if (size <= MAX_SHORT_UNICHARS) { 1951 /* Write into the stack buffer; nallocated can't overflow. 1952 * At the end, we'll allocate exactly as much heap space as it 1953 * turns out we need. 1954 */ 1955 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1956 v = NULL; /* will allocate after we're done */ 1957 p = stackbuf; 1958 } 1959 else { 1960 /* Overallocate on the heap, and give the excess back at the end. */ 1961 nallocated = size * 4; 1962 if (nallocated / 4 != size) /* overflow! */ 1963 return PyErr_NoMemory(); 1964 v = PyBytes_FromStringAndSize(NULL, nallocated); 1965 if (v == NULL) 1966 return NULL; 1967 p = PyBytes_AS_STRING(v); 1968 } 1969 1970 for (i = 0; i < size;) { 1971 Py_UCS4 ch = s[i++]; 1972 1973 if (ch < 0x80) 1974 /* Encode ASCII */ 1975 *p++ = (char) ch; 1976 1977 else if (ch < 0x0800) { 1978 /* Encode Latin-1 */ 1979 *p++ = (char)(0xc0 | (ch >> 6)); 1980 *p++ = (char)(0x80 | (ch & 0x3f)); 1981 } 1982 else { 1983 /* Encode UCS2 Unicode ordinals */ 1984 if (ch < 0x10000) { 1985 /* Special case: check for high surrogate */ 1986 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1987 Py_UCS4 ch2 = s[i]; 1988 /* Check for low surrogate and combine the two to 1989 form a UCS4 value */ 1990 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1991 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1992 i++; 1993 goto encodeUCS4; 1994 } 1995 /* Fall through: handles isolated high surrogates */ 1996 } 1997 *p++ = (char)(0xe0 | (ch >> 12)); 1998 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1999 *p++ = (char)(0x80 | (ch & 0x3f)); 2000 continue; 2001 } 2002encodeUCS4: 2003 /* Encode UCS4 Unicode ordinals */ 2004 *p++ = (char)(0xf0 | (ch >> 18)); 2005 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2006 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2007 *p++ = (char)(0x80 | (ch & 0x3f)); 2008 } 2009 } 2010 2011 if (v == NULL) { 2012 /* This was stack allocated. */ 2013 nneeded = p - stackbuf; 2014 assert(nneeded <= nallocated); 2015 v = PyBytes_FromStringAndSize(stackbuf, nneeded); 2016 } 2017 else { 2018 /* Cut back to size actually needed. */ 2019 nneeded = p - PyBytes_AS_STRING(v); 2020 assert(nneeded <= nallocated); 2021 PyBytes_Resize(v, nneeded); 2022 } 2023 return v; 2024 2025#undef MAX_SHORT_UNICHARS 2026} 2027 2028PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2029{ 2030 if (!PyUnicode_Check(unicode)) { 2031 PyErr_BadArgument(); 2032 return NULL; 2033 } 2034 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2035 PyUnicode_GET_SIZE(unicode), 2036 NULL); 2037} 2038 2039/* --- UTF-32 Codec ------------------------------------------------------- */ 2040 2041PyObject * 2042PyUnicode_DecodeUTF32(const char *s, 2043 Py_ssize_t size, 2044 const char *errors, 2045 int *byteorder) 2046{ 2047 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2048} 2049 2050PyObject * 2051PyUnicode_DecodeUTF32Stateful(const char *s, 2052 Py_ssize_t size, 2053 const char *errors, 2054 int *byteorder, 2055 Py_ssize_t *consumed) 2056{ 2057 const char *starts = s; 2058 Py_ssize_t startinpos; 2059 Py_ssize_t endinpos; 2060 Py_ssize_t outpos; 2061 PyUnicodeObject *unicode; 2062 Py_UNICODE *p; 2063#ifndef Py_UNICODE_WIDE 2064 int i, pairs; 2065#else 2066 const int pairs = 0; 2067#endif 2068 const unsigned char *q, *e; 2069 int bo = 0; /* assume native ordering by default */ 2070 const char *errmsg = ""; 2071 /* Offsets from q for retrieving bytes in the right order. */ 2072#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2073 int iorder[] = {0, 1, 2, 3}; 2074#else 2075 int iorder[] = {3, 2, 1, 0}; 2076#endif 2077 PyObject *errorHandler = NULL; 2078 PyObject *exc = NULL; 2079 /* On narrow builds we split characters outside the BMP into two 2080 codepoints => count how much extra space we need. */ 2081#ifndef Py_UNICODE_WIDE 2082 for (i = pairs = 0; i < size/4; i++) 2083 if (((Py_UCS4 *)s)[i] >= 0x10000) 2084 pairs++; 2085#endif 2086 2087 /* This might be one to much, because of a BOM */ 2088 unicode = _PyUnicode_New((size+3)/4+pairs); 2089 if (!unicode) 2090 return NULL; 2091 if (size == 0) 2092 return (PyObject *)unicode; 2093 2094 /* Unpack UTF-32 encoded data */ 2095 p = unicode->str; 2096 q = (unsigned char *)s; 2097 e = q + size; 2098 2099 if (byteorder) 2100 bo = *byteorder; 2101 2102 /* Check for BOM marks (U+FEFF) in the input and adjust current 2103 byte order setting accordingly. In native mode, the leading BOM 2104 mark is skipped, in all other modes, it is copied to the output 2105 stream as-is (giving a ZWNBSP character). */ 2106 if (bo == 0) { 2107 if (size >= 4) { 2108 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2109 (q[iorder[1]] << 8) | q[iorder[0]]; 2110#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2111 if (bom == 0x0000FEFF) { 2112 q += 4; 2113 bo = -1; 2114 } 2115 else if (bom == 0xFFFE0000) { 2116 q += 4; 2117 bo = 1; 2118 } 2119#else 2120 if (bom == 0x0000FEFF) { 2121 q += 4; 2122 bo = 1; 2123 } 2124 else if (bom == 0xFFFE0000) { 2125 q += 4; 2126 bo = -1; 2127 } 2128#endif 2129 } 2130 } 2131 2132 if (bo == -1) { 2133 /* force LE */ 2134 iorder[0] = 0; 2135 iorder[1] = 1; 2136 iorder[2] = 2; 2137 iorder[3] = 3; 2138 } 2139 else if (bo == 1) { 2140 /* force BE */ 2141 iorder[0] = 3; 2142 iorder[1] = 2; 2143 iorder[2] = 1; 2144 iorder[3] = 0; 2145 } 2146 2147 while (q < e) { 2148 Py_UCS4 ch; 2149 /* remaining bytes at the end? (size should be divisible by 4) */ 2150 if (e-q<4) { 2151 if (consumed) 2152 break; 2153 errmsg = "truncated data"; 2154 startinpos = ((const char *)q)-starts; 2155 endinpos = ((const char *)e)-starts; 2156 goto utf32Error; 2157 /* The remaining input chars are ignored if the callback 2158 chooses to skip the input */ 2159 } 2160 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2161 (q[iorder[1]] << 8) | q[iorder[0]]; 2162 2163 if (ch >= 0x110000) 2164 { 2165 errmsg = "codepoint not in range(0x110000)"; 2166 startinpos = ((const char *)q)-starts; 2167 endinpos = startinpos+4; 2168 goto utf32Error; 2169 } 2170#ifndef Py_UNICODE_WIDE 2171 if (ch >= 0x10000) 2172 { 2173 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2174 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2175 } 2176 else 2177#endif 2178 *p++ = ch; 2179 q += 4; 2180 continue; 2181 utf32Error: 2182 outpos = p-PyUnicode_AS_UNICODE(unicode); 2183 if (unicode_decode_call_errorhandler( 2184 errors, &errorHandler, 2185 "utf32", errmsg, 2186 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2187 (PyObject **)&unicode, &outpos, &p)) 2188 goto onError; 2189 } 2190 2191 if (byteorder) 2192 *byteorder = bo; 2193 2194 if (consumed) 2195 *consumed = (const char *)q-starts; 2196 2197 /* Adjust length */ 2198 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2199 goto onError; 2200 2201 Py_XDECREF(errorHandler); 2202 Py_XDECREF(exc); 2203 return (PyObject *)unicode; 2204 2205onError: 2206 Py_DECREF(unicode); 2207 Py_XDECREF(errorHandler); 2208 Py_XDECREF(exc); 2209 return NULL; 2210} 2211 2212PyObject * 2213PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2214 Py_ssize_t size, 2215 const char *errors, 2216 int byteorder) 2217{ 2218 PyObject *v; 2219 unsigned char *p; 2220#ifndef Py_UNICODE_WIDE 2221 int i, pairs; 2222#else 2223 const int pairs = 0; 2224#endif 2225 /* Offsets from p for storing byte pairs in the right order. */ 2226#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2227 int iorder[] = {0, 1, 2, 3}; 2228#else 2229 int iorder[] = {3, 2, 1, 0}; 2230#endif 2231 2232#define STORECHAR(CH) \ 2233 do { \ 2234 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2235 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2236 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2237 p[iorder[0]] = (CH) & 0xff; \ 2238 p += 4; \ 2239 } while(0) 2240 2241 /* In narrow builds we can output surrogate pairs as one codepoint, 2242 so we need less space. */ 2243#ifndef Py_UNICODE_WIDE 2244 for (i = pairs = 0; i < size-1; i++) 2245 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2246 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2247 pairs++; 2248#endif 2249 v = PyBytes_FromStringAndSize(NULL, 2250 4 * (size - pairs + (byteorder == 0))); 2251 if (v == NULL) 2252 return NULL; 2253 2254 p = (unsigned char *)PyBytes_AS_STRING(v); 2255 if (byteorder == 0) 2256 STORECHAR(0xFEFF); 2257 if (size == 0) 2258 return v; 2259 2260 if (byteorder == -1) { 2261 /* force LE */ 2262 iorder[0] = 0; 2263 iorder[1] = 1; 2264 iorder[2] = 2; 2265 iorder[3] = 3; 2266 } 2267 else if (byteorder == 1) { 2268 /* force BE */ 2269 iorder[0] = 3; 2270 iorder[1] = 2; 2271 iorder[2] = 1; 2272 iorder[3] = 0; 2273 } 2274 2275 while (size-- > 0) { 2276 Py_UCS4 ch = *s++; 2277#ifndef Py_UNICODE_WIDE 2278 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2279 Py_UCS4 ch2 = *s; 2280 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2281 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2282 s++; 2283 size--; 2284 } 2285 } 2286#endif 2287 STORECHAR(ch); 2288 } 2289 return v; 2290#undef STORECHAR 2291} 2292 2293PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2294{ 2295 if (!PyUnicode_Check(unicode)) { 2296 PyErr_BadArgument(); 2297 return NULL; 2298 } 2299 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2300 PyUnicode_GET_SIZE(unicode), 2301 NULL, 2302 0); 2303} 2304 2305/* --- UTF-16 Codec ------------------------------------------------------- */ 2306 2307PyObject * 2308PyUnicode_DecodeUTF16(const char *s, 2309 Py_ssize_t size, 2310 const char *errors, 2311 int *byteorder) 2312{ 2313 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2314} 2315 2316PyObject * 2317PyUnicode_DecodeUTF16Stateful(const char *s, 2318 Py_ssize_t size, 2319 const char *errors, 2320 int *byteorder, 2321 Py_ssize_t *consumed) 2322{ 2323 const char *starts = s; 2324 Py_ssize_t startinpos; 2325 Py_ssize_t endinpos; 2326 Py_ssize_t outpos; 2327 PyUnicodeObject *unicode; 2328 Py_UNICODE *p; 2329 const unsigned char *q, *e; 2330 int bo = 0; /* assume native ordering by default */ 2331 const char *errmsg = ""; 2332 /* Offsets from q for retrieving byte pairs in the right order. */ 2333#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2334 int ihi = 1, ilo = 0; 2335#else 2336 int ihi = 0, ilo = 1; 2337#endif 2338 PyObject *errorHandler = NULL; 2339 PyObject *exc = NULL; 2340 2341 /* Note: size will always be longer than the resulting Unicode 2342 character count */ 2343 unicode = _PyUnicode_New(size); 2344 if (!unicode) 2345 return NULL; 2346 if (size == 0) 2347 return (PyObject *)unicode; 2348 2349 /* Unpack UTF-16 encoded data */ 2350 p = unicode->str; 2351 q = (unsigned char *)s; 2352 e = q + size; 2353 2354 if (byteorder) 2355 bo = *byteorder; 2356 2357 /* Check for BOM marks (U+FEFF) in the input and adjust current 2358 byte order setting accordingly. In native mode, the leading BOM 2359 mark is skipped, in all other modes, it is copied to the output 2360 stream as-is (giving a ZWNBSP character). */ 2361 if (bo == 0) { 2362 if (size >= 2) { 2363 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2364#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2365 if (bom == 0xFEFF) { 2366 q += 2; 2367 bo = -1; 2368 } 2369 else if (bom == 0xFFFE) { 2370 q += 2; 2371 bo = 1; 2372 } 2373#else 2374 if (bom == 0xFEFF) { 2375 q += 2; 2376 bo = 1; 2377 } 2378 else if (bom == 0xFFFE) { 2379 q += 2; 2380 bo = -1; 2381 } 2382#endif 2383 } 2384 } 2385 2386 if (bo == -1) { 2387 /* force LE */ 2388 ihi = 1; 2389 ilo = 0; 2390 } 2391 else if (bo == 1) { 2392 /* force BE */ 2393 ihi = 0; 2394 ilo = 1; 2395 } 2396 2397 while (q < e) { 2398 Py_UNICODE ch; 2399 /* remaining bytes at the end? (size should be even) */ 2400 if (e-q<2) { 2401 if (consumed) 2402 break; 2403 errmsg = "truncated data"; 2404 startinpos = ((const char *)q)-starts; 2405 endinpos = ((const char *)e)-starts; 2406 goto utf16Error; 2407 /* The remaining input chars are ignored if the callback 2408 chooses to skip the input */ 2409 } 2410 ch = (q[ihi] << 8) | q[ilo]; 2411 2412 q += 2; 2413 2414 if (ch < 0xD800 || ch > 0xDFFF) { 2415 *p++ = ch; 2416 continue; 2417 } 2418 2419 /* UTF-16 code pair: */ 2420 if (q >= e) { 2421 errmsg = "unexpected end of data"; 2422 startinpos = (((const char *)q)-2)-starts; 2423 endinpos = ((const char *)e)-starts; 2424 goto utf16Error; 2425 } 2426 if (0xD800 <= ch && ch <= 0xDBFF) { 2427 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2428 q += 2; 2429 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2430#ifndef Py_UNICODE_WIDE 2431 *p++ = ch; 2432 *p++ = ch2; 2433#else 2434 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2435#endif 2436 continue; 2437 } 2438 else { 2439 errmsg = "illegal UTF-16 surrogate"; 2440 startinpos = (((const char *)q)-4)-starts; 2441 endinpos = startinpos+2; 2442 goto utf16Error; 2443 } 2444 2445 } 2446 errmsg = "illegal encoding"; 2447 startinpos = (((const char *)q)-2)-starts; 2448 endinpos = startinpos+2; 2449 /* Fall through to report the error */ 2450 2451 utf16Error: 2452 outpos = p-PyUnicode_AS_UNICODE(unicode); 2453 if (unicode_decode_call_errorhandler( 2454 errors, &errorHandler, 2455 "utf16", errmsg, 2456 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2457 (PyObject **)&unicode, &outpos, &p)) 2458 goto onError; 2459 } 2460 2461 if (byteorder) 2462 *byteorder = bo; 2463 2464 if (consumed) 2465 *consumed = (const char *)q-starts; 2466 2467 /* Adjust length */ 2468 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2469 goto onError; 2470 2471 Py_XDECREF(errorHandler); 2472 Py_XDECREF(exc); 2473 return (PyObject *)unicode; 2474 2475onError: 2476 Py_DECREF(unicode); 2477 Py_XDECREF(errorHandler); 2478 Py_XDECREF(exc); 2479 return NULL; 2480} 2481 2482PyObject * 2483PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2484 Py_ssize_t size, 2485 const char *errors, 2486 int byteorder) 2487{ 2488 PyObject *v; 2489 unsigned char *p; 2490#ifdef Py_UNICODE_WIDE 2491 int i, pairs; 2492#else 2493 const int pairs = 0; 2494#endif 2495 /* Offsets from p for storing byte pairs in the right order. */ 2496#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2497 int ihi = 1, ilo = 0; 2498#else 2499 int ihi = 0, ilo = 1; 2500#endif 2501 2502#define STORECHAR(CH) \ 2503 do { \ 2504 p[ihi] = ((CH) >> 8) & 0xff; \ 2505 p[ilo] = (CH) & 0xff; \ 2506 p += 2; \ 2507 } while(0) 2508 2509#ifdef Py_UNICODE_WIDE 2510 for (i = pairs = 0; i < size; i++) 2511 if (s[i] >= 0x10000) 2512 pairs++; 2513#endif 2514 v = PyBytes_FromStringAndSize(NULL, 2515 2 * (size + pairs + (byteorder == 0))); 2516 if (v == NULL) 2517 return NULL; 2518 2519 p = (unsigned char *)PyBytes_AS_STRING(v); 2520 if (byteorder == 0) 2521 STORECHAR(0xFEFF); 2522 if (size == 0) 2523 return v; 2524 2525 if (byteorder == -1) { 2526 /* force LE */ 2527 ihi = 1; 2528 ilo = 0; 2529 } 2530 else if (byteorder == 1) { 2531 /* force BE */ 2532 ihi = 0; 2533 ilo = 1; 2534 } 2535 2536 while (size-- > 0) { 2537 Py_UNICODE ch = *s++; 2538 Py_UNICODE ch2 = 0; 2539#ifdef Py_UNICODE_WIDE 2540 if (ch >= 0x10000) { 2541 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2542 ch = 0xD800 | ((ch-0x10000) >> 10); 2543 } 2544#endif 2545 STORECHAR(ch); 2546 if (ch2) 2547 STORECHAR(ch2); 2548 } 2549 return v; 2550#undef STORECHAR 2551} 2552 2553PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2554{ 2555 if (!PyUnicode_Check(unicode)) { 2556 PyErr_BadArgument(); 2557 return NULL; 2558 } 2559 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2560 PyUnicode_GET_SIZE(unicode), 2561 NULL, 2562 0); 2563} 2564 2565/* --- Unicode Escape Codec ----------------------------------------------- */ 2566 2567static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2568 2569PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2570 Py_ssize_t size, 2571 const char *errors) 2572{ 2573 const char *starts = s; 2574 Py_ssize_t startinpos; 2575 Py_ssize_t endinpos; 2576 Py_ssize_t outpos; 2577 int i; 2578 PyUnicodeObject *v; 2579 Py_UNICODE *p; 2580 const char *end; 2581 char* message; 2582 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2583 PyObject *errorHandler = NULL; 2584 PyObject *exc = NULL; 2585 2586 /* Escaped strings will always be longer than the resulting 2587 Unicode string, so we start with size here and then reduce the 2588 length after conversion to the true value. 2589 (but if the error callback returns a long replacement string 2590 we'll have to allocate more space) */ 2591 v = _PyUnicode_New(size); 2592 if (v == NULL) 2593 goto onError; 2594 if (size == 0) 2595 return (PyObject *)v; 2596 2597 p = PyUnicode_AS_UNICODE(v); 2598 end = s + size; 2599 2600 while (s < end) { 2601 unsigned char c; 2602 Py_UNICODE x; 2603 int digits; 2604 2605 /* Non-escape characters are interpreted as Unicode ordinals */ 2606 if (*s != '\\') { 2607 *p++ = (unsigned char) *s++; 2608 continue; 2609 } 2610 2611 startinpos = s-starts; 2612 /* \ - Escapes */ 2613 s++; 2614 switch (*s++) { 2615 2616 /* \x escapes */ 2617 case '\n': break; 2618 case '\\': *p++ = '\\'; break; 2619 case '\'': *p++ = '\''; break; 2620 case '\"': *p++ = '\"'; break; 2621 case 'b': *p++ = '\b'; break; 2622 case 'f': *p++ = '\014'; break; /* FF */ 2623 case 't': *p++ = '\t'; break; 2624 case 'n': *p++ = '\n'; break; 2625 case 'r': *p++ = '\r'; break; 2626 case 'v': *p++ = '\013'; break; /* VT */ 2627 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2628 2629 /* \OOO (octal) escapes */ 2630 case '0': case '1': case '2': case '3': 2631 case '4': case '5': case '6': case '7': 2632 x = s[-1] - '0'; 2633 if ('0' <= *s && *s <= '7') { 2634 x = (x<<3) + *s++ - '0'; 2635 if ('0' <= *s && *s <= '7') 2636 x = (x<<3) + *s++ - '0'; 2637 } 2638 *p++ = x; 2639 break; 2640 2641 /* hex escapes */ 2642 /* \xXX */ 2643 case 'x': 2644 digits = 2; 2645 message = "truncated \\xXX escape"; 2646 goto hexescape; 2647 2648 /* \uXXXX */ 2649 case 'u': 2650 digits = 4; 2651 message = "truncated \\uXXXX escape"; 2652 goto hexescape; 2653 2654 /* \UXXXXXXXX */ 2655 case 'U': 2656 digits = 8; 2657 message = "truncated \\UXXXXXXXX escape"; 2658 hexescape: 2659 chr = 0; 2660 outpos = p-PyUnicode_AS_UNICODE(v); 2661 if (s+digits>end) { 2662 endinpos = size; 2663 if (unicode_decode_call_errorhandler( 2664 errors, &errorHandler, 2665 "unicodeescape", "end of string in escape sequence", 2666 &starts, &end, &startinpos, &endinpos, &exc, &s, 2667 (PyObject **)&v, &outpos, &p)) 2668 goto onError; 2669 goto nextByte; 2670 } 2671 for (i = 0; i < digits; ++i) { 2672 c = (unsigned char) s[i]; 2673 if (!isxdigit(c)) { 2674 endinpos = (s+i+1)-starts; 2675 if (unicode_decode_call_errorhandler( 2676 errors, &errorHandler, 2677 "unicodeescape", message, 2678 &starts, &end, &startinpos, &endinpos, &exc, &s, 2679 (PyObject **)&v, &outpos, &p)) 2680 goto onError; 2681 goto nextByte; 2682 } 2683 chr = (chr<<4) & ~0xF; 2684 if (c >= '0' && c <= '9') 2685 chr += c - '0'; 2686 else if (c >= 'a' && c <= 'f') 2687 chr += 10 + c - 'a'; 2688 else 2689 chr += 10 + c - 'A'; 2690 } 2691 s += i; 2692 if (chr == 0xffffffff && PyErr_Occurred()) 2693 /* _decoding_error will have already written into the 2694 target buffer. */ 2695 break; 2696 store: 2697 /* when we get here, chr is a 32-bit unicode character */ 2698 if (chr <= 0xffff) 2699 /* UCS-2 character */ 2700 *p++ = (Py_UNICODE) chr; 2701 else if (chr <= 0x10ffff) { 2702 /* UCS-4 character. Either store directly, or as 2703 surrogate pair. */ 2704#ifdef Py_UNICODE_WIDE 2705 *p++ = chr; 2706#else 2707 chr -= 0x10000L; 2708 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2709 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2710#endif 2711 } else { 2712 endinpos = s-starts; 2713 outpos = p-PyUnicode_AS_UNICODE(v); 2714 if (unicode_decode_call_errorhandler( 2715 errors, &errorHandler, 2716 "unicodeescape", "illegal Unicode character", 2717 &starts, &end, &startinpos, &endinpos, &exc, &s, 2718 (PyObject **)&v, &outpos, &p)) 2719 goto onError; 2720 } 2721 break; 2722 2723 /* \N{name} */ 2724 case 'N': 2725 message = "malformed \\N character escape"; 2726 if (ucnhash_CAPI == NULL) { 2727 /* load the unicode data module */ 2728 PyObject *m, *api; 2729 m = PyImport_ImportModule("unicodedata"); 2730 if (m == NULL) 2731 goto ucnhashError; 2732 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2733 Py_DECREF(m); 2734 if (api == NULL) 2735 goto ucnhashError; 2736 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2737 Py_DECREF(api); 2738 if (ucnhash_CAPI == NULL) 2739 goto ucnhashError; 2740 } 2741 if (*s == '{') { 2742 const char *start = s+1; 2743 /* look for the closing brace */ 2744 while (*s != '}' && s < end) 2745 s++; 2746 if (s > start && s < end && *s == '}') { 2747 /* found a name. look it up in the unicode database */ 2748 message = "unknown Unicode character name"; 2749 s++; 2750 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2751 goto store; 2752 } 2753 } 2754 endinpos = s-starts; 2755 outpos = p-PyUnicode_AS_UNICODE(v); 2756 if (unicode_decode_call_errorhandler( 2757 errors, &errorHandler, 2758 "unicodeescape", message, 2759 &starts, &end, &startinpos, &endinpos, &exc, &s, 2760 (PyObject **)&v, &outpos, &p)) 2761 goto onError; 2762 break; 2763 2764 default: 2765 if (s > end) { 2766 message = "\\ at end of string"; 2767 s--; 2768 endinpos = s-starts; 2769 outpos = p-PyUnicode_AS_UNICODE(v); 2770 if (unicode_decode_call_errorhandler( 2771 errors, &errorHandler, 2772 "unicodeescape", message, 2773 &starts, &end, &startinpos, &endinpos, &exc, &s, 2774 (PyObject **)&v, &outpos, &p)) 2775 goto onError; 2776 } 2777 else { 2778 *p++ = '\\'; 2779 *p++ = (unsigned char)s[-1]; 2780 } 2781 break; 2782 } 2783 nextByte: 2784 ; 2785 } 2786 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2787 goto onError; 2788 Py_XDECREF(errorHandler); 2789 Py_XDECREF(exc); 2790 return (PyObject *)v; 2791 2792ucnhashError: 2793 PyErr_SetString( 2794 PyExc_UnicodeError, 2795 "\\N escapes not supported (can't load unicodedata module)" 2796 ); 2797 Py_XDECREF(v); 2798 Py_XDECREF(errorHandler); 2799 Py_XDECREF(exc); 2800 return NULL; 2801 2802onError: 2803 Py_XDECREF(v); 2804 Py_XDECREF(errorHandler); 2805 Py_XDECREF(exc); 2806 return NULL; 2807} 2808 2809/* Return a Unicode-Escape string version of the Unicode object. 2810 2811 If quotes is true, the string is enclosed in u"" or u'' quotes as 2812 appropriate. 2813 2814*/ 2815 2816Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2817 Py_ssize_t size, 2818 Py_UNICODE ch) 2819{ 2820 /* like wcschr, but doesn't stop at NULL characters */ 2821 2822 while (size-- > 0) { 2823 if (*s == ch) 2824 return s; 2825 s++; 2826 } 2827 2828 return NULL; 2829} 2830 2831static const char *hexdigits = "0123456789abcdef"; 2832 2833PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2834 Py_ssize_t size) 2835{ 2836 PyObject *repr; 2837 char *p; 2838 2839 /* XXX(nnorwitz): rather than over-allocating, it would be 2840 better to choose a different scheme. Perhaps scan the 2841 first N-chars of the string and allocate based on that size. 2842 */ 2843 /* Initial allocation is based on the longest-possible unichr 2844 escape. 2845 2846 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 2847 unichr, so in this case it's the longest unichr escape. In 2848 narrow (UTF-16) builds this is five chars per source unichr 2849 since there are two unichrs in the surrogate pair, so in narrow 2850 (UTF-16) builds it's not the longest unichr escape. 2851 2852 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 2853 so in the narrow (UTF-16) build case it's the longest unichr 2854 escape. 2855 */ 2856 2857 repr = PyBytes_FromStringAndSize(NULL, 2858#ifdef Py_UNICODE_WIDE 2859 + 10*size 2860#else 2861 + 6*size 2862#endif 2863 + 1); 2864 if (repr == NULL) 2865 return NULL; 2866 2867 p = PyBytes_AS_STRING(repr); 2868 2869 while (size-- > 0) { 2870 Py_UNICODE ch = *s++; 2871 2872 /* Escape backslashes */ 2873 if (ch == '\\') { 2874 *p++ = '\\'; 2875 *p++ = (char) ch; 2876 continue; 2877 } 2878 2879#ifdef Py_UNICODE_WIDE 2880 /* Map 21-bit characters to '\U00xxxxxx' */ 2881 else if (ch >= 0x10000) { 2882 *p++ = '\\'; 2883 *p++ = 'U'; 2884 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 2885 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 2886 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 2887 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 2888 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 2889 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 2890 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 2891 *p++ = hexdigits[ch & 0x0000000F]; 2892 continue; 2893 } 2894#else 2895 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 2896 else if (ch >= 0xD800 && ch < 0xDC00) { 2897 Py_UNICODE ch2; 2898 Py_UCS4 ucs; 2899 2900 ch2 = *s++; 2901 size--; 2902 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 2903 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 2904 *p++ = '\\'; 2905 *p++ = 'U'; 2906 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 2907 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 2908 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 2909 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 2910 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 2911 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 2912 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 2913 *p++ = hexdigits[ucs & 0x0000000F]; 2914 continue; 2915 } 2916 /* Fall through: isolated surrogates are copied as-is */ 2917 s--; 2918 size++; 2919 } 2920#endif 2921 2922 /* Map 16-bit characters to '\uxxxx' */ 2923 if (ch >= 256) { 2924 *p++ = '\\'; 2925 *p++ = 'u'; 2926 *p++ = hexdigits[(ch >> 12) & 0x000F]; 2927 *p++ = hexdigits[(ch >> 8) & 0x000F]; 2928 *p++ = hexdigits[(ch >> 4) & 0x000F]; 2929 *p++ = hexdigits[ch & 0x000F]; 2930 } 2931 2932 /* Map special whitespace to '\t', \n', '\r' */ 2933 else if (ch == '\t') { 2934 *p++ = '\\'; 2935 *p++ = 't'; 2936 } 2937 else if (ch == '\n') { 2938 *p++ = '\\'; 2939 *p++ = 'n'; 2940 } 2941 else if (ch == '\r') { 2942 *p++ = '\\'; 2943 *p++ = 'r'; 2944 } 2945 2946 /* Map non-printable US ASCII to '\xhh' */ 2947 else if (ch < ' ' || ch >= 0x7F) { 2948 *p++ = '\\'; 2949 *p++ = 'x'; 2950 *p++ = hexdigits[(ch >> 4) & 0x000F]; 2951 *p++ = hexdigits[ch & 0x000F]; 2952 } 2953 2954 /* Copy everything else as-is */ 2955 else 2956 *p++ = (char) ch; 2957 } 2958 2959 *p = '\0'; 2960 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) { 2961 Py_DECREF(repr); 2962 return NULL; 2963 } 2964 return repr; 2965} 2966 2967PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 2968{ 2969 PyObject *s, *result; 2970 if (!PyUnicode_Check(unicode)) { 2971 PyErr_BadArgument(); 2972 return NULL; 2973 } 2974 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2975 PyUnicode_GET_SIZE(unicode)); 2976 2977 if (!s) 2978 return NULL; 2979 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s), 2980 PyBytes_GET_SIZE(s)); 2981 Py_DECREF(s); 2982 return result; 2983} 2984 2985/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2986 2987PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2988 Py_ssize_t size, 2989 const char *errors) 2990{ 2991 const char *starts = s; 2992 Py_ssize_t startinpos; 2993 Py_ssize_t endinpos; 2994 Py_ssize_t outpos; 2995 PyUnicodeObject *v; 2996 Py_UNICODE *p; 2997 const char *end; 2998 const char *bs; 2999 PyObject *errorHandler = NULL; 3000 PyObject *exc = NULL; 3001 3002 /* Escaped strings will always be longer than the resulting 3003 Unicode string, so we start with size here and then reduce the 3004 length after conversion to the true value. (But decoding error 3005 handler might have to resize the string) */ 3006 v = _PyUnicode_New(size); 3007 if (v == NULL) 3008 goto onError; 3009 if (size == 0) 3010 return (PyObject *)v; 3011 p = PyUnicode_AS_UNICODE(v); 3012 end = s + size; 3013 while (s < end) { 3014 unsigned char c; 3015 Py_UCS4 x; 3016 int i; 3017 int count; 3018 3019 /* Non-escape characters are interpreted as Unicode ordinals */ 3020 if (*s != '\\') { 3021 *p++ = (unsigned char)*s++; 3022 continue; 3023 } 3024 startinpos = s-starts; 3025 3026 /* \u-escapes are only interpreted iff the number of leading 3027 backslashes if odd */ 3028 bs = s; 3029 for (;s < end;) { 3030 if (*s != '\\') 3031 break; 3032 *p++ = (unsigned char)*s++; 3033 } 3034 if (((s - bs) & 1) == 0 || 3035 s >= end || 3036 (*s != 'u' && *s != 'U')) { 3037 continue; 3038 } 3039 p--; 3040 count = *s=='u' ? 4 : 8; 3041 s++; 3042 3043 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3044 outpos = p-PyUnicode_AS_UNICODE(v); 3045 for (x = 0, i = 0; i < count; ++i, ++s) { 3046 c = (unsigned char)*s; 3047 if (!isxdigit(c)) { 3048 endinpos = s-starts; 3049 if (unicode_decode_call_errorhandler( 3050 errors, &errorHandler, 3051 "rawunicodeescape", "truncated \\uXXXX", 3052 &starts, &end, &startinpos, &endinpos, &exc, &s, 3053 (PyObject **)&v, &outpos, &p)) 3054 goto onError; 3055 goto nextByte; 3056 } 3057 x = (x<<4) & ~0xF; 3058 if (c >= '0' && c <= '9') 3059 x += c - '0'; 3060 else if (c >= 'a' && c <= 'f') 3061 x += 10 + c - 'a'; 3062 else 3063 x += 10 + c - 'A'; 3064 } 3065#ifndef Py_UNICODE_WIDE 3066 if (x > 0x10000) { 3067 if (unicode_decode_call_errorhandler( 3068 errors, &errorHandler, 3069 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3070 &starts, &end, &startinpos, &endinpos, &exc, &s, 3071 (PyObject **)&v, &outpos, &p)) 3072 goto onError; 3073 } 3074#endif 3075 *p++ = x; 3076 nextByte: 3077 ; 3078 } 3079 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3080 goto onError; 3081 Py_XDECREF(errorHandler); 3082 Py_XDECREF(exc); 3083 return (PyObject *)v; 3084 3085 onError: 3086 Py_XDECREF(v); 3087 Py_XDECREF(errorHandler); 3088 Py_XDECREF(exc); 3089 return NULL; 3090} 3091 3092PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3093 Py_ssize_t size) 3094{ 3095 PyObject *repr; 3096 char *p; 3097 char *q; 3098 3099#ifdef Py_UNICODE_WIDE 3100 repr = PyBytes_FromStringAndSize(NULL, 10 * size); 3101#else 3102 repr = PyBytes_FromStringAndSize(NULL, 6 * size); 3103#endif 3104 if (repr == NULL) 3105 return NULL; 3106 if (size == 0) 3107 return repr; 3108 3109 p = q = PyBytes_AS_STRING(repr); 3110 while (size-- > 0) { 3111 Py_UNICODE ch = *s++; 3112#ifdef Py_UNICODE_WIDE 3113 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3114 if (ch >= 0x10000) { 3115 *p++ = '\\'; 3116 *p++ = 'U'; 3117 *p++ = hexdigits[(ch >> 28) & 0xf]; 3118 *p++ = hexdigits[(ch >> 24) & 0xf]; 3119 *p++ = hexdigits[(ch >> 20) & 0xf]; 3120 *p++ = hexdigits[(ch >> 16) & 0xf]; 3121 *p++ = hexdigits[(ch >> 12) & 0xf]; 3122 *p++ = hexdigits[(ch >> 8) & 0xf]; 3123 *p++ = hexdigits[(ch >> 4) & 0xf]; 3124 *p++ = hexdigits[ch & 15]; 3125 } 3126 else 3127#endif 3128 /* Map 16-bit characters to '\uxxxx' */ 3129 if (ch >= 256) { 3130 *p++ = '\\'; 3131 *p++ = 'u'; 3132 *p++ = hexdigits[(ch >> 12) & 0xf]; 3133 *p++ = hexdigits[(ch >> 8) & 0xf]; 3134 *p++ = hexdigits[(ch >> 4) & 0xf]; 3135 *p++ = hexdigits[ch & 15]; 3136 } 3137 /* Copy everything else as-is */ 3138 else 3139 *p++ = (char) ch; 3140 } 3141 *p = '\0'; 3142 if (PyBytes_Resize(repr, p - q)) { 3143 Py_DECREF(repr); 3144 return NULL; 3145 } 3146 return repr; 3147} 3148 3149PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3150{ 3151 PyObject *s, *result; 3152 if (!PyUnicode_Check(unicode)) { 3153 PyErr_BadArgument(); 3154 return NULL; 3155 } 3156 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3157 PyUnicode_GET_SIZE(unicode)); 3158 3159 if (!s) 3160 return NULL; 3161 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s), 3162 PyBytes_GET_SIZE(s)); 3163 Py_DECREF(s); 3164 return result; 3165} 3166 3167/* --- Unicode Internal Codec ------------------------------------------- */ 3168 3169PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3170 Py_ssize_t size, 3171 const char *errors) 3172{ 3173 const char *starts = s; 3174 Py_ssize_t startinpos; 3175 Py_ssize_t endinpos; 3176 Py_ssize_t outpos; 3177 PyUnicodeObject *v; 3178 Py_UNICODE *p; 3179 const char *end; 3180 const char *reason; 3181 PyObject *errorHandler = NULL; 3182 PyObject *exc = NULL; 3183 3184#ifdef Py_UNICODE_WIDE 3185 Py_UNICODE unimax = PyUnicode_GetMax(); 3186#endif 3187 3188 /* XXX overflow detection missing */ 3189 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3190 if (v == NULL) 3191 goto onError; 3192 if (PyUnicode_GetSize((PyObject *)v) == 0) 3193 return (PyObject *)v; 3194 p = PyUnicode_AS_UNICODE(v); 3195 end = s + size; 3196 3197 while (s < end) { 3198 memcpy(p, s, sizeof(Py_UNICODE)); 3199 /* We have to sanity check the raw data, otherwise doom looms for 3200 some malformed UCS-4 data. */ 3201 if ( 3202 #ifdef Py_UNICODE_WIDE 3203 *p > unimax || *p < 0 || 3204 #endif 3205 end-s < Py_UNICODE_SIZE 3206 ) 3207 { 3208 startinpos = s - starts; 3209 if (end-s < Py_UNICODE_SIZE) { 3210 endinpos = end-starts; 3211 reason = "truncated input"; 3212 } 3213 else { 3214 endinpos = s - starts + Py_UNICODE_SIZE; 3215 reason = "illegal code point (> 0x10FFFF)"; 3216 } 3217 outpos = p - PyUnicode_AS_UNICODE(v); 3218 if (unicode_decode_call_errorhandler( 3219 errors, &errorHandler, 3220 "unicode_internal", reason, 3221 &starts, &end, &startinpos, &endinpos, &exc, &s, 3222 (PyObject **)&v, &outpos, &p)) { 3223 goto onError; 3224 } 3225 } 3226 else { 3227 p++; 3228 s += Py_UNICODE_SIZE; 3229 } 3230 } 3231 3232 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3233 goto onError; 3234 Py_XDECREF(errorHandler); 3235 Py_XDECREF(exc); 3236 return (PyObject *)v; 3237 3238 onError: 3239 Py_XDECREF(v); 3240 Py_XDECREF(errorHandler); 3241 Py_XDECREF(exc); 3242 return NULL; 3243} 3244 3245/* --- Latin-1 Codec ------------------------------------------------------ */ 3246 3247PyObject *PyUnicode_DecodeLatin1(const char *s, 3248 Py_ssize_t size, 3249 const char *errors) 3250{ 3251 PyUnicodeObject *v; 3252 Py_UNICODE *p; 3253 3254 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 3255 if (size == 1) { 3256 Py_UNICODE r = *(unsigned char*)s; 3257 return PyUnicode_FromUnicode(&r, 1); 3258 } 3259 3260 v = _PyUnicode_New(size); 3261 if (v == NULL) 3262 goto onError; 3263 if (size == 0) 3264 return (PyObject *)v; 3265 p = PyUnicode_AS_UNICODE(v); 3266 while (size-- > 0) 3267 *p++ = (unsigned char)*s++; 3268 return (PyObject *)v; 3269 3270 onError: 3271 Py_XDECREF(v); 3272 return NULL; 3273} 3274 3275/* create or adjust a UnicodeEncodeError */ 3276static void make_encode_exception(PyObject **exceptionObject, 3277 const char *encoding, 3278 const Py_UNICODE *unicode, Py_ssize_t size, 3279 Py_ssize_t startpos, Py_ssize_t endpos, 3280 const char *reason) 3281{ 3282 if (*exceptionObject == NULL) { 3283 *exceptionObject = PyUnicodeEncodeError_Create( 3284 encoding, unicode, size, startpos, endpos, reason); 3285 } 3286 else { 3287 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3288 goto onError; 3289 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3290 goto onError; 3291 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3292 goto onError; 3293 return; 3294 onError: 3295 Py_DECREF(*exceptionObject); 3296 *exceptionObject = NULL; 3297 } 3298} 3299 3300/* raises a UnicodeEncodeError */ 3301static void raise_encode_exception(PyObject **exceptionObject, 3302 const char *encoding, 3303 const Py_UNICODE *unicode, Py_ssize_t size, 3304 Py_ssize_t startpos, Py_ssize_t endpos, 3305 const char *reason) 3306{ 3307 make_encode_exception(exceptionObject, 3308 encoding, unicode, size, startpos, endpos, reason); 3309 if (*exceptionObject != NULL) 3310 PyCodec_StrictErrors(*exceptionObject); 3311} 3312 3313/* error handling callback helper: 3314 build arguments, call the callback and check the arguments, 3315 put the result into newpos and return the replacement string, which 3316 has to be freed by the caller */ 3317static PyObject *unicode_encode_call_errorhandler(const char *errors, 3318 PyObject **errorHandler, 3319 const char *encoding, const char *reason, 3320 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3321 Py_ssize_t startpos, Py_ssize_t endpos, 3322 Py_ssize_t *newpos) 3323{ 3324 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3325 3326 PyObject *restuple; 3327 PyObject *resunicode; 3328 3329 if (*errorHandler == NULL) { 3330 *errorHandler = PyCodec_LookupError(errors); 3331 if (*errorHandler == NULL) 3332 return NULL; 3333 } 3334 3335 make_encode_exception(exceptionObject, 3336 encoding, unicode, size, startpos, endpos, reason); 3337 if (*exceptionObject == NULL) 3338 return NULL; 3339 3340 restuple = PyObject_CallFunctionObjArgs( 3341 *errorHandler, *exceptionObject, NULL); 3342 if (restuple == NULL) 3343 return NULL; 3344 if (!PyTuple_Check(restuple)) { 3345 PyErr_Format(PyExc_TypeError, &argparse[4]); 3346 Py_DECREF(restuple); 3347 return NULL; 3348 } 3349 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3350 &resunicode, newpos)) { 3351 Py_DECREF(restuple); 3352 return NULL; 3353 } 3354 if (*newpos<0) 3355 *newpos = size+*newpos; 3356 if (*newpos<0 || *newpos>size) { 3357 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3358 Py_DECREF(restuple); 3359 return NULL; 3360 } 3361 Py_INCREF(resunicode); 3362 Py_DECREF(restuple); 3363 return resunicode; 3364} 3365 3366static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3367 Py_ssize_t size, 3368 const char *errors, 3369 int limit) 3370{ 3371 /* output object */ 3372 PyObject *res; 3373 /* pointers to the beginning and end+1 of input */ 3374 const Py_UNICODE *startp = p; 3375 const Py_UNICODE *endp = p + size; 3376 /* pointer to the beginning of the unencodable characters */ 3377 /* const Py_UNICODE *badp = NULL; */ 3378 /* pointer into the output */ 3379 char *str; 3380 /* current output position */ 3381 Py_ssize_t respos = 0; 3382 Py_ssize_t ressize; 3383 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 3384 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3385 PyObject *errorHandler = NULL; 3386 PyObject *exc = NULL; 3387 /* the following variable is used for caching string comparisons 3388 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3389 int known_errorHandler = -1; 3390 3391 /* allocate enough for a simple encoding without 3392 replacements, if we need more, we'll resize */ 3393 res = PyBytes_FromStringAndSize(NULL, size); 3394 if (res == NULL) 3395 goto onError; 3396 if (size == 0) 3397 return res; 3398 str = PyBytes_AS_STRING(res); 3399 ressize = size; 3400 3401 while (p<endp) { 3402 Py_UNICODE c = *p; 3403 3404 /* can we encode this? */ 3405 if (c<limit) { 3406 /* no overflow check, because we know that the space is enough */ 3407 *str++ = (char)c; 3408 ++p; 3409 } 3410 else { 3411 Py_ssize_t unicodepos = p-startp; 3412 Py_ssize_t requiredsize; 3413 PyObject *repunicode; 3414 Py_ssize_t repsize; 3415 Py_ssize_t newpos; 3416 Py_ssize_t respos; 3417 Py_UNICODE *uni2; 3418 /* startpos for collecting unencodable chars */ 3419 const Py_UNICODE *collstart = p; 3420 const Py_UNICODE *collend = p; 3421 /* find all unecodable characters */ 3422 while ((collend < endp) && ((*collend)>=limit)) 3423 ++collend; 3424 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 3425 if (known_errorHandler==-1) { 3426 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3427 known_errorHandler = 1; 3428 else if (!strcmp(errors, "replace")) 3429 known_errorHandler = 2; 3430 else if (!strcmp(errors, "ignore")) 3431 known_errorHandler = 3; 3432 else if (!strcmp(errors, "xmlcharrefreplace")) 3433 known_errorHandler = 4; 3434 else 3435 known_errorHandler = 0; 3436 } 3437 switch (known_errorHandler) { 3438 case 1: /* strict */ 3439 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3440 goto onError; 3441 case 2: /* replace */ 3442 while (collstart++<collend) 3443 *str++ = '?'; /* fall through */ 3444 case 3: /* ignore */ 3445 p = collend; 3446 break; 3447 case 4: /* xmlcharrefreplace */ 3448 respos = str - PyBytes_AS_STRING(res); 3449 /* determine replacement size (temporarily (mis)uses p) */ 3450 for (p = collstart, repsize = 0; p < collend; ++p) { 3451 if (*p<10) 3452 repsize += 2+1+1; 3453 else if (*p<100) 3454 repsize += 2+2+1; 3455 else if (*p<1000) 3456 repsize += 2+3+1; 3457 else if (*p<10000) 3458 repsize += 2+4+1; 3459#ifndef Py_UNICODE_WIDE 3460 else 3461 repsize += 2+5+1; 3462#else 3463 else if (*p<100000) 3464 repsize += 2+5+1; 3465 else if (*p<1000000) 3466 repsize += 2+6+1; 3467 else 3468 repsize += 2+7+1; 3469#endif 3470 } 3471 requiredsize = respos+repsize+(endp-collend); 3472 if (requiredsize > ressize) { 3473 if (requiredsize<2*ressize) 3474 requiredsize = 2*ressize; 3475 if (PyBytes_Resize(res, requiredsize)) 3476 goto onError; 3477 str = PyBytes_AS_STRING(res) + respos; 3478 ressize = requiredsize; 3479 } 3480 /* generate replacement (temporarily (mis)uses p) */ 3481 for (p = collstart; p < collend; ++p) { 3482 str += sprintf(str, "&#%d;", (int)*p); 3483 } 3484 p = collend; 3485 break; 3486 default: 3487 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3488 encoding, reason, startp, size, &exc, 3489 collstart-startp, collend-startp, &newpos); 3490 if (repunicode == NULL) 3491 goto onError; 3492 /* need more space? (at least enough for what we 3493 have+the replacement+the rest of the string, so 3494 we won't have to check space for encodable characters) */ 3495 respos = str - PyBytes_AS_STRING(res); 3496 repsize = PyUnicode_GET_SIZE(repunicode); 3497 requiredsize = respos+repsize+(endp-collend); 3498 if (requiredsize > ressize) { 3499 if (requiredsize<2*ressize) 3500 requiredsize = 2*ressize; 3501 if (PyBytes_Resize(res, requiredsize)) { 3502 Py_DECREF(repunicode); 3503 goto onError; 3504 } 3505 str = PyBytes_AS_STRING(res) + respos; 3506 ressize = requiredsize; 3507 } 3508 /* check if there is anything unencodable in the replacement 3509 and copy it to the output */ 3510 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3511 c = *uni2; 3512 if (c >= limit) { 3513 raise_encode_exception(&exc, encoding, startp, size, 3514 unicodepos, unicodepos+1, reason); 3515 Py_DECREF(repunicode); 3516 goto onError; 3517 } 3518 *str = (char)c; 3519 } 3520 p = startp + newpos; 3521 Py_DECREF(repunicode); 3522 } 3523 } 3524 } 3525 /* Resize if we allocated to much */ 3526 respos = str - PyBytes_AS_STRING(res); 3527 if (respos<ressize) 3528 /* If this falls res will be NULL */ 3529 PyBytes_Resize(res, respos); 3530 Py_XDECREF(errorHandler); 3531 Py_XDECREF(exc); 3532 return res; 3533 3534 onError: 3535 Py_XDECREF(res); 3536 Py_XDECREF(errorHandler); 3537 Py_XDECREF(exc); 3538 return NULL; 3539} 3540 3541PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3542 Py_ssize_t size, 3543 const char *errors) 3544{ 3545 return unicode_encode_ucs1(p, size, errors, 256); 3546} 3547 3548PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3549{ 3550 if (!PyUnicode_Check(unicode)) { 3551 PyErr_BadArgument(); 3552 return NULL; 3553 } 3554 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3555 PyUnicode_GET_SIZE(unicode), 3556 NULL); 3557} 3558 3559/* --- 7-bit ASCII Codec -------------------------------------------------- */ 3560 3561PyObject *PyUnicode_DecodeASCII(const char *s, 3562 Py_ssize_t size, 3563 const char *errors) 3564{ 3565 const char *starts = s; 3566 PyUnicodeObject *v; 3567 Py_UNICODE *p; 3568 Py_ssize_t startinpos; 3569 Py_ssize_t endinpos; 3570 Py_ssize_t outpos; 3571 const char *e; 3572 PyObject *errorHandler = NULL; 3573 PyObject *exc = NULL; 3574 3575 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3576 if (size == 1 && *(unsigned char*)s < 128) { 3577 Py_UNICODE r = *(unsigned char*)s; 3578 return PyUnicode_FromUnicode(&r, 1); 3579 } 3580 3581 v = _PyUnicode_New(size); 3582 if (v == NULL) 3583 goto onError; 3584 if (size == 0) 3585 return (PyObject *)v; 3586 p = PyUnicode_AS_UNICODE(v); 3587 e = s + size; 3588 while (s < e) { 3589 register unsigned char c = (unsigned char)*s; 3590 if (c < 128) { 3591 *p++ = c; 3592 ++s; 3593 } 3594 else { 3595 startinpos = s-starts; 3596 endinpos = startinpos + 1; 3597 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3598 if (unicode_decode_call_errorhandler( 3599 errors, &errorHandler, 3600 "ascii", "ordinal not in range(128)", 3601 &starts, &e, &startinpos, &endinpos, &exc, &s, 3602 (PyObject **)&v, &outpos, &p)) 3603 goto onError; 3604 } 3605 } 3606 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3607 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3608 goto onError; 3609 Py_XDECREF(errorHandler); 3610 Py_XDECREF(exc); 3611 return (PyObject *)v; 3612 3613 onError: 3614 Py_XDECREF(v); 3615 Py_XDECREF(errorHandler); 3616 Py_XDECREF(exc); 3617 return NULL; 3618} 3619 3620PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3621 Py_ssize_t size, 3622 const char *errors) 3623{ 3624 return unicode_encode_ucs1(p, size, errors, 128); 3625} 3626 3627PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3628{ 3629 if (!PyUnicode_Check(unicode)) { 3630 PyErr_BadArgument(); 3631 return NULL; 3632 } 3633 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3634 PyUnicode_GET_SIZE(unicode), 3635 NULL); 3636} 3637 3638#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3639 3640/* --- MBCS codecs for Windows -------------------------------------------- */ 3641 3642#if SIZEOF_INT < SIZEOF_SSIZE_T 3643#define NEED_RETRY 3644#endif 3645 3646/* XXX This code is limited to "true" double-byte encodings, as 3647 a) it assumes an incomplete character consists of a single byte, and 3648 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 3649 encodings, see IsDBCSLeadByteEx documentation. */ 3650 3651static int is_dbcs_lead_byte(const char *s, int offset) 3652{ 3653 const char *curr = s + offset; 3654 3655 if (IsDBCSLeadByte(*curr)) { 3656 const char *prev = CharPrev(s, curr); 3657 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3658 } 3659 return 0; 3660} 3661 3662/* 3663 * Decode MBCS string into unicode object. If 'final' is set, converts 3664 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 3665 */ 3666static int decode_mbcs(PyUnicodeObject **v, 3667 const char *s, /* MBCS string */ 3668 int size, /* sizeof MBCS string */ 3669 int final) 3670{ 3671 Py_UNICODE *p; 3672 Py_ssize_t n = 0; 3673 int usize = 0; 3674 3675 assert(size >= 0); 3676 3677 /* Skip trailing lead-byte unless 'final' is set */ 3678 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3679 --size; 3680 3681 /* First get the size of the result */ 3682 if (size > 0) { 3683 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3684 if (usize == 0) { 3685 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3686 return -1; 3687 } 3688 } 3689 3690 if (*v == NULL) { 3691 /* Create unicode object */ 3692 *v = _PyUnicode_New(usize); 3693 if (*v == NULL) 3694 return -1; 3695 } 3696 else { 3697 /* Extend unicode object */ 3698 n = PyUnicode_GET_SIZE(*v); 3699 if (_PyUnicode_Resize(v, n + usize) < 0) 3700 return -1; 3701 } 3702 3703 /* Do the conversion */ 3704 if (size > 0) { 3705 p = PyUnicode_AS_UNICODE(*v) + n; 3706 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3707 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3708 return -1; 3709 } 3710 } 3711 3712 return size; 3713} 3714 3715PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3716 Py_ssize_t size, 3717 const char *errors, 3718 Py_ssize_t *consumed) 3719{ 3720 PyUnicodeObject *v = NULL; 3721 int done; 3722 3723 if (consumed) 3724 *consumed = 0; 3725 3726#ifdef NEED_RETRY 3727 retry: 3728 if (size > INT_MAX) 3729 done = decode_mbcs(&v, s, INT_MAX, 0); 3730 else 3731#endif 3732 done = decode_mbcs(&v, s, (int)size, !consumed); 3733 3734 if (done < 0) { 3735 Py_XDECREF(v); 3736 return NULL; 3737 } 3738 3739 if (consumed) 3740 *consumed += done; 3741 3742#ifdef NEED_RETRY 3743 if (size > INT_MAX) { 3744 s += done; 3745 size -= done; 3746 goto retry; 3747 } 3748#endif 3749 3750 return (PyObject *)v; 3751} 3752 3753PyObject *PyUnicode_DecodeMBCS(const char *s, 3754 Py_ssize_t size, 3755 const char *errors) 3756{ 3757 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 3758} 3759 3760/* 3761 * Convert unicode into string object (MBCS). 3762 * Returns 0 if succeed, -1 otherwise. 3763 */ 3764static int encode_mbcs(PyObject **repr, 3765 const Py_UNICODE *p, /* unicode */ 3766 int size) /* size of unicode */ 3767{ 3768 int mbcssize = 0; 3769 Py_ssize_t n = 0; 3770 3771 assert(size >= 0); 3772 3773 /* First get the size of the result */ 3774 if (size > 0) { 3775 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 3776 if (mbcssize == 0) { 3777 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3778 return -1; 3779 } 3780 } 3781 3782 if (*repr == NULL) { 3783 /* Create string object */ 3784 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 3785 if (*repr == NULL) 3786 return -1; 3787 } 3788 else { 3789 /* Extend string object */ 3790 n = PyBytes_Size(*repr); 3791 if (PyBytes_Resize(*repr, n + mbcssize) < 0) 3792 return -1; 3793 } 3794 3795 /* Do the conversion */ 3796 if (size > 0) { 3797 char *s = PyBytes_AS_STRING(*repr) + n; 3798 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 3799 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3800 return -1; 3801 } 3802 } 3803 3804 return 0; 3805} 3806 3807PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 3808 Py_ssize_t size, 3809 const char *errors) 3810{ 3811 PyObject *repr = NULL; 3812 int ret; 3813 3814#ifdef NEED_RETRY 3815 retry: 3816 if (size > INT_MAX) 3817 ret = encode_mbcs(&repr, p, INT_MAX); 3818 else 3819#endif 3820 ret = encode_mbcs(&repr, p, (int)size); 3821 3822 if (ret < 0) { 3823 Py_XDECREF(repr); 3824 return NULL; 3825 } 3826 3827#ifdef NEED_RETRY 3828 if (size > INT_MAX) { 3829 p += INT_MAX; 3830 size -= INT_MAX; 3831 goto retry; 3832 } 3833#endif 3834 3835 return repr; 3836} 3837 3838PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 3839{ 3840 if (!PyUnicode_Check(unicode)) { 3841 PyErr_BadArgument(); 3842 return NULL; 3843 } 3844 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 3845 PyUnicode_GET_SIZE(unicode), 3846 NULL); 3847} 3848 3849#undef NEED_RETRY 3850 3851#endif /* MS_WINDOWS */ 3852 3853/* --- Character Mapping Codec -------------------------------------------- */ 3854 3855PyObject *PyUnicode_DecodeCharmap(const char *s, 3856 Py_ssize_t size, 3857 PyObject *mapping, 3858 const char *errors) 3859{ 3860 const char *starts = s; 3861 Py_ssize_t startinpos; 3862 Py_ssize_t endinpos; 3863 Py_ssize_t outpos; 3864 const char *e; 3865 PyUnicodeObject *v; 3866 Py_UNICODE *p; 3867 Py_ssize_t extrachars = 0; 3868 PyObject *errorHandler = NULL; 3869 PyObject *exc = NULL; 3870 Py_UNICODE *mapstring = NULL; 3871 Py_ssize_t maplen = 0; 3872 3873 /* Default to Latin-1 */ 3874 if (mapping == NULL) 3875 return PyUnicode_DecodeLatin1(s, size, errors); 3876 3877 v = _PyUnicode_New(size); 3878 if (v == NULL) 3879 goto onError; 3880 if (size == 0) 3881 return (PyObject *)v; 3882 p = PyUnicode_AS_UNICODE(v); 3883 e = s + size; 3884 if (PyUnicode_CheckExact(mapping)) { 3885 mapstring = PyUnicode_AS_UNICODE(mapping); 3886 maplen = PyUnicode_GET_SIZE(mapping); 3887 while (s < e) { 3888 unsigned char ch = *s; 3889 Py_UNICODE x = 0xfffe; /* illegal value */ 3890 3891 if (ch < maplen) 3892 x = mapstring[ch]; 3893 3894 if (x == 0xfffe) { 3895 /* undefined mapping */ 3896 outpos = p-PyUnicode_AS_UNICODE(v); 3897 startinpos = s-starts; 3898 endinpos = startinpos+1; 3899 if (unicode_decode_call_errorhandler( 3900 errors, &errorHandler, 3901 "charmap", "character maps to <undefined>", 3902 &starts, &e, &startinpos, &endinpos, &exc, &s, 3903 (PyObject **)&v, &outpos, &p)) { 3904 goto onError; 3905 } 3906 continue; 3907 } 3908 *p++ = x; 3909 ++s; 3910 } 3911 } 3912 else { 3913 while (s < e) { 3914 unsigned char ch = *s; 3915 PyObject *w, *x; 3916 3917 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 3918 w = PyInt_FromLong((long)ch); 3919 if (w == NULL) 3920 goto onError; 3921 x = PyObject_GetItem(mapping, w); 3922 Py_DECREF(w); 3923 if (x == NULL) { 3924 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3925 /* No mapping found means: mapping is undefined. */ 3926 PyErr_Clear(); 3927 x = Py_None; 3928 Py_INCREF(x); 3929 } else 3930 goto onError; 3931 } 3932 3933 /* Apply mapping */ 3934 if (PyInt_Check(x)) { 3935 long value = PyInt_AS_LONG(x); 3936 if (value < 0 || value > 65535) { 3937 PyErr_SetString(PyExc_TypeError, 3938 "character mapping must be in range(65536)"); 3939 Py_DECREF(x); 3940 goto onError; 3941 } 3942 *p++ = (Py_UNICODE)value; 3943 } 3944 else if (x == Py_None) { 3945 /* undefined mapping */ 3946 outpos = p-PyUnicode_AS_UNICODE(v); 3947 startinpos = s-starts; 3948 endinpos = startinpos+1; 3949 if (unicode_decode_call_errorhandler( 3950 errors, &errorHandler, 3951 "charmap", "character maps to <undefined>", 3952 &starts, &e, &startinpos, &endinpos, &exc, &s, 3953 (PyObject **)&v, &outpos, &p)) { 3954 Py_DECREF(x); 3955 goto onError; 3956 } 3957 Py_DECREF(x); 3958 continue; 3959 } 3960 else if (PyUnicode_Check(x)) { 3961 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 3962 3963 if (targetsize == 1) 3964 /* 1-1 mapping */ 3965 *p++ = *PyUnicode_AS_UNICODE(x); 3966 3967 else if (targetsize > 1) { 3968 /* 1-n mapping */ 3969 if (targetsize > extrachars) { 3970 /* resize first */ 3971 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 3972 Py_ssize_t needed = (targetsize - extrachars) + \ 3973 (targetsize << 2); 3974 extrachars += needed; 3975 /* XXX overflow detection missing */ 3976 if (_PyUnicode_Resize(&v, 3977 PyUnicode_GET_SIZE(v) + needed) < 0) { 3978 Py_DECREF(x); 3979 goto onError; 3980 } 3981 p = PyUnicode_AS_UNICODE(v) + oldpos; 3982 } 3983 Py_UNICODE_COPY(p, 3984 PyUnicode_AS_UNICODE(x), 3985 targetsize); 3986 p += targetsize; 3987 extrachars -= targetsize; 3988 } 3989 /* 1-0 mapping: skip the character */ 3990 } 3991 else { 3992 /* wrong return value */ 3993 PyErr_SetString(PyExc_TypeError, 3994 "character mapping must return integer, None or unicode"); 3995 Py_DECREF(x); 3996 goto onError; 3997 } 3998 Py_DECREF(x); 3999 ++s; 4000 } 4001 } 4002 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4003 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4004 goto onError; 4005 Py_XDECREF(errorHandler); 4006 Py_XDECREF(exc); 4007 return (PyObject *)v; 4008 4009 onError: 4010 Py_XDECREF(errorHandler); 4011 Py_XDECREF(exc); 4012 Py_XDECREF(v); 4013 return NULL; 4014} 4015 4016/* Charmap encoding: the lookup table */ 4017 4018struct encoding_map{ 4019 PyObject_HEAD 4020 unsigned char level1[32]; 4021 int count2, count3; 4022 unsigned char level23[1]; 4023}; 4024 4025static PyObject* 4026encoding_map_size(PyObject *obj, PyObject* args) 4027{ 4028 struct encoding_map *map = (struct encoding_map*)obj; 4029 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4030 128*map->count3); 4031} 4032 4033static PyMethodDef encoding_map_methods[] = { 4034 {"size", encoding_map_size, METH_NOARGS, 4035 PyDoc_STR("Return the size (in bytes) of this object") }, 4036 { 0 } 4037}; 4038 4039static void 4040encoding_map_dealloc(PyObject* o) 4041{ 4042 PyObject_FREE(o); 4043} 4044 4045static PyTypeObject EncodingMapType = { 4046 PyVarObject_HEAD_INIT(NULL, 0) 4047 "EncodingMap", /*tp_name*/ 4048 sizeof(struct encoding_map), /*tp_basicsize*/ 4049 0, /*tp_itemsize*/ 4050 /* methods */ 4051 encoding_map_dealloc, /*tp_dealloc*/ 4052 0, /*tp_print*/ 4053 0, /*tp_getattr*/ 4054 0, /*tp_setattr*/ 4055 0, /*tp_compare*/ 4056 0, /*tp_repr*/ 4057 0, /*tp_as_number*/ 4058 0, /*tp_as_sequence*/ 4059 0, /*tp_as_mapping*/ 4060 0, /*tp_hash*/ 4061 0, /*tp_call*/ 4062 0, /*tp_str*/ 4063 0, /*tp_getattro*/ 4064 0, /*tp_setattro*/ 4065 0, /*tp_as_buffer*/ 4066 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4067 0, /*tp_doc*/ 4068 0, /*tp_traverse*/ 4069 0, /*tp_clear*/ 4070 0, /*tp_richcompare*/ 4071 0, /*tp_weaklistoffset*/ 4072 0, /*tp_iter*/ 4073 0, /*tp_iternext*/ 4074 encoding_map_methods, /*tp_methods*/ 4075 0, /*tp_members*/ 4076 0, /*tp_getset*/ 4077 0, /*tp_base*/ 4078 0, /*tp_dict*/ 4079 0, /*tp_descr_get*/ 4080 0, /*tp_descr_set*/ 4081 0, /*tp_dictoffset*/ 4082 0, /*tp_init*/ 4083 0, /*tp_alloc*/ 4084 0, /*tp_new*/ 4085 0, /*tp_free*/ 4086 0, /*tp_is_gc*/ 4087}; 4088 4089PyObject* 4090PyUnicode_BuildEncodingMap(PyObject* string) 4091{ 4092 Py_UNICODE *decode; 4093 PyObject *result; 4094 struct encoding_map *mresult; 4095 int i; 4096 int need_dict = 0; 4097 unsigned char level1[32]; 4098 unsigned char level2[512]; 4099 unsigned char *mlevel1, *mlevel2, *mlevel3; 4100 int count2 = 0, count3 = 0; 4101 4102 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 4103 PyErr_BadArgument(); 4104 return NULL; 4105 } 4106 decode = PyUnicode_AS_UNICODE(string); 4107 memset(level1, 0xFF, sizeof level1); 4108 memset(level2, 0xFF, sizeof level2); 4109 4110 /* If there isn't a one-to-one mapping of NULL to \0, 4111 or if there are non-BMP characters, we need to use 4112 a mapping dictionary. */ 4113 if (decode[0] != 0) 4114 need_dict = 1; 4115 for (i = 1; i < 256; i++) { 4116 int l1, l2; 4117 if (decode[i] == 0 4118 #ifdef Py_UNICODE_WIDE 4119 || decode[i] > 0xFFFF 4120 #endif 4121 ) { 4122 need_dict = 1; 4123 break; 4124 } 4125 if (decode[i] == 0xFFFE) 4126 /* unmapped character */ 4127 continue; 4128 l1 = decode[i] >> 11; 4129 l2 = decode[i] >> 7; 4130 if (level1[l1] == 0xFF) 4131 level1[l1] = count2++; 4132 if (level2[l2] == 0xFF) 4133 level2[l2] = count3++; 4134 } 4135 4136 if (count2 >= 0xFF || count3 >= 0xFF) 4137 need_dict = 1; 4138 4139 if (need_dict) { 4140 PyObject *result = PyDict_New(); 4141 PyObject *key, *value; 4142 if (!result) 4143 return NULL; 4144 for (i = 0; i < 256; i++) { 4145 key = value = NULL; 4146 key = PyInt_FromLong(decode[i]); 4147 value = PyInt_FromLong(i); 4148 if (!key || !value) 4149 goto failed1; 4150 if (PyDict_SetItem(result, key, value) == -1) 4151 goto failed1; 4152 Py_DECREF(key); 4153 Py_DECREF(value); 4154 } 4155 return result; 4156 failed1: 4157 Py_XDECREF(key); 4158 Py_XDECREF(value); 4159 Py_DECREF(result); 4160 return NULL; 4161 } 4162 4163 /* Create a three-level trie */ 4164 result = PyObject_MALLOC(sizeof(struct encoding_map) + 4165 16*count2 + 128*count3 - 1); 4166 if (!result) 4167 return PyErr_NoMemory(); 4168 PyObject_Init(result, &EncodingMapType); 4169 mresult = (struct encoding_map*)result; 4170 mresult->count2 = count2; 4171 mresult->count3 = count3; 4172 mlevel1 = mresult->level1; 4173 mlevel2 = mresult->level23; 4174 mlevel3 = mresult->level23 + 16*count2; 4175 memcpy(mlevel1, level1, 32); 4176 memset(mlevel2, 0xFF, 16*count2); 4177 memset(mlevel3, 0, 128*count3); 4178 count3 = 0; 4179 for (i = 1; i < 256; i++) { 4180 int o1, o2, o3, i2, i3; 4181 if (decode[i] == 0xFFFE) 4182 /* unmapped character */ 4183 continue; 4184 o1 = decode[i]>>11; 4185 o2 = (decode[i]>>7) & 0xF; 4186 i2 = 16*mlevel1[o1] + o2; 4187 if (mlevel2[i2] == 0xFF) 4188 mlevel2[i2] = count3++; 4189 o3 = decode[i] & 0x7F; 4190 i3 = 128*mlevel2[i2] + o3; 4191 mlevel3[i3] = i; 4192 } 4193 return result; 4194} 4195 4196static int 4197encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4198{ 4199 struct encoding_map *map = (struct encoding_map*)mapping; 4200 int l1 = c>>11; 4201 int l2 = (c>>7) & 0xF; 4202 int l3 = c & 0x7F; 4203 int i; 4204 4205#ifdef Py_UNICODE_WIDE 4206 if (c > 0xFFFF) { 4207 return -1; 4208 } 4209#endif 4210 if (c == 0) 4211 return 0; 4212 /* level 1*/ 4213 i = map->level1[l1]; 4214 if (i == 0xFF) { 4215 return -1; 4216 } 4217 /* level 2*/ 4218 i = map->level23[16*i+l2]; 4219 if (i == 0xFF) { 4220 return -1; 4221 } 4222 /* level 3 */ 4223 i = map->level23[16*map->count2 + 128*i + l3]; 4224 if (i == 0) { 4225 return -1; 4226 } 4227 return i; 4228} 4229 4230/* Lookup the character ch in the mapping. If the character 4231 can't be found, Py_None is returned (or NULL, if another 4232 error occurred). */ 4233static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4234{ 4235 PyObject *w = PyInt_FromLong((long)c); 4236 PyObject *x; 4237 4238 if (w == NULL) 4239 return NULL; 4240 x = PyObject_GetItem(mapping, w); 4241 Py_DECREF(w); 4242 if (x == NULL) { 4243 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4244 /* No mapping found means: mapping is undefined. */ 4245 PyErr_Clear(); 4246 x = Py_None; 4247 Py_INCREF(x); 4248 return x; 4249 } else 4250 return NULL; 4251 } 4252 else if (x == Py_None) 4253 return x; 4254 else if (PyInt_Check(x)) { 4255 long value = PyInt_AS_LONG(x); 4256 if (value < 0 || value > 255) { 4257 PyErr_SetString(PyExc_TypeError, 4258 "character mapping must be in range(256)"); 4259 Py_DECREF(x); 4260 return NULL; 4261 } 4262 return x; 4263 } 4264 else if (PyString_Check(x)) 4265 return x; 4266 else { 4267 /* wrong return value */ 4268 PyErr_Format(PyExc_TypeError, 4269 "character mapping must return integer, None or str8, not %.400s", 4270 x->ob_type->tp_name); 4271 Py_DECREF(x); 4272 return NULL; 4273 } 4274} 4275 4276static int 4277charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4278{ 4279 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj); 4280 /* exponentially overallocate to minimize reallocations */ 4281 if (requiredsize < 2*outsize) 4282 requiredsize = 2*outsize; 4283 if (PyBytes_Resize(outobj, requiredsize)) { 4284 Py_DECREF(outobj); 4285 return -1; 4286 } 4287 return 0; 4288} 4289 4290typedef enum charmapencode_result { 4291 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4292}charmapencode_result; 4293/* lookup the character, put the result in the output string and adjust 4294 various state variables. Resize the output bytes object if not enough 4295 space is available. Return a new reference to the object that 4296 was put in the output buffer, or Py_None, if the mapping was undefined 4297 (in which case no character was written) or NULL, if a 4298 reallocation error occurred. The caller must decref the result */ 4299static 4300charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4301 PyObject *outobj, Py_ssize_t *outpos) 4302{ 4303 PyObject *rep; 4304 char *outstart; 4305 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj); 4306 4307 if (Py_Type(mapping) == &EncodingMapType) { 4308 int res = encoding_map_lookup(c, mapping); 4309 Py_ssize_t requiredsize = *outpos+1; 4310 if (res == -1) 4311 return enc_FAILED; 4312 if (outsize<requiredsize) 4313 if (charmapencode_resize(outobj, outpos, requiredsize)) 4314 return enc_EXCEPTION; 4315 outstart = PyBytes_AS_STRING(outobj); 4316 outstart[(*outpos)++] = (char)res; 4317 return enc_SUCCESS; 4318 } 4319 4320 rep = charmapencode_lookup(c, mapping); 4321 if (rep==NULL) 4322 return enc_EXCEPTION; 4323 else if (rep==Py_None) { 4324 Py_DECREF(rep); 4325 return enc_FAILED; 4326 } else { 4327 if (PyInt_Check(rep)) { 4328 Py_ssize_t requiredsize = *outpos+1; 4329 if (outsize<requiredsize) 4330 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4331 Py_DECREF(rep); 4332 return enc_EXCEPTION; 4333 } 4334 outstart = PyBytes_AS_STRING(outobj); 4335 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 4336 } 4337 else { 4338 const char *repchars = PyString_AS_STRING(rep); 4339 Py_ssize_t repsize = PyString_GET_SIZE(rep); 4340 Py_ssize_t requiredsize = *outpos+repsize; 4341 if (outsize<requiredsize) 4342 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4343 Py_DECREF(rep); 4344 return enc_EXCEPTION; 4345 } 4346 outstart = PyBytes_AS_STRING(outobj); 4347 memcpy(outstart + *outpos, repchars, repsize); 4348 *outpos += repsize; 4349 } 4350 } 4351 Py_DECREF(rep); 4352 return enc_SUCCESS; 4353} 4354 4355/* handle an error in PyUnicode_EncodeCharmap 4356 Return 0 on success, -1 on error */ 4357static 4358int charmap_encoding_error( 4359 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4360 PyObject **exceptionObject, 4361 int *known_errorHandler, PyObject **errorHandler, const char *errors, 4362 PyObject *res, Py_ssize_t *respos) 4363{ 4364 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4365 Py_ssize_t repsize; 4366 Py_ssize_t newpos; 4367 Py_UNICODE *uni2; 4368 /* startpos for collecting unencodable chars */ 4369 Py_ssize_t collstartpos = *inpos; 4370 Py_ssize_t collendpos = *inpos+1; 4371 Py_ssize_t collpos; 4372 char *encoding = "charmap"; 4373 char *reason = "character maps to <undefined>"; 4374 charmapencode_result x; 4375 4376 /* find all unencodable characters */ 4377 while (collendpos < size) { 4378 PyObject *rep; 4379 if (Py_Type(mapping) == &EncodingMapType) { 4380 int res = encoding_map_lookup(p[collendpos], mapping); 4381 if (res != -1) 4382 break; 4383 ++collendpos; 4384 continue; 4385 } 4386 4387 rep = charmapencode_lookup(p[collendpos], mapping); 4388 if (rep==NULL) 4389 return -1; 4390 else if (rep!=Py_None) { 4391 Py_DECREF(rep); 4392 break; 4393 } 4394 Py_DECREF(rep); 4395 ++collendpos; 4396 } 4397 /* cache callback name lookup 4398 * (if not done yet, i.e. it's the first error) */ 4399 if (*known_errorHandler==-1) { 4400 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4401 *known_errorHandler = 1; 4402 else if (!strcmp(errors, "replace")) 4403 *known_errorHandler = 2; 4404 else if (!strcmp(errors, "ignore")) 4405 *known_errorHandler = 3; 4406 else if (!strcmp(errors, "xmlcharrefreplace")) 4407 *known_errorHandler = 4; 4408 else 4409 *known_errorHandler = 0; 4410 } 4411 switch (*known_errorHandler) { 4412 case 1: /* strict */ 4413 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4414 return -1; 4415 case 2: /* replace */ 4416 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4417 x = charmapencode_output('?', mapping, res, respos); 4418 if (x==enc_EXCEPTION) { 4419 return -1; 4420 } 4421 else if (x==enc_FAILED) { 4422 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4423 return -1; 4424 } 4425 } 4426 /* fall through */ 4427 case 3: /* ignore */ 4428 *inpos = collendpos; 4429 break; 4430 case 4: /* xmlcharrefreplace */ 4431 /* generate replacement (temporarily (mis)uses p) */ 4432 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4433 char buffer[2+29+1+1]; 4434 char *cp; 4435 sprintf(buffer, "&#%d;", (int)p[collpos]); 4436 for (cp = buffer; *cp; ++cp) { 4437 x = charmapencode_output(*cp, mapping, res, respos); 4438 if (x==enc_EXCEPTION) 4439 return -1; 4440 else if (x==enc_FAILED) { 4441 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4442 return -1; 4443 } 4444 } 4445 } 4446 *inpos = collendpos; 4447 break; 4448 default: 4449 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4450 encoding, reason, p, size, exceptionObject, 4451 collstartpos, collendpos, &newpos); 4452 if (repunicode == NULL) 4453 return -1; 4454 /* generate replacement */ 4455 repsize = PyUnicode_GET_SIZE(repunicode); 4456 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4457 x = charmapencode_output(*uni2, mapping, res, respos); 4458 if (x==enc_EXCEPTION) { 4459 return -1; 4460 } 4461 else if (x==enc_FAILED) { 4462 Py_DECREF(repunicode); 4463 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4464 return -1; 4465 } 4466 } 4467 *inpos = newpos; 4468 Py_DECREF(repunicode); 4469 } 4470 return 0; 4471} 4472 4473PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4474 Py_ssize_t size, 4475 PyObject *mapping, 4476 const char *errors) 4477{ 4478 /* output object */ 4479 PyObject *res = NULL; 4480 /* current input position */ 4481 Py_ssize_t inpos = 0; 4482 /* current output position */ 4483 Py_ssize_t respos = 0; 4484 PyObject *errorHandler = NULL; 4485 PyObject *exc = NULL; 4486 /* the following variable is used for caching string comparisons 4487 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4488 * 3=ignore, 4=xmlcharrefreplace */ 4489 int known_errorHandler = -1; 4490 4491 /* Default to Latin-1 */ 4492 if (mapping == NULL) 4493 return PyUnicode_EncodeLatin1(p, size, errors); 4494 4495 /* allocate enough for a simple encoding without 4496 replacements, if we need more, we'll resize */ 4497 res = PyBytes_FromStringAndSize(NULL, size); 4498 if (res == NULL) 4499 goto onError; 4500 if (size == 0) 4501 return res; 4502 4503 while (inpos<size) { 4504 /* try to encode it */ 4505 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos); 4506 if (x==enc_EXCEPTION) /* error */ 4507 goto onError; 4508 if (x==enc_FAILED) { /* unencodable character */ 4509 if (charmap_encoding_error(p, size, &inpos, mapping, 4510 &exc, 4511 &known_errorHandler, &errorHandler, errors, 4512 res, &respos)) { 4513 goto onError; 4514 } 4515 } 4516 else 4517 /* done with this character => adjust input position */ 4518 ++inpos; 4519 } 4520 4521 /* Resize if we allocated to much */ 4522 if (respos<PyBytes_GET_SIZE(res)) { 4523 if (PyBytes_Resize(res, respos)) 4524 goto onError; 4525 } 4526 Py_XDECREF(exc); 4527 Py_XDECREF(errorHandler); 4528 return res; 4529 4530 onError: 4531 Py_XDECREF(res); 4532 Py_XDECREF(exc); 4533 Py_XDECREF(errorHandler); 4534 return NULL; 4535} 4536 4537PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4538 PyObject *mapping) 4539{ 4540 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4541 PyErr_BadArgument(); 4542 return NULL; 4543 } 4544 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4545 PyUnicode_GET_SIZE(unicode), 4546 mapping, 4547 NULL); 4548} 4549 4550/* create or adjust a UnicodeTranslateError */ 4551static void make_translate_exception(PyObject **exceptionObject, 4552 const Py_UNICODE *unicode, Py_ssize_t size, 4553 Py_ssize_t startpos, Py_ssize_t endpos, 4554 const char *reason) 4555{ 4556 if (*exceptionObject == NULL) { 4557 *exceptionObject = PyUnicodeTranslateError_Create( 4558 unicode, size, startpos, endpos, reason); 4559 } 4560 else { 4561 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4562 goto onError; 4563 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4564 goto onError; 4565 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4566 goto onError; 4567 return; 4568 onError: 4569 Py_DECREF(*exceptionObject); 4570 *exceptionObject = NULL; 4571 } 4572} 4573 4574/* raises a UnicodeTranslateError */ 4575static void raise_translate_exception(PyObject **exceptionObject, 4576 const Py_UNICODE *unicode, Py_ssize_t size, 4577 Py_ssize_t startpos, Py_ssize_t endpos, 4578 const char *reason) 4579{ 4580 make_translate_exception(exceptionObject, 4581 unicode, size, startpos, endpos, reason); 4582 if (*exceptionObject != NULL) 4583 PyCodec_StrictErrors(*exceptionObject); 4584} 4585 4586/* error handling callback helper: 4587 build arguments, call the callback and check the arguments, 4588 put the result into newpos and return the replacement string, which 4589 has to be freed by the caller */ 4590static PyObject *unicode_translate_call_errorhandler(const char *errors, 4591 PyObject **errorHandler, 4592 const char *reason, 4593 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4594 Py_ssize_t startpos, Py_ssize_t endpos, 4595 Py_ssize_t *newpos) 4596{ 4597 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4598 4599 Py_ssize_t i_newpos; 4600 PyObject *restuple; 4601 PyObject *resunicode; 4602 4603 if (*errorHandler == NULL) { 4604 *errorHandler = PyCodec_LookupError(errors); 4605 if (*errorHandler == NULL) 4606 return NULL; 4607 } 4608 4609 make_translate_exception(exceptionObject, 4610 unicode, size, startpos, endpos, reason); 4611 if (*exceptionObject == NULL) 4612 return NULL; 4613 4614 restuple = PyObject_CallFunctionObjArgs( 4615 *errorHandler, *exceptionObject, NULL); 4616 if (restuple == NULL) 4617 return NULL; 4618 if (!PyTuple_Check(restuple)) { 4619 PyErr_Format(PyExc_TypeError, &argparse[4]); 4620 Py_DECREF(restuple); 4621 return NULL; 4622 } 4623 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4624 &resunicode, &i_newpos)) { 4625 Py_DECREF(restuple); 4626 return NULL; 4627 } 4628 if (i_newpos<0) 4629 *newpos = size+i_newpos; 4630 else 4631 *newpos = i_newpos; 4632 if (*newpos<0 || *newpos>size) { 4633 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4634 Py_DECREF(restuple); 4635 return NULL; 4636 } 4637 Py_INCREF(resunicode); 4638 Py_DECREF(restuple); 4639 return resunicode; 4640} 4641 4642/* Lookup the character ch in the mapping and put the result in result, 4643 which must be decrefed by the caller. 4644 Return 0 on success, -1 on error */ 4645static 4646int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4647{ 4648 PyObject *w = PyInt_FromLong((long)c); 4649 PyObject *x; 4650 4651 if (w == NULL) 4652 return -1; 4653 x = PyObject_GetItem(mapping, w); 4654 Py_DECREF(w); 4655 if (x == NULL) { 4656 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4657 /* No mapping found means: use 1:1 mapping. */ 4658 PyErr_Clear(); 4659 *result = NULL; 4660 return 0; 4661 } else 4662 return -1; 4663 } 4664 else if (x == Py_None) { 4665 *result = x; 4666 return 0; 4667 } 4668 else if (PyInt_Check(x)) { 4669 long value = PyInt_AS_LONG(x); 4670 long max = PyUnicode_GetMax(); 4671 if (value < 0 || value > max) { 4672 PyErr_Format(PyExc_TypeError, 4673 "character mapping must be in range(0x%lx)", max+1); 4674 Py_DECREF(x); 4675 return -1; 4676 } 4677 *result = x; 4678 return 0; 4679 } 4680 else if (PyUnicode_Check(x)) { 4681 *result = x; 4682 return 0; 4683 } 4684 else { 4685 /* wrong return value */ 4686 PyErr_SetString(PyExc_TypeError, 4687 "character mapping must return integer, None or unicode"); 4688 Py_DECREF(x); 4689 return -1; 4690 } 4691} 4692/* ensure that *outobj is at least requiredsize characters long, 4693if not reallocate and adjust various state variables. 4694Return 0 on success, -1 on error */ 4695static 4696int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4697 Py_ssize_t requiredsize) 4698{ 4699 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4700 if (requiredsize > oldsize) { 4701 /* remember old output position */ 4702 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4703 /* exponentially overallocate to minimize reallocations */ 4704 if (requiredsize < 2 * oldsize) 4705 requiredsize = 2 * oldsize; 4706 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 4707 return -1; 4708 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4709 } 4710 return 0; 4711} 4712/* lookup the character, put the result in the output string and adjust 4713 various state variables. Return a new reference to the object that 4714 was put in the output buffer in *result, or Py_None, if the mapping was 4715 undefined (in which case no character was written). 4716 The called must decref result. 4717 Return 0 on success, -1 on error. */ 4718static 4719int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 4720 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 4721 PyObject **res) 4722{ 4723 if (charmaptranslate_lookup(*curinp, mapping, res)) 4724 return -1; 4725 if (*res==NULL) { 4726 /* not found => default to 1:1 mapping */ 4727 *(*outp)++ = *curinp; 4728 } 4729 else if (*res==Py_None) 4730 ; 4731 else if (PyInt_Check(*res)) { 4732 /* no overflow check, because we know that the space is enough */ 4733 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 4734 } 4735 else if (PyUnicode_Check(*res)) { 4736 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 4737 if (repsize==1) { 4738 /* no overflow check, because we know that the space is enough */ 4739 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 4740 } 4741 else if (repsize!=0) { 4742 /* more than one character */ 4743 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 4744 (insize - (curinp-startinp)) + 4745 repsize - 1; 4746 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 4747 return -1; 4748 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 4749 *outp += repsize; 4750 } 4751 } 4752 else 4753 return -1; 4754 return 0; 4755} 4756 4757PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 4758 Py_ssize_t size, 4759 PyObject *mapping, 4760 const char *errors) 4761{ 4762 /* output object */ 4763 PyObject *res = NULL; 4764 /* pointers to the beginning and end+1 of input */ 4765 const Py_UNICODE *startp = p; 4766 const Py_UNICODE *endp = p + size; 4767 /* pointer into the output */ 4768 Py_UNICODE *str; 4769 /* current output position */ 4770 Py_ssize_t respos = 0; 4771 char *reason = "character maps to <undefined>"; 4772 PyObject *errorHandler = NULL; 4773 PyObject *exc = NULL; 4774 /* the following variable is used for caching string comparisons 4775 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4776 * 3=ignore, 4=xmlcharrefreplace */ 4777 int known_errorHandler = -1; 4778 4779 if (mapping == NULL) { 4780 PyErr_BadArgument(); 4781 return NULL; 4782 } 4783 4784 /* allocate enough for a simple 1:1 translation without 4785 replacements, if we need more, we'll resize */ 4786 res = PyUnicode_FromUnicode(NULL, size); 4787 if (res == NULL) 4788 goto onError; 4789 if (size == 0) 4790 return res; 4791 str = PyUnicode_AS_UNICODE(res); 4792 4793 while (p<endp) { 4794 /* try to encode it */ 4795 PyObject *x = NULL; 4796 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 4797 Py_XDECREF(x); 4798 goto onError; 4799 } 4800 Py_XDECREF(x); 4801 if (x!=Py_None) /* it worked => adjust input pointer */ 4802 ++p; 4803 else { /* untranslatable character */ 4804 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4805 Py_ssize_t repsize; 4806 Py_ssize_t newpos; 4807 Py_UNICODE *uni2; 4808 /* startpos for collecting untranslatable chars */ 4809 const Py_UNICODE *collstart = p; 4810 const Py_UNICODE *collend = p+1; 4811 const Py_UNICODE *coll; 4812 4813 /* find all untranslatable characters */ 4814 while (collend < endp) { 4815 if (charmaptranslate_lookup(*collend, mapping, &x)) 4816 goto onError; 4817 Py_XDECREF(x); 4818 if (x!=Py_None) 4819 break; 4820 ++collend; 4821 } 4822 /* cache callback name lookup 4823 * (if not done yet, i.e. it's the first error) */ 4824 if (known_errorHandler==-1) { 4825 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4826 known_errorHandler = 1; 4827 else if (!strcmp(errors, "replace")) 4828 known_errorHandler = 2; 4829 else if (!strcmp(errors, "ignore")) 4830 known_errorHandler = 3; 4831 else if (!strcmp(errors, "xmlcharrefreplace")) 4832 known_errorHandler = 4; 4833 else 4834 known_errorHandler = 0; 4835 } 4836 switch (known_errorHandler) { 4837 case 1: /* strict */ 4838 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 4839 goto onError; 4840 case 2: /* replace */ 4841 /* No need to check for space, this is a 1:1 replacement */ 4842 for (coll = collstart; coll<collend; ++coll) 4843 *str++ = '?'; 4844 /* fall through */ 4845 case 3: /* ignore */ 4846 p = collend; 4847 break; 4848 case 4: /* xmlcharrefreplace */ 4849 /* generate replacement (temporarily (mis)uses p) */ 4850 for (p = collstart; p < collend; ++p) { 4851 char buffer[2+29+1+1]; 4852 char *cp; 4853 sprintf(buffer, "&#%d;", (int)*p); 4854 if (charmaptranslate_makespace(&res, &str, 4855 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 4856 goto onError; 4857 for (cp = buffer; *cp; ++cp) 4858 *str++ = *cp; 4859 } 4860 p = collend; 4861 break; 4862 default: 4863 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 4864 reason, startp, size, &exc, 4865 collstart-startp, collend-startp, &newpos); 4866 if (repunicode == NULL) 4867 goto onError; 4868 /* generate replacement */ 4869 repsize = PyUnicode_GET_SIZE(repunicode); 4870 if (charmaptranslate_makespace(&res, &str, 4871 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 4872 Py_DECREF(repunicode); 4873 goto onError; 4874 } 4875 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 4876 *str++ = *uni2; 4877 p = startp + newpos; 4878 Py_DECREF(repunicode); 4879 } 4880 } 4881 } 4882 /* Resize if we allocated to much */ 4883 respos = str-PyUnicode_AS_UNICODE(res); 4884 if (respos<PyUnicode_GET_SIZE(res)) { 4885 if (_PyUnicode_Resize(&res, respos) < 0) 4886 goto onError; 4887 } 4888 Py_XDECREF(exc); 4889 Py_XDECREF(errorHandler); 4890 return res; 4891 4892 onError: 4893 Py_XDECREF(res); 4894 Py_XDECREF(exc); 4895 Py_XDECREF(errorHandler); 4896 return NULL; 4897} 4898 4899PyObject *PyUnicode_Translate(PyObject *str, 4900 PyObject *mapping, 4901 const char *errors) 4902{ 4903 PyObject *result; 4904 4905 str = PyUnicode_FromObject(str); 4906 if (str == NULL) 4907 goto onError; 4908 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 4909 PyUnicode_GET_SIZE(str), 4910 mapping, 4911 errors); 4912 Py_DECREF(str); 4913 return result; 4914 4915 onError: 4916 Py_XDECREF(str); 4917 return NULL; 4918} 4919 4920/* --- Decimal Encoder ---------------------------------------------------- */ 4921 4922int PyUnicode_EncodeDecimal(Py_UNICODE *s, 4923 Py_ssize_t length, 4924 char *output, 4925 const char *errors) 4926{ 4927 Py_UNICODE *p, *end; 4928 PyObject *errorHandler = NULL; 4929 PyObject *exc = NULL; 4930 const char *encoding = "decimal"; 4931 const char *reason = "invalid decimal Unicode string"; 4932 /* the following variable is used for caching string comparisons 4933 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4934 int known_errorHandler = -1; 4935 4936 if (output == NULL) { 4937 PyErr_BadArgument(); 4938 return -1; 4939 } 4940 4941 p = s; 4942 end = s + length; 4943 while (p < end) { 4944 register Py_UNICODE ch = *p; 4945 int decimal; 4946 PyObject *repunicode; 4947 Py_ssize_t repsize; 4948 Py_ssize_t newpos; 4949 Py_UNICODE *uni2; 4950 Py_UNICODE *collstart; 4951 Py_UNICODE *collend; 4952 4953 if (Py_UNICODE_ISSPACE(ch)) { 4954 *output++ = ' '; 4955 ++p; 4956 continue; 4957 } 4958 decimal = Py_UNICODE_TODECIMAL(ch); 4959 if (decimal >= 0) { 4960 *output++ = '0' + decimal; 4961 ++p; 4962 continue; 4963 } 4964 if (0 < ch && ch < 256) { 4965 *output++ = (char)ch; 4966 ++p; 4967 continue; 4968 } 4969 /* All other characters are considered unencodable */ 4970 collstart = p; 4971 collend = p+1; 4972 while (collend < end) { 4973 if ((0 < *collend && *collend < 256) || 4974 !Py_UNICODE_ISSPACE(*collend) || 4975 Py_UNICODE_TODECIMAL(*collend)) 4976 break; 4977 } 4978 /* cache callback name lookup 4979 * (if not done yet, i.e. it's the first error) */ 4980 if (known_errorHandler==-1) { 4981 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4982 known_errorHandler = 1; 4983 else if (!strcmp(errors, "replace")) 4984 known_errorHandler = 2; 4985 else if (!strcmp(errors, "ignore")) 4986 known_errorHandler = 3; 4987 else if (!strcmp(errors, "xmlcharrefreplace")) 4988 known_errorHandler = 4; 4989 else 4990 known_errorHandler = 0; 4991 } 4992 switch (known_errorHandler) { 4993 case 1: /* strict */ 4994 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 4995 goto onError; 4996 case 2: /* replace */ 4997 for (p = collstart; p < collend; ++p) 4998 *output++ = '?'; 4999 /* fall through */ 5000 case 3: /* ignore */ 5001 p = collend; 5002 break; 5003 case 4: /* xmlcharrefreplace */ 5004 /* generate replacement (temporarily (mis)uses p) */ 5005 for (p = collstart; p < collend; ++p) 5006 output += sprintf(output, "&#%d;", (int)*p); 5007 p = collend; 5008 break; 5009 default: 5010 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5011 encoding, reason, s, length, &exc, 5012 collstart-s, collend-s, &newpos); 5013 if (repunicode == NULL) 5014 goto onError; 5015 /* generate replacement */ 5016 repsize = PyUnicode_GET_SIZE(repunicode); 5017 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5018 Py_UNICODE ch = *uni2; 5019 if (Py_UNICODE_ISSPACE(ch)) 5020 *output++ = ' '; 5021 else { 5022 decimal = Py_UNICODE_TODECIMAL(ch); 5023 if (decimal >= 0) 5024 *output++ = '0' + decimal; 5025 else if (0 < ch && ch < 256) 5026 *output++ = (char)ch; 5027 else { 5028 Py_DECREF(repunicode); 5029 raise_encode_exception(&exc, encoding, 5030 s, length, collstart-s, collend-s, reason); 5031 goto onError; 5032 } 5033 } 5034 } 5035 p = s + newpos; 5036 Py_DECREF(repunicode); 5037 } 5038 } 5039 /* 0-terminate the output string */ 5040 *output++ = '\0'; 5041 Py_XDECREF(exc); 5042 Py_XDECREF(errorHandler); 5043 return 0; 5044 5045 onError: 5046 Py_XDECREF(exc); 5047 Py_XDECREF(errorHandler); 5048 return -1; 5049} 5050 5051/* --- Helpers ------------------------------------------------------------ */ 5052 5053#include "stringlib/unicodedefs.h" 5054 5055#include "stringlib/fastsearch.h" 5056 5057#include "stringlib/count.h" 5058#include "stringlib/find.h" 5059#include "stringlib/partition.h" 5060 5061/* helper macro to fixup start/end slice values */ 5062#define FIX_START_END(obj) \ 5063 if (start < 0) \ 5064 start += (obj)->length; \ 5065 if (start < 0) \ 5066 start = 0; \ 5067 if (end > (obj)->length) \ 5068 end = (obj)->length; \ 5069 if (end < 0) \ 5070 end += (obj)->length; \ 5071 if (end < 0) \ 5072 end = 0; 5073 5074Py_ssize_t PyUnicode_Count(PyObject *str, 5075 PyObject *substr, 5076 Py_ssize_t start, 5077 Py_ssize_t end) 5078{ 5079 Py_ssize_t result; 5080 PyUnicodeObject* str_obj; 5081 PyUnicodeObject* sub_obj; 5082 5083 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5084 if (!str_obj) 5085 return -1; 5086 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5087 if (!sub_obj) { 5088 Py_DECREF(str_obj); 5089 return -1; 5090 } 5091 5092 FIX_START_END(str_obj); 5093 5094 result = stringlib_count( 5095 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5096 ); 5097 5098 Py_DECREF(sub_obj); 5099 Py_DECREF(str_obj); 5100 5101 return result; 5102} 5103 5104Py_ssize_t PyUnicode_Find(PyObject *str, 5105 PyObject *sub, 5106 Py_ssize_t start, 5107 Py_ssize_t end, 5108 int direction) 5109{ 5110 Py_ssize_t result; 5111 5112 str = PyUnicode_FromObject(str); 5113 if (!str) 5114 return -2; 5115 sub = PyUnicode_FromObject(sub); 5116 if (!sub) { 5117 Py_DECREF(str); 5118 return -2; 5119 } 5120 5121 if (direction > 0) 5122 result = stringlib_find_slice( 5123 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5124 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5125 start, end 5126 ); 5127 else 5128 result = stringlib_rfind_slice( 5129 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5130 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5131 start, end 5132 ); 5133 5134 Py_DECREF(str); 5135 Py_DECREF(sub); 5136 5137 return result; 5138} 5139 5140static 5141int tailmatch(PyUnicodeObject *self, 5142 PyUnicodeObject *substring, 5143 Py_ssize_t start, 5144 Py_ssize_t end, 5145 int direction) 5146{ 5147 if (substring->length == 0) 5148 return 1; 5149 5150 FIX_START_END(self); 5151 5152 end -= substring->length; 5153 if (end < start) 5154 return 0; 5155 5156 if (direction > 0) { 5157 if (Py_UNICODE_MATCH(self, end, substring)) 5158 return 1; 5159 } else { 5160 if (Py_UNICODE_MATCH(self, start, substring)) 5161 return 1; 5162 } 5163 5164 return 0; 5165} 5166 5167Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5168 PyObject *substr, 5169 Py_ssize_t start, 5170 Py_ssize_t end, 5171 int direction) 5172{ 5173 Py_ssize_t result; 5174 5175 str = PyUnicode_FromObject(str); 5176 if (str == NULL) 5177 return -1; 5178 substr = PyUnicode_FromObject(substr); 5179 if (substr == NULL) { 5180 Py_DECREF(str); 5181 return -1; 5182 } 5183 5184 result = tailmatch((PyUnicodeObject *)str, 5185 (PyUnicodeObject *)substr, 5186 start, end, direction); 5187 Py_DECREF(str); 5188 Py_DECREF(substr); 5189 return result; 5190} 5191 5192/* Apply fixfct filter to the Unicode object self and return a 5193 reference to the modified object */ 5194 5195static 5196PyObject *fixup(PyUnicodeObject *self, 5197 int (*fixfct)(PyUnicodeObject *s)) 5198{ 5199 5200 PyUnicodeObject *u; 5201 5202 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5203 if (u == NULL) 5204 return NULL; 5205 5206 Py_UNICODE_COPY(u->str, self->str, self->length); 5207 5208 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5209 /* fixfct should return TRUE if it modified the buffer. If 5210 FALSE, return a reference to the original buffer instead 5211 (to save space, not time) */ 5212 Py_INCREF(self); 5213 Py_DECREF(u); 5214 return (PyObject*) self; 5215 } 5216 return (PyObject*) u; 5217} 5218 5219static 5220int fixupper(PyUnicodeObject *self) 5221{ 5222 Py_ssize_t len = self->length; 5223 Py_UNICODE *s = self->str; 5224 int status = 0; 5225 5226 while (len-- > 0) { 5227 register Py_UNICODE ch; 5228 5229 ch = Py_UNICODE_TOUPPER(*s); 5230 if (ch != *s) { 5231 status = 1; 5232 *s = ch; 5233 } 5234 s++; 5235 } 5236 5237 return status; 5238} 5239 5240static 5241int fixlower(PyUnicodeObject *self) 5242{ 5243 Py_ssize_t len = self->length; 5244 Py_UNICODE *s = self->str; 5245 int status = 0; 5246 5247 while (len-- > 0) { 5248 register Py_UNICODE ch; 5249 5250 ch = Py_UNICODE_TOLOWER(*s); 5251 if (ch != *s) { 5252 status = 1; 5253 *s = ch; 5254 } 5255 s++; 5256 } 5257 5258 return status; 5259} 5260 5261static 5262int fixswapcase(PyUnicodeObject *self) 5263{ 5264 Py_ssize_t len = self->length; 5265 Py_UNICODE *s = self->str; 5266 int status = 0; 5267 5268 while (len-- > 0) { 5269 if (Py_UNICODE_ISUPPER(*s)) { 5270 *s = Py_UNICODE_TOLOWER(*s); 5271 status = 1; 5272 } else if (Py_UNICODE_ISLOWER(*s)) { 5273 *s = Py_UNICODE_TOUPPER(*s); 5274 status = 1; 5275 } 5276 s++; 5277 } 5278 5279 return status; 5280} 5281 5282static 5283int fixcapitalize(PyUnicodeObject *self) 5284{ 5285 Py_ssize_t len = self->length; 5286 Py_UNICODE *s = self->str; 5287 int status = 0; 5288 5289 if (len == 0) 5290 return 0; 5291 if (Py_UNICODE_ISLOWER(*s)) { 5292 *s = Py_UNICODE_TOUPPER(*s); 5293 status = 1; 5294 } 5295 s++; 5296 while (--len > 0) { 5297 if (Py_UNICODE_ISUPPER(*s)) { 5298 *s = Py_UNICODE_TOLOWER(*s); 5299 status = 1; 5300 } 5301 s++; 5302 } 5303 return status; 5304} 5305 5306static 5307int fixtitle(PyUnicodeObject *self) 5308{ 5309 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5310 register Py_UNICODE *e; 5311 int previous_is_cased; 5312 5313 /* Shortcut for single character strings */ 5314 if (PyUnicode_GET_SIZE(self) == 1) { 5315 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5316 if (*p != ch) { 5317 *p = ch; 5318 return 1; 5319 } 5320 else 5321 return 0; 5322 } 5323 5324 e = p + PyUnicode_GET_SIZE(self); 5325 previous_is_cased = 0; 5326 for (; p < e; p++) { 5327 register const Py_UNICODE ch = *p; 5328 5329 if (previous_is_cased) 5330 *p = Py_UNICODE_TOLOWER(ch); 5331 else 5332 *p = Py_UNICODE_TOTITLE(ch); 5333 5334 if (Py_UNICODE_ISLOWER(ch) || 5335 Py_UNICODE_ISUPPER(ch) || 5336 Py_UNICODE_ISTITLE(ch)) 5337 previous_is_cased = 1; 5338 else 5339 previous_is_cased = 0; 5340 } 5341 return 1; 5342} 5343 5344PyObject * 5345PyUnicode_Join(PyObject *separator, PyObject *seq) 5346{ 5347 PyObject *internal_separator = NULL; 5348 const Py_UNICODE blank = ' '; 5349 const Py_UNICODE *sep = ␣ 5350 Py_ssize_t seplen = 1; 5351 PyUnicodeObject *res = NULL; /* the result */ 5352 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5353 Py_ssize_t res_used; /* # used bytes */ 5354 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5355 PyObject *fseq; /* PySequence_Fast(seq) */ 5356 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5357 PyObject *item; 5358 Py_ssize_t i; 5359 5360 fseq = PySequence_Fast(seq, ""); 5361 if (fseq == NULL) { 5362 return NULL; 5363 } 5364 5365 /* Grrrr. A codec may be invoked to convert str objects to 5366 * Unicode, and so it's possible to call back into Python code 5367 * during PyUnicode_FromObject(), and so it's possible for a sick 5368 * codec to change the size of fseq (if seq is a list). Therefore 5369 * we have to keep refetching the size -- can't assume seqlen 5370 * is invariant. 5371 */ 5372 seqlen = PySequence_Fast_GET_SIZE(fseq); 5373 /* If empty sequence, return u"". */ 5374 if (seqlen == 0) { 5375 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5376 goto Done; 5377 } 5378 /* If singleton sequence with an exact Unicode, return that. */ 5379 if (seqlen == 1) { 5380 item = PySequence_Fast_GET_ITEM(fseq, 0); 5381 if (PyUnicode_CheckExact(item)) { 5382 Py_INCREF(item); 5383 res = (PyUnicodeObject *)item; 5384 goto Done; 5385 } 5386 } 5387 5388 /* At least two items to join, or one that isn't exact Unicode. */ 5389 if (seqlen > 1) { 5390 /* Set up sep and seplen -- they're needed. */ 5391 if (separator == NULL) { 5392 sep = ␣ 5393 seplen = 1; 5394 } 5395 else { 5396 internal_separator = PyUnicode_FromObject(separator); 5397 if (internal_separator == NULL) 5398 goto onError; 5399 sep = PyUnicode_AS_UNICODE(internal_separator); 5400 seplen = PyUnicode_GET_SIZE(internal_separator); 5401 /* In case PyUnicode_FromObject() mutated seq. */ 5402 seqlen = PySequence_Fast_GET_SIZE(fseq); 5403 } 5404 } 5405 5406 /* Get space. */ 5407 res = _PyUnicode_New(res_alloc); 5408 if (res == NULL) 5409 goto onError; 5410 res_p = PyUnicode_AS_UNICODE(res); 5411 res_used = 0; 5412 5413 for (i = 0; i < seqlen; ++i) { 5414 Py_ssize_t itemlen; 5415 Py_ssize_t new_res_used; 5416 5417 item = PySequence_Fast_GET_ITEM(fseq, i); 5418 /* Convert item to Unicode. */ 5419 if (!PyString_Check(item) && !PyUnicode_Check(item)) 5420 { 5421 if (PyBytes_Check(item)) 5422 { 5423 PyErr_Format(PyExc_TypeError, 5424 "sequence item %d: join() will not operate on " 5425 "bytes objects", i); 5426 goto onError; 5427 } 5428 item = PyObject_Unicode(item); 5429 } 5430 else 5431 item = PyUnicode_FromObject(item); 5432 5433 if (item == NULL) 5434 goto onError; 5435 /* We own a reference to item from here on. */ 5436 5437 /* In case PyUnicode_FromObject() mutated seq. */ 5438 seqlen = PySequence_Fast_GET_SIZE(fseq); 5439 5440 /* Make sure we have enough space for the separator and the item. */ 5441 itemlen = PyUnicode_GET_SIZE(item); 5442 new_res_used = res_used + itemlen; 5443 if (new_res_used < 0) 5444 goto Overflow; 5445 if (i < seqlen - 1) { 5446 new_res_used += seplen; 5447 if (new_res_used < 0) 5448 goto Overflow; 5449 } 5450 if (new_res_used > res_alloc) { 5451 /* double allocated size until it's big enough */ 5452 do { 5453 res_alloc += res_alloc; 5454 if (res_alloc <= 0) 5455 goto Overflow; 5456 } while (new_res_used > res_alloc); 5457 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5458 Py_DECREF(item); 5459 goto onError; 5460 } 5461 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5462 } 5463 5464 /* Copy item, and maybe the separator. */ 5465 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5466 res_p += itemlen; 5467 if (i < seqlen - 1) { 5468 Py_UNICODE_COPY(res_p, sep, seplen); 5469 res_p += seplen; 5470 } 5471 Py_DECREF(item); 5472 res_used = new_res_used; 5473 } 5474 5475 /* Shrink res to match the used area; this probably can't fail, 5476 * but it's cheap to check. 5477 */ 5478 if (_PyUnicode_Resize(&res, res_used) < 0) 5479 goto onError; 5480 5481 Done: 5482 Py_XDECREF(internal_separator); 5483 Py_DECREF(fseq); 5484 return (PyObject *)res; 5485 5486 Overflow: 5487 PyErr_SetString(PyExc_OverflowError, 5488 "join() result is too long for a Python string"); 5489 Py_DECREF(item); 5490 /* fall through */ 5491 5492 onError: 5493 Py_XDECREF(internal_separator); 5494 Py_DECREF(fseq); 5495 Py_XDECREF(res); 5496 return NULL; 5497} 5498 5499static 5500PyUnicodeObject *pad(PyUnicodeObject *self, 5501 Py_ssize_t left, 5502 Py_ssize_t right, 5503 Py_UNICODE fill) 5504{ 5505 PyUnicodeObject *u; 5506 5507 if (left < 0) 5508 left = 0; 5509 if (right < 0) 5510 right = 0; 5511 5512 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5513 Py_INCREF(self); 5514 return self; 5515 } 5516 5517 u = _PyUnicode_New(left + self->length + right); 5518 if (u) { 5519 if (left) 5520 Py_UNICODE_FILL(u->str, fill, left); 5521 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5522 if (right) 5523 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5524 } 5525 5526 return u; 5527} 5528 5529#define SPLIT_APPEND(data, left, right) \ 5530 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5531 if (!str) \ 5532 goto onError; \ 5533 if (PyList_Append(list, str)) { \ 5534 Py_DECREF(str); \ 5535 goto onError; \ 5536 } \ 5537 else \ 5538 Py_DECREF(str); 5539 5540static 5541PyObject *split_whitespace(PyUnicodeObject *self, 5542 PyObject *list, 5543 Py_ssize_t maxcount) 5544{ 5545 register Py_ssize_t i; 5546 register Py_ssize_t j; 5547 Py_ssize_t len = self->length; 5548 PyObject *str; 5549 5550 for (i = j = 0; i < len; ) { 5551 /* find a token */ 5552 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 5553 i++; 5554 j = i; 5555 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 5556 i++; 5557 if (j < i) { 5558 if (maxcount-- <= 0) 5559 break; 5560 SPLIT_APPEND(self->str, j, i); 5561 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 5562 i++; 5563 j = i; 5564 } 5565 } 5566 if (j < len) { 5567 SPLIT_APPEND(self->str, j, len); 5568 } 5569 return list; 5570 5571 onError: 5572 Py_DECREF(list); 5573 return NULL; 5574} 5575 5576PyObject *PyUnicode_Splitlines(PyObject *string, 5577 int keepends) 5578{ 5579 register Py_ssize_t i; 5580 register Py_ssize_t j; 5581 Py_ssize_t len; 5582 PyObject *list; 5583 PyObject *str; 5584 Py_UNICODE *data; 5585 5586 string = PyUnicode_FromObject(string); 5587 if (string == NULL) 5588 return NULL; 5589 data = PyUnicode_AS_UNICODE(string); 5590 len = PyUnicode_GET_SIZE(string); 5591 5592 list = PyList_New(0); 5593 if (!list) 5594 goto onError; 5595 5596 for (i = j = 0; i < len; ) { 5597 Py_ssize_t eol; 5598 5599 /* Find a line and append it */ 5600 while (i < len && !BLOOM_LINEBREAK(data[i])) 5601 i++; 5602 5603 /* Skip the line break reading CRLF as one line break */ 5604 eol = i; 5605 if (i < len) { 5606 if (data[i] == '\r' && i + 1 < len && 5607 data[i+1] == '\n') 5608 i += 2; 5609 else 5610 i++; 5611 if (keepends) 5612 eol = i; 5613 } 5614 SPLIT_APPEND(data, j, eol); 5615 j = i; 5616 } 5617 if (j < len) { 5618 SPLIT_APPEND(data, j, len); 5619 } 5620 5621 Py_DECREF(string); 5622 return list; 5623 5624 onError: 5625 Py_XDECREF(list); 5626 Py_DECREF(string); 5627 return NULL; 5628} 5629 5630static 5631PyObject *split_char(PyUnicodeObject *self, 5632 PyObject *list, 5633 Py_UNICODE ch, 5634 Py_ssize_t maxcount) 5635{ 5636 register Py_ssize_t i; 5637 register Py_ssize_t j; 5638 Py_ssize_t len = self->length; 5639 PyObject *str; 5640 5641 for (i = j = 0; i < len; ) { 5642 if (self->str[i] == ch) { 5643 if (maxcount-- <= 0) 5644 break; 5645 SPLIT_APPEND(self->str, j, i); 5646 i = j = i + 1; 5647 } else 5648 i++; 5649 } 5650 if (j <= len) { 5651 SPLIT_APPEND(self->str, j, len); 5652 } 5653 return list; 5654 5655 onError: 5656 Py_DECREF(list); 5657 return NULL; 5658} 5659 5660static 5661PyObject *split_substring(PyUnicodeObject *self, 5662 PyObject *list, 5663 PyUnicodeObject *substring, 5664 Py_ssize_t maxcount) 5665{ 5666 register Py_ssize_t i; 5667 register Py_ssize_t j; 5668 Py_ssize_t len = self->length; 5669 Py_ssize_t sublen = substring->length; 5670 PyObject *str; 5671 5672 for (i = j = 0; i <= len - sublen; ) { 5673 if (Py_UNICODE_MATCH(self, i, substring)) { 5674 if (maxcount-- <= 0) 5675 break; 5676 SPLIT_APPEND(self->str, j, i); 5677 i = j = i + sublen; 5678 } else 5679 i++; 5680 } 5681 if (j <= len) { 5682 SPLIT_APPEND(self->str, j, len); 5683 } 5684 return list; 5685 5686 onError: 5687 Py_DECREF(list); 5688 return NULL; 5689} 5690 5691static 5692PyObject *rsplit_whitespace(PyUnicodeObject *self, 5693 PyObject *list, 5694 Py_ssize_t maxcount) 5695{ 5696 register Py_ssize_t i; 5697 register Py_ssize_t j; 5698 Py_ssize_t len = self->length; 5699 PyObject *str; 5700 5701 for (i = j = len - 1; i >= 0; ) { 5702 /* find a token */ 5703 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 5704 i--; 5705 j = i; 5706 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) 5707 i--; 5708 if (j > i) { 5709 if (maxcount-- <= 0) 5710 break; 5711 SPLIT_APPEND(self->str, i + 1, j + 1); 5712 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 5713 i--; 5714 j = i; 5715 } 5716 } 5717 if (j >= 0) { 5718 SPLIT_APPEND(self->str, 0, j + 1); 5719 } 5720 if (PyList_Reverse(list) < 0) 5721 goto onError; 5722 return list; 5723 5724 onError: 5725 Py_DECREF(list); 5726 return NULL; 5727} 5728 5729static 5730PyObject *rsplit_char(PyUnicodeObject *self, 5731 PyObject *list, 5732 Py_UNICODE ch, 5733 Py_ssize_t maxcount) 5734{ 5735 register Py_ssize_t i; 5736 register Py_ssize_t j; 5737 Py_ssize_t len = self->length; 5738 PyObject *str; 5739 5740 for (i = j = len - 1; i >= 0; ) { 5741 if (self->str[i] == ch) { 5742 if (maxcount-- <= 0) 5743 break; 5744 SPLIT_APPEND(self->str, i + 1, j + 1); 5745 j = i = i - 1; 5746 } else 5747 i--; 5748 } 5749 if (j >= -1) { 5750 SPLIT_APPEND(self->str, 0, j + 1); 5751 } 5752 if (PyList_Reverse(list) < 0) 5753 goto onError; 5754 return list; 5755 5756 onError: 5757 Py_DECREF(list); 5758 return NULL; 5759} 5760 5761static 5762PyObject *rsplit_substring(PyUnicodeObject *self, 5763 PyObject *list, 5764 PyUnicodeObject *substring, 5765 Py_ssize_t maxcount) 5766{ 5767 register Py_ssize_t i; 5768 register Py_ssize_t j; 5769 Py_ssize_t len = self->length; 5770 Py_ssize_t sublen = substring->length; 5771 PyObject *str; 5772 5773 for (i = len - sublen, j = len; i >= 0; ) { 5774 if (Py_UNICODE_MATCH(self, i, substring)) { 5775 if (maxcount-- <= 0) 5776 break; 5777 SPLIT_APPEND(self->str, i + sublen, j); 5778 j = i; 5779 i -= sublen; 5780 } else 5781 i--; 5782 } 5783 if (j >= 0) { 5784 SPLIT_APPEND(self->str, 0, j); 5785 } 5786 if (PyList_Reverse(list) < 0) 5787 goto onError; 5788 return list; 5789 5790 onError: 5791 Py_DECREF(list); 5792 return NULL; 5793} 5794 5795#undef SPLIT_APPEND 5796 5797static 5798PyObject *split(PyUnicodeObject *self, 5799 PyUnicodeObject *substring, 5800 Py_ssize_t maxcount) 5801{ 5802 PyObject *list; 5803 5804 if (maxcount < 0) 5805 maxcount = PY_SSIZE_T_MAX; 5806 5807 list = PyList_New(0); 5808 if (!list) 5809 return NULL; 5810 5811 if (substring == NULL) 5812 return split_whitespace(self,list,maxcount); 5813 5814 else if (substring->length == 1) 5815 return split_char(self,list,substring->str[0],maxcount); 5816 5817 else if (substring->length == 0) { 5818 Py_DECREF(list); 5819 PyErr_SetString(PyExc_ValueError, "empty separator"); 5820 return NULL; 5821 } 5822 else 5823 return split_substring(self,list,substring,maxcount); 5824} 5825 5826static 5827PyObject *rsplit(PyUnicodeObject *self, 5828 PyUnicodeObject *substring, 5829 Py_ssize_t maxcount) 5830{ 5831 PyObject *list; 5832 5833 if (maxcount < 0) 5834 maxcount = PY_SSIZE_T_MAX; 5835 5836 list = PyList_New(0); 5837 if (!list) 5838 return NULL; 5839 5840 if (substring == NULL) 5841 return rsplit_whitespace(self,list,maxcount); 5842 5843 else if (substring->length == 1) 5844 return rsplit_char(self,list,substring->str[0],maxcount); 5845 5846 else if (substring->length == 0) { 5847 Py_DECREF(list); 5848 PyErr_SetString(PyExc_ValueError, "empty separator"); 5849 return NULL; 5850 } 5851 else 5852 return rsplit_substring(self,list,substring,maxcount); 5853} 5854 5855static 5856PyObject *replace(PyUnicodeObject *self, 5857 PyUnicodeObject *str1, 5858 PyUnicodeObject *str2, 5859 Py_ssize_t maxcount) 5860{ 5861 PyUnicodeObject *u; 5862 5863 if (maxcount < 0) 5864 maxcount = PY_SSIZE_T_MAX; 5865 5866 if (str1->length == str2->length) { 5867 /* same length */ 5868 Py_ssize_t i; 5869 if (str1->length == 1) { 5870 /* replace characters */ 5871 Py_UNICODE u1, u2; 5872 if (!findchar(self->str, self->length, str1->str[0])) 5873 goto nothing; 5874 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5875 if (!u) 5876 return NULL; 5877 Py_UNICODE_COPY(u->str, self->str, self->length); 5878 u1 = str1->str[0]; 5879 u2 = str2->str[0]; 5880 for (i = 0; i < u->length; i++) 5881 if (u->str[i] == u1) { 5882 if (--maxcount < 0) 5883 break; 5884 u->str[i] = u2; 5885 } 5886 } else { 5887 i = fastsearch( 5888 self->str, self->length, str1->str, str1->length, FAST_SEARCH 5889 ); 5890 if (i < 0) 5891 goto nothing; 5892 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5893 if (!u) 5894 return NULL; 5895 Py_UNICODE_COPY(u->str, self->str, self->length); 5896 while (i <= self->length - str1->length) 5897 if (Py_UNICODE_MATCH(self, i, str1)) { 5898 if (--maxcount < 0) 5899 break; 5900 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5901 i += str1->length; 5902 } else 5903 i++; 5904 } 5905 } else { 5906 5907 Py_ssize_t n, i, j, e; 5908 Py_ssize_t product, new_size, delta; 5909 Py_UNICODE *p; 5910 5911 /* replace strings */ 5912 n = stringlib_count(self->str, self->length, str1->str, str1->length); 5913 if (n > maxcount) 5914 n = maxcount; 5915 if (n == 0) 5916 goto nothing; 5917 /* new_size = self->length + n * (str2->length - str1->length)); */ 5918 delta = (str2->length - str1->length); 5919 if (delta == 0) { 5920 new_size = self->length; 5921 } else { 5922 product = n * (str2->length - str1->length); 5923 if ((product / (str2->length - str1->length)) != n) { 5924 PyErr_SetString(PyExc_OverflowError, 5925 "replace string is too long"); 5926 return NULL; 5927 } 5928 new_size = self->length + product; 5929 if (new_size < 0) { 5930 PyErr_SetString(PyExc_OverflowError, 5931 "replace string is too long"); 5932 return NULL; 5933 } 5934 } 5935 u = _PyUnicode_New(new_size); 5936 if (!u) 5937 return NULL; 5938 i = 0; 5939 p = u->str; 5940 e = self->length - str1->length; 5941 if (str1->length > 0) { 5942 while (n-- > 0) { 5943 /* look for next match */ 5944 j = i; 5945 while (j <= e) { 5946 if (Py_UNICODE_MATCH(self, j, str1)) 5947 break; 5948 j++; 5949 } 5950 if (j > i) { 5951 if (j > e) 5952 break; 5953 /* copy unchanged part [i:j] */ 5954 Py_UNICODE_COPY(p, self->str+i, j-i); 5955 p += j - i; 5956 } 5957 /* copy substitution string */ 5958 if (str2->length > 0) { 5959 Py_UNICODE_COPY(p, str2->str, str2->length); 5960 p += str2->length; 5961 } 5962 i = j + str1->length; 5963 } 5964 if (i < self->length) 5965 /* copy tail [i:] */ 5966 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5967 } else { 5968 /* interleave */ 5969 while (n > 0) { 5970 Py_UNICODE_COPY(p, str2->str, str2->length); 5971 p += str2->length; 5972 if (--n <= 0) 5973 break; 5974 *p++ = self->str[i++]; 5975 } 5976 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5977 } 5978 } 5979 return (PyObject *) u; 5980 5981nothing: 5982 /* nothing to replace; return original string (when possible) */ 5983 if (PyUnicode_CheckExact(self)) { 5984 Py_INCREF(self); 5985 return (PyObject *) self; 5986 } 5987 return PyUnicode_FromUnicode(self->str, self->length); 5988} 5989 5990/* --- Unicode Object Methods --------------------------------------------- */ 5991 5992PyDoc_STRVAR(title__doc__, 5993"S.title() -> unicode\n\ 5994\n\ 5995Return a titlecased version of S, i.e. words start with title case\n\ 5996characters, all remaining cased characters have lower case."); 5997 5998static PyObject* 5999unicode_title(PyUnicodeObject *self) 6000{ 6001 return fixup(self, fixtitle); 6002} 6003 6004PyDoc_STRVAR(capitalize__doc__, 6005"S.capitalize() -> unicode\n\ 6006\n\ 6007Return a capitalized version of S, i.e. make the first character\n\ 6008have upper case."); 6009 6010static PyObject* 6011unicode_capitalize(PyUnicodeObject *self) 6012{ 6013 return fixup(self, fixcapitalize); 6014} 6015 6016#if 0 6017PyDoc_STRVAR(capwords__doc__, 6018"S.capwords() -> unicode\n\ 6019\n\ 6020Apply .capitalize() to all words in S and return the result with\n\ 6021normalized whitespace (all whitespace strings are replaced by ' ')."); 6022 6023static PyObject* 6024unicode_capwords(PyUnicodeObject *self) 6025{ 6026 PyObject *list; 6027 PyObject *item; 6028 Py_ssize_t i; 6029 6030 /* Split into words */ 6031 list = split(self, NULL, -1); 6032 if (!list) 6033 return NULL; 6034 6035 /* Capitalize each word */ 6036 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6037 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6038 fixcapitalize); 6039 if (item == NULL) 6040 goto onError; 6041 Py_DECREF(PyList_GET_ITEM(list, i)); 6042 PyList_SET_ITEM(list, i, item); 6043 } 6044 6045 /* Join the words to form a new string */ 6046 item = PyUnicode_Join(NULL, list); 6047 6048onError: 6049 Py_DECREF(list); 6050 return (PyObject *)item; 6051} 6052#endif 6053 6054/* Argument converter. Coerces to a single unicode character */ 6055 6056static int 6057convert_uc(PyObject *obj, void *addr) 6058{ 6059 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6060 PyObject *uniobj; 6061 Py_UNICODE *unistr; 6062 6063 uniobj = PyUnicode_FromObject(obj); 6064 if (uniobj == NULL) { 6065 PyErr_SetString(PyExc_TypeError, 6066 "The fill character cannot be converted to Unicode"); 6067 return 0; 6068 } 6069 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6070 PyErr_SetString(PyExc_TypeError, 6071 "The fill character must be exactly one character long"); 6072 Py_DECREF(uniobj); 6073 return 0; 6074 } 6075 unistr = PyUnicode_AS_UNICODE(uniobj); 6076 *fillcharloc = unistr[0]; 6077 Py_DECREF(uniobj); 6078 return 1; 6079} 6080 6081PyDoc_STRVAR(center__doc__, 6082"S.center(width[, fillchar]) -> unicode\n\ 6083\n\ 6084Return S centered in a Unicode string of length width. Padding is\n\ 6085done using the specified fill character (default is a space)"); 6086 6087static PyObject * 6088unicode_center(PyUnicodeObject *self, PyObject *args) 6089{ 6090 Py_ssize_t marg, left; 6091 Py_ssize_t width; 6092 Py_UNICODE fillchar = ' '; 6093 6094 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6095 return NULL; 6096 6097 if (self->length >= width && PyUnicode_CheckExact(self)) { 6098 Py_INCREF(self); 6099 return (PyObject*) self; 6100 } 6101 6102 marg = width - self->length; 6103 left = marg / 2 + (marg & width & 1); 6104 6105 return (PyObject*) pad(self, left, marg - left, fillchar); 6106} 6107 6108#if 0 6109 6110/* This code should go into some future Unicode collation support 6111 module. The basic comparison should compare ordinals on a naive 6112 basis (this is what Java does and thus JPython too). */ 6113 6114/* speedy UTF-16 code point order comparison */ 6115/* gleaned from: */ 6116/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6117 6118static short utf16Fixup[32] = 6119{ 6120 0, 0, 0, 0, 0, 0, 0, 0, 6121 0, 0, 0, 0, 0, 0, 0, 0, 6122 0, 0, 0, 0, 0, 0, 0, 0, 6123 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6124}; 6125 6126static int 6127unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6128{ 6129 Py_ssize_t len1, len2; 6130 6131 Py_UNICODE *s1 = str1->str; 6132 Py_UNICODE *s2 = str2->str; 6133 6134 len1 = str1->length; 6135 len2 = str2->length; 6136 6137 while (len1 > 0 && len2 > 0) { 6138 Py_UNICODE c1, c2; 6139 6140 c1 = *s1++; 6141 c2 = *s2++; 6142 6143 if (c1 > (1<<11) * 26) 6144 c1 += utf16Fixup[c1>>11]; 6145 if (c2 > (1<<11) * 26) 6146 c2 += utf16Fixup[c2>>11]; 6147 /* now c1 and c2 are in UTF-32-compatible order */ 6148 6149 if (c1 != c2) 6150 return (c1 < c2) ? -1 : 1; 6151 6152 len1--; len2--; 6153 } 6154 6155 return (len1 < len2) ? -1 : (len1 != len2); 6156} 6157 6158#else 6159 6160static int 6161unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6162{ 6163 register Py_ssize_t len1, len2; 6164 6165 Py_UNICODE *s1 = str1->str; 6166 Py_UNICODE *s2 = str2->str; 6167 6168 len1 = str1->length; 6169 len2 = str2->length; 6170 6171 while (len1 > 0 && len2 > 0) { 6172 Py_UNICODE c1, c2; 6173 6174 c1 = *s1++; 6175 c2 = *s2++; 6176 6177 if (c1 != c2) 6178 return (c1 < c2) ? -1 : 1; 6179 6180 len1--; len2--; 6181 } 6182 6183 return (len1 < len2) ? -1 : (len1 != len2); 6184} 6185 6186#endif 6187 6188int PyUnicode_Compare(PyObject *left, 6189 PyObject *right) 6190{ 6191 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6192 return unicode_compare((PyUnicodeObject *)left, 6193 (PyUnicodeObject *)right); 6194 if ((PyString_Check(left) && PyUnicode_Check(right)) || 6195 (PyUnicode_Check(left) && PyString_Check(right))) { 6196 if (PyUnicode_Check(left)) 6197 left = _PyUnicode_AsDefaultEncodedString(left, NULL); 6198 if (PyUnicode_Check(right)) 6199 right = _PyUnicode_AsDefaultEncodedString(right, NULL); 6200 assert(PyString_Check(left)); 6201 assert(PyString_Check(right)); 6202 return PyObject_Compare(left, right); 6203 } 6204 PyErr_Format(PyExc_TypeError, 6205 "Can't compare %.100s and %.100s", 6206 left->ob_type->tp_name, 6207 right->ob_type->tp_name); 6208 return -1; 6209} 6210 6211int 6212PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6213{ 6214 int i; 6215 Py_UNICODE *id; 6216 assert(PyUnicode_Check(uni)); 6217 id = PyUnicode_AS_UNICODE(uni); 6218 /* Compare Unicode string and source character set string */ 6219 for (i = 0; id[i] && str[i]; i++) 6220 if (id[i] != str[i]) 6221 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6222 if (id[i]) 6223 return 1; /* uni is longer */ 6224 if (str[i]) 6225 return -1; /* str is longer */ 6226 return 0; 6227} 6228 6229PyObject *PyUnicode_RichCompare(PyObject *left, 6230 PyObject *right, 6231 int op) 6232{ 6233 int result; 6234 6235 result = PyUnicode_Compare(left, right); 6236 if (result == -1 && PyErr_Occurred()) 6237 goto onError; 6238 6239 /* Convert the return value to a Boolean */ 6240 switch (op) { 6241 case Py_EQ: 6242 result = (result == 0); 6243 break; 6244 case Py_NE: 6245 result = (result != 0); 6246 break; 6247 case Py_LE: 6248 result = (result <= 0); 6249 break; 6250 case Py_GE: 6251 result = (result >= 0); 6252 break; 6253 case Py_LT: 6254 result = (result == -1); 6255 break; 6256 case Py_GT: 6257 result = (result == 1); 6258 break; 6259 } 6260 return PyBool_FromLong(result); 6261 6262 onError: 6263 6264 /* Standard case 6265 6266 Type errors mean that PyUnicode_FromObject() could not convert 6267 one of the arguments (usually the right hand side) to Unicode, 6268 ie. we can't handle the comparison request. However, it is 6269 possible that the other object knows a comparison method, which 6270 is why we return Py_NotImplemented to give the other object a 6271 chance. 6272 6273 */ 6274 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6275 PyErr_Clear(); 6276 Py_INCREF(Py_NotImplemented); 6277 return Py_NotImplemented; 6278 } 6279 if (op != Py_EQ && op != Py_NE) 6280 return NULL; 6281 6282 /* Equality comparison. 6283 6284 This is a special case: we silence any PyExc_UnicodeDecodeError 6285 and instead turn it into a PyErr_UnicodeWarning. 6286 6287 */ 6288 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6289 return NULL; 6290 PyErr_Clear(); 6291 if (PyErr_WarnEx(PyExc_UnicodeWarning, 6292 (op == Py_EQ) ? 6293 "Unicode equal comparison " 6294 "failed to convert both arguments to Unicode - " 6295 "interpreting them as being unequal" 6296 : 6297 "Unicode unequal comparison " 6298 "failed to convert both arguments to Unicode - " 6299 "interpreting them as being unequal", 6300 1) < 0) 6301 return NULL; 6302 result = (op == Py_NE); 6303 return PyBool_FromLong(result); 6304} 6305 6306int PyUnicode_Contains(PyObject *container, 6307 PyObject *element) 6308{ 6309 PyObject *str, *sub; 6310 int result; 6311 6312 /* Coerce the two arguments */ 6313 sub = PyUnicode_FromObject(element); 6314 if (!sub) { 6315 PyErr_Format(PyExc_TypeError, 6316 "'in <string>' requires string as left operand, not %s", 6317 element->ob_type->tp_name); 6318 return -1; 6319 } 6320 6321 str = PyUnicode_FromObject(container); 6322 if (!str) { 6323 Py_DECREF(sub); 6324 return -1; 6325 } 6326 6327 result = stringlib_contains_obj(str, sub); 6328 6329 Py_DECREF(str); 6330 Py_DECREF(sub); 6331 6332 return result; 6333} 6334 6335/* Concat to string or Unicode object giving a new Unicode object. */ 6336 6337PyObject *PyUnicode_Concat(PyObject *left, 6338 PyObject *right) 6339{ 6340 PyUnicodeObject *u = NULL, *v = NULL, *w; 6341 6342 if (PyBytes_Check(left) || PyBytes_Check(right)) 6343 return PyBytes_Concat(left, right); 6344 6345 /* Coerce the two arguments */ 6346 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6347 if (u == NULL) 6348 goto onError; 6349 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6350 if (v == NULL) 6351 goto onError; 6352 6353 /* Shortcuts */ 6354 if (v == unicode_empty) { 6355 Py_DECREF(v); 6356 return (PyObject *)u; 6357 } 6358 if (u == unicode_empty) { 6359 Py_DECREF(u); 6360 return (PyObject *)v; 6361 } 6362 6363 /* Concat the two Unicode strings */ 6364 w = _PyUnicode_New(u->length + v->length); 6365 if (w == NULL) 6366 goto onError; 6367 Py_UNICODE_COPY(w->str, u->str, u->length); 6368 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6369 6370 Py_DECREF(u); 6371 Py_DECREF(v); 6372 return (PyObject *)w; 6373 6374onError: 6375 Py_XDECREF(u); 6376 Py_XDECREF(v); 6377 return NULL; 6378} 6379 6380void 6381PyUnicode_Append(PyObject **pleft, PyObject *right) 6382{ 6383 PyObject *new; 6384 if (*pleft == NULL) 6385 return; 6386 if (right == NULL || !PyUnicode_Check(*pleft)) { 6387 Py_DECREF(*pleft); 6388 *pleft = NULL; 6389 return; 6390 } 6391 new = PyUnicode_Concat(*pleft, right); 6392 Py_DECREF(*pleft); 6393 *pleft = new; 6394} 6395 6396void 6397PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6398{ 6399 PyUnicode_Append(pleft, right); 6400 Py_XDECREF(right); 6401} 6402 6403PyDoc_STRVAR(count__doc__, 6404"S.count(sub[, start[, end]]) -> int\n\ 6405\n\ 6406Return the number of non-overlapping occurrences of substring sub in\n\ 6407Unicode string S[start:end]. Optional arguments start and end are\n\ 6408interpreted as in slice notation."); 6409 6410static PyObject * 6411unicode_count(PyUnicodeObject *self, PyObject *args) 6412{ 6413 PyUnicodeObject *substring; 6414 Py_ssize_t start = 0; 6415 Py_ssize_t end = PY_SSIZE_T_MAX; 6416 PyObject *result; 6417 6418 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6420 return NULL; 6421 6422 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6423 (PyObject *)substring); 6424 if (substring == NULL) 6425 return NULL; 6426 6427 FIX_START_END(self); 6428 6429 result = PyInt_FromSsize_t( 6430 stringlib_count(self->str + start, end - start, 6431 substring->str, substring->length) 6432 ); 6433 6434 Py_DECREF(substring); 6435 6436 return result; 6437} 6438 6439PyDoc_STRVAR(encode__doc__, 6440"S.encode([encoding[,errors]]) -> string or unicode\n\ 6441\n\ 6442Encodes S using the codec registered for encoding. encoding defaults\n\ 6443to the default encoding. errors may be given to set a different error\n\ 6444handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6445a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 6446'xmlcharrefreplace' as well as any other name registered with\n\ 6447codecs.register_error that can handle UnicodeEncodeErrors."); 6448 6449static PyObject * 6450unicode_encode(PyUnicodeObject *self, PyObject *args) 6451{ 6452 char *encoding = NULL; 6453 char *errors = NULL; 6454 PyObject *v; 6455 6456 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6457 return NULL; 6458 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 6459 if (v == NULL) 6460 goto onError; 6461 if (!PyBytes_Check(v)) { 6462 PyErr_Format(PyExc_TypeError, 6463 "encoder did not return a bytes object " 6464 "(type=%.400s)", 6465 Py_Type(v)->tp_name); 6466 Py_DECREF(v); 6467 return NULL; 6468 } 6469 return v; 6470 6471 onError: 6472 return NULL; 6473} 6474 6475PyDoc_STRVAR(expandtabs__doc__, 6476"S.expandtabs([tabsize]) -> unicode\n\ 6477\n\ 6478Return a copy of S where all tab characters are expanded using spaces.\n\ 6479If tabsize is not given, a tab size of 8 characters is assumed."); 6480 6481static PyObject* 6482unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6483{ 6484 Py_UNICODE *e; 6485 Py_UNICODE *p; 6486 Py_UNICODE *q; 6487 Py_ssize_t i, j, old_j; 6488 PyUnicodeObject *u; 6489 int tabsize = 8; 6490 6491 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6492 return NULL; 6493 6494 /* First pass: determine size of output string */ 6495 i = j = old_j = 0; 6496 e = self->str + self->length; 6497 for (p = self->str; p < e; p++) 6498 if (*p == '\t') { 6499 if (tabsize > 0) { 6500 j += tabsize - (j % tabsize); 6501 if (old_j > j) { 6502 PyErr_SetString(PyExc_OverflowError, 6503 "new string is too long"); 6504 return NULL; 6505 } 6506 old_j = j; 6507 } 6508 } 6509 else { 6510 j++; 6511 if (*p == '\n' || *p == '\r') { 6512 i += j; 6513 old_j = j = 0; 6514 if (i < 0) { 6515 PyErr_SetString(PyExc_OverflowError, 6516 "new string is too long"); 6517 return NULL; 6518 } 6519 } 6520 } 6521 6522 if ((i + j) < 0) { 6523 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6524 return NULL; 6525 } 6526 6527 /* Second pass: create output string and fill it */ 6528 u = _PyUnicode_New(i + j); 6529 if (!u) 6530 return NULL; 6531 6532 j = 0; 6533 q = u->str; 6534 6535 for (p = self->str; p < e; p++) 6536 if (*p == '\t') { 6537 if (tabsize > 0) { 6538 i = tabsize - (j % tabsize); 6539 j += i; 6540 while (i--) 6541 *q++ = ' '; 6542 } 6543 } 6544 else { 6545 j++; 6546 *q++ = *p; 6547 if (*p == '\n' || *p == '\r') 6548 j = 0; 6549 } 6550 6551 return (PyObject*) u; 6552} 6553 6554PyDoc_STRVAR(find__doc__, 6555"S.find(sub [,start [,end]]) -> int\n\ 6556\n\ 6557Return the lowest index in S where substring sub is found,\n\ 6558such that sub is contained within s[start:end]. Optional\n\ 6559arguments start and end are interpreted as in slice notation.\n\ 6560\n\ 6561Return -1 on failure."); 6562 6563static PyObject * 6564unicode_find(PyUnicodeObject *self, PyObject *args) 6565{ 6566 PyObject *substring; 6567 Py_ssize_t start = 0; 6568 Py_ssize_t end = PY_SSIZE_T_MAX; 6569 Py_ssize_t result; 6570 6571 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 6572 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6573 return NULL; 6574 substring = PyUnicode_FromObject(substring); 6575 if (!substring) 6576 return NULL; 6577 6578 result = stringlib_find_slice( 6579 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6580 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6581 start, end 6582 ); 6583 6584 Py_DECREF(substring); 6585 6586 return PyInt_FromSsize_t(result); 6587} 6588 6589static PyObject * 6590unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6591{ 6592 if (index < 0 || index >= self->length) { 6593 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6594 return NULL; 6595 } 6596 6597 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6598} 6599 6600/* Believe it or not, this produces the same value for ASCII strings 6601 as string_hash(). */ 6602static long 6603unicode_hash(PyUnicodeObject *self) 6604{ 6605 Py_ssize_t len; 6606 Py_UNICODE *p; 6607 long x; 6608 6609 if (self->hash != -1) 6610 return self->hash; 6611 len = Py_Size(self); 6612 p = self->str; 6613 x = *p << 7; 6614 while (--len >= 0) 6615 x = (1000003*x) ^ *p++; 6616 x ^= Py_Size(self); 6617 if (x == -1) 6618 x = -2; 6619 self->hash = x; 6620 return x; 6621} 6622 6623PyDoc_STRVAR(index__doc__, 6624"S.index(sub [,start [,end]]) -> int\n\ 6625\n\ 6626Like S.find() but raise ValueError when the substring is not found."); 6627 6628static PyObject * 6629unicode_index(PyUnicodeObject *self, PyObject *args) 6630{ 6631 Py_ssize_t result; 6632 PyObject *substring; 6633 Py_ssize_t start = 0; 6634 Py_ssize_t end = PY_SSIZE_T_MAX; 6635 6636 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 6637 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6638 return NULL; 6639 substring = PyUnicode_FromObject(substring); 6640 if (!substring) 6641 return NULL; 6642 6643 result = stringlib_find_slice( 6644 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6645 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6646 start, end 6647 ); 6648 6649 Py_DECREF(substring); 6650 6651 if (result < 0) { 6652 PyErr_SetString(PyExc_ValueError, "substring not found"); 6653 return NULL; 6654 } 6655 6656 return PyInt_FromSsize_t(result); 6657} 6658 6659PyDoc_STRVAR(islower__doc__, 6660"S.islower() -> bool\n\ 6661\n\ 6662Return True if all cased characters in S are lowercase and there is\n\ 6663at least one cased character in S, False otherwise."); 6664 6665static PyObject* 6666unicode_islower(PyUnicodeObject *self) 6667{ 6668 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6669 register const Py_UNICODE *e; 6670 int cased; 6671 6672 /* Shortcut for single character strings */ 6673 if (PyUnicode_GET_SIZE(self) == 1) 6674 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6675 6676 /* Special case for empty strings */ 6677 if (PyUnicode_GET_SIZE(self) == 0) 6678 return PyBool_FromLong(0); 6679 6680 e = p + PyUnicode_GET_SIZE(self); 6681 cased = 0; 6682 for (; p < e; p++) { 6683 register const Py_UNICODE ch = *p; 6684 6685 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6686 return PyBool_FromLong(0); 6687 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6688 cased = 1; 6689 } 6690 return PyBool_FromLong(cased); 6691} 6692 6693PyDoc_STRVAR(isupper__doc__, 6694"S.isupper() -> bool\n\ 6695\n\ 6696Return True if all cased characters in S are uppercase and there is\n\ 6697at least one cased character in S, False otherwise."); 6698 6699static PyObject* 6700unicode_isupper(PyUnicodeObject *self) 6701{ 6702 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6703 register const Py_UNICODE *e; 6704 int cased; 6705 6706 /* Shortcut for single character strings */ 6707 if (PyUnicode_GET_SIZE(self) == 1) 6708 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6709 6710 /* Special case for empty strings */ 6711 if (PyUnicode_GET_SIZE(self) == 0) 6712 return PyBool_FromLong(0); 6713 6714 e = p + PyUnicode_GET_SIZE(self); 6715 cased = 0; 6716 for (; p < e; p++) { 6717 register const Py_UNICODE ch = *p; 6718 6719 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6720 return PyBool_FromLong(0); 6721 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6722 cased = 1; 6723 } 6724 return PyBool_FromLong(cased); 6725} 6726 6727PyDoc_STRVAR(istitle__doc__, 6728"S.istitle() -> bool\n\ 6729\n\ 6730Return True if S is a titlecased string and there is at least one\n\ 6731character in S, i.e. upper- and titlecase characters may only\n\ 6732follow uncased characters and lowercase characters only cased ones.\n\ 6733Return False otherwise."); 6734 6735static PyObject* 6736unicode_istitle(PyUnicodeObject *self) 6737{ 6738 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6739 register const Py_UNICODE *e; 6740 int cased, previous_is_cased; 6741 6742 /* Shortcut for single character strings */ 6743 if (PyUnicode_GET_SIZE(self) == 1) 6744 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6745 (Py_UNICODE_ISUPPER(*p) != 0)); 6746 6747 /* Special case for empty strings */ 6748 if (PyUnicode_GET_SIZE(self) == 0) 6749 return PyBool_FromLong(0); 6750 6751 e = p + PyUnicode_GET_SIZE(self); 6752 cased = 0; 6753 previous_is_cased = 0; 6754 for (; p < e; p++) { 6755 register const Py_UNICODE ch = *p; 6756 6757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6758 if (previous_is_cased) 6759 return PyBool_FromLong(0); 6760 previous_is_cased = 1; 6761 cased = 1; 6762 } 6763 else if (Py_UNICODE_ISLOWER(ch)) { 6764 if (!previous_is_cased) 6765 return PyBool_FromLong(0); 6766 previous_is_cased = 1; 6767 cased = 1; 6768 } 6769 else 6770 previous_is_cased = 0; 6771 } 6772 return PyBool_FromLong(cased); 6773} 6774 6775PyDoc_STRVAR(isspace__doc__, 6776"S.isspace() -> bool\n\ 6777\n\ 6778Return True if all characters in S are whitespace\n\ 6779and there is at least one character in S, False otherwise."); 6780 6781static PyObject* 6782unicode_isspace(PyUnicodeObject *self) 6783{ 6784 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6785 register const Py_UNICODE *e; 6786 6787 /* Shortcut for single character strings */ 6788 if (PyUnicode_GET_SIZE(self) == 1 && 6789 Py_UNICODE_ISSPACE(*p)) 6790 return PyBool_FromLong(1); 6791 6792 /* Special case for empty strings */ 6793 if (PyUnicode_GET_SIZE(self) == 0) 6794 return PyBool_FromLong(0); 6795 6796 e = p + PyUnicode_GET_SIZE(self); 6797 for (; p < e; p++) { 6798 if (!Py_UNICODE_ISSPACE(*p)) 6799 return PyBool_FromLong(0); 6800 } 6801 return PyBool_FromLong(1); 6802} 6803 6804PyDoc_STRVAR(isalpha__doc__, 6805"S.isalpha() -> bool\n\ 6806\n\ 6807Return True if all characters in S are alphabetic\n\ 6808and there is at least one character in S, False otherwise."); 6809 6810static PyObject* 6811unicode_isalpha(PyUnicodeObject *self) 6812{ 6813 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6814 register const Py_UNICODE *e; 6815 6816 /* Shortcut for single character strings */ 6817 if (PyUnicode_GET_SIZE(self) == 1 && 6818 Py_UNICODE_ISALPHA(*p)) 6819 return PyBool_FromLong(1); 6820 6821 /* Special case for empty strings */ 6822 if (PyUnicode_GET_SIZE(self) == 0) 6823 return PyBool_FromLong(0); 6824 6825 e = p + PyUnicode_GET_SIZE(self); 6826 for (; p < e; p++) { 6827 if (!Py_UNICODE_ISALPHA(*p)) 6828 return PyBool_FromLong(0); 6829 } 6830 return PyBool_FromLong(1); 6831} 6832 6833PyDoc_STRVAR(isalnum__doc__, 6834"S.isalnum() -> bool\n\ 6835\n\ 6836Return True if all characters in S are alphanumeric\n\ 6837and there is at least one character in S, False otherwise."); 6838 6839static PyObject* 6840unicode_isalnum(PyUnicodeObject *self) 6841{ 6842 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6843 register const Py_UNICODE *e; 6844 6845 /* Shortcut for single character strings */ 6846 if (PyUnicode_GET_SIZE(self) == 1 && 6847 Py_UNICODE_ISALNUM(*p)) 6848 return PyBool_FromLong(1); 6849 6850 /* Special case for empty strings */ 6851 if (PyUnicode_GET_SIZE(self) == 0) 6852 return PyBool_FromLong(0); 6853 6854 e = p + PyUnicode_GET_SIZE(self); 6855 for (; p < e; p++) { 6856 if (!Py_UNICODE_ISALNUM(*p)) 6857 return PyBool_FromLong(0); 6858 } 6859 return PyBool_FromLong(1); 6860} 6861 6862PyDoc_STRVAR(isdecimal__doc__, 6863"S.isdecimal() -> bool\n\ 6864\n\ 6865Return True if there are only decimal characters in S,\n\ 6866False otherwise."); 6867 6868static PyObject* 6869unicode_isdecimal(PyUnicodeObject *self) 6870{ 6871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6872 register const Py_UNICODE *e; 6873 6874 /* Shortcut for single character strings */ 6875 if (PyUnicode_GET_SIZE(self) == 1 && 6876 Py_UNICODE_ISDECIMAL(*p)) 6877 return PyBool_FromLong(1); 6878 6879 /* Special case for empty strings */ 6880 if (PyUnicode_GET_SIZE(self) == 0) 6881 return PyBool_FromLong(0); 6882 6883 e = p + PyUnicode_GET_SIZE(self); 6884 for (; p < e; p++) { 6885 if (!Py_UNICODE_ISDECIMAL(*p)) 6886 return PyBool_FromLong(0); 6887 } 6888 return PyBool_FromLong(1); 6889} 6890 6891PyDoc_STRVAR(isdigit__doc__, 6892"S.isdigit() -> bool\n\ 6893\n\ 6894Return True if all characters in S are digits\n\ 6895and there is at least one character in S, False otherwise."); 6896 6897static PyObject* 6898unicode_isdigit(PyUnicodeObject *self) 6899{ 6900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6901 register const Py_UNICODE *e; 6902 6903 /* Shortcut for single character strings */ 6904 if (PyUnicode_GET_SIZE(self) == 1 && 6905 Py_UNICODE_ISDIGIT(*p)) 6906 return PyBool_FromLong(1); 6907 6908 /* Special case for empty strings */ 6909 if (PyUnicode_GET_SIZE(self) == 0) 6910 return PyBool_FromLong(0); 6911 6912 e = p + PyUnicode_GET_SIZE(self); 6913 for (; p < e; p++) { 6914 if (!Py_UNICODE_ISDIGIT(*p)) 6915 return PyBool_FromLong(0); 6916 } 6917 return PyBool_FromLong(1); 6918} 6919 6920PyDoc_STRVAR(isnumeric__doc__, 6921"S.isnumeric() -> bool\n\ 6922\n\ 6923Return True if there are only numeric characters in S,\n\ 6924False otherwise."); 6925 6926static PyObject* 6927unicode_isnumeric(PyUnicodeObject *self) 6928{ 6929 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6930 register const Py_UNICODE *e; 6931 6932 /* Shortcut for single character strings */ 6933 if (PyUnicode_GET_SIZE(self) == 1 && 6934 Py_UNICODE_ISNUMERIC(*p)) 6935 return PyBool_FromLong(1); 6936 6937 /* Special case for empty strings */ 6938 if (PyUnicode_GET_SIZE(self) == 0) 6939 return PyBool_FromLong(0); 6940 6941 e = p + PyUnicode_GET_SIZE(self); 6942 for (; p < e; p++) { 6943 if (!Py_UNICODE_ISNUMERIC(*p)) 6944 return PyBool_FromLong(0); 6945 } 6946 return PyBool_FromLong(1); 6947} 6948 6949int 6950PyUnicode_IsIdentifier(PyObject *self) 6951{ 6952 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 6953 register const Py_UNICODE *e; 6954 6955 /* Special case for empty strings */ 6956 if (PyUnicode_GET_SIZE(self) == 0) 6957 return 0; 6958 6959 /* PEP 3131 says that the first character must be in 6960 XID_Start and subsequent characters in XID_Continue, 6961 and for the ASCII range, the 2.x rules apply (i.e 6962 start with letters and underscore, continue with 6963 letters, digits, underscore). However, given the current 6964 definition of XID_Start and XID_Continue, it is sufficient 6965 to check just for these, except that _ must be allowed 6966 as starting an identifier. */ 6967 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 6968 return 0; 6969 6970 e = p + PyUnicode_GET_SIZE(self); 6971 for (p++; p < e; p++) { 6972 if (!_PyUnicode_IsXidContinue(*p)) 6973 return 0; 6974 } 6975 return 1; 6976} 6977 6978PyDoc_STRVAR(isidentifier__doc__, 6979"S.isidentifier() -> bool\n\ 6980\n\ 6981Return True if S is a valid identifier according\n\ 6982to the language definition."); 6983 6984static PyObject* 6985unicode_isidentifier(PyObject *self) 6986{ 6987 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 6988} 6989 6990PyDoc_STRVAR(join__doc__, 6991"S.join(sequence) -> unicode\n\ 6992\n\ 6993Return a string which is the concatenation of the strings in the\n\ 6994sequence. The separator between elements is S."); 6995 6996static PyObject* 6997unicode_join(PyObject *self, PyObject *data) 6998{ 6999 return PyUnicode_Join(self, data); 7000} 7001 7002static Py_ssize_t 7003unicode_length(PyUnicodeObject *self) 7004{ 7005 return self->length; 7006} 7007 7008PyDoc_STRVAR(ljust__doc__, 7009"S.ljust(width[, fillchar]) -> int\n\ 7010\n\ 7011Return S left justified in a Unicode string of length width. Padding is\n\ 7012done using the specified fill character (default is a space)."); 7013 7014static PyObject * 7015unicode_ljust(PyUnicodeObject *self, PyObject *args) 7016{ 7017 Py_ssize_t width; 7018 Py_UNICODE fillchar = ' '; 7019 7020 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7021 return NULL; 7022 7023 if (self->length >= width && PyUnicode_CheckExact(self)) { 7024 Py_INCREF(self); 7025 return (PyObject*) self; 7026 } 7027 7028 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7029} 7030 7031PyDoc_STRVAR(lower__doc__, 7032"S.lower() -> unicode\n\ 7033\n\ 7034Return a copy of the string S converted to lowercase."); 7035 7036static PyObject* 7037unicode_lower(PyUnicodeObject *self) 7038{ 7039 return fixup(self, fixlower); 7040} 7041 7042#define LEFTSTRIP 0 7043#define RIGHTSTRIP 1 7044#define BOTHSTRIP 2 7045 7046/* Arrays indexed by above */ 7047static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7048 7049#define STRIPNAME(i) (stripformat[i]+3) 7050 7051/* externally visible for str.strip(unicode) */ 7052PyObject * 7053_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7054{ 7055 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7056 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7057 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7058 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7059 Py_ssize_t i, j; 7060 7061 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7062 7063 i = 0; 7064 if (striptype != RIGHTSTRIP) { 7065 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7066 i++; 7067 } 7068 } 7069 7070 j = len; 7071 if (striptype != LEFTSTRIP) { 7072 do { 7073 j--; 7074 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7075 j++; 7076 } 7077 7078 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7079 Py_INCREF(self); 7080 return (PyObject*)self; 7081 } 7082 else 7083 return PyUnicode_FromUnicode(s+i, j-i); 7084} 7085 7086 7087static PyObject * 7088do_strip(PyUnicodeObject *self, int striptype) 7089{ 7090 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7091 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7092 7093 i = 0; 7094 if (striptype != RIGHTSTRIP) { 7095 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7096 i++; 7097 } 7098 } 7099 7100 j = len; 7101 if (striptype != LEFTSTRIP) { 7102 do { 7103 j--; 7104 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7105 j++; 7106 } 7107 7108 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7109 Py_INCREF(self); 7110 return (PyObject*)self; 7111 } 7112 else 7113 return PyUnicode_FromUnicode(s+i, j-i); 7114} 7115 7116 7117static PyObject * 7118do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7119{ 7120 PyObject *sep = NULL; 7121 7122 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7123 return NULL; 7124 7125 if (sep != NULL && sep != Py_None) { 7126 if (PyUnicode_Check(sep)) 7127 return _PyUnicode_XStrip(self, striptype, sep); 7128 else if (PyString_Check(sep)) { 7129 PyObject *res; 7130 sep = PyUnicode_FromObject(sep); 7131 if (sep==NULL) 7132 return NULL; 7133 res = _PyUnicode_XStrip(self, striptype, sep); 7134 Py_DECREF(sep); 7135 return res; 7136 } 7137 else { 7138 PyErr_Format(PyExc_TypeError, 7139 "%s arg must be None, unicode or str", 7140 STRIPNAME(striptype)); 7141 return NULL; 7142 } 7143 } 7144 7145 return do_strip(self, striptype); 7146} 7147 7148 7149PyDoc_STRVAR(strip__doc__, 7150"S.strip([chars]) -> unicode\n\ 7151\n\ 7152Return a copy of the string S with leading and trailing\n\ 7153whitespace removed.\n\ 7154If chars is given and not None, remove characters in chars instead.\n\ 7155If chars is a str, it will be converted to unicode before stripping"); 7156 7157static PyObject * 7158unicode_strip(PyUnicodeObject *self, PyObject *args) 7159{ 7160 if (PyTuple_GET_SIZE(args) == 0) 7161 return do_strip(self, BOTHSTRIP); /* Common case */ 7162 else 7163 return do_argstrip(self, BOTHSTRIP, args); 7164} 7165 7166 7167PyDoc_STRVAR(lstrip__doc__, 7168"S.lstrip([chars]) -> unicode\n\ 7169\n\ 7170Return a copy of the string S with leading whitespace removed.\n\ 7171If chars is given and not None, remove characters in chars instead.\n\ 7172If chars is a str, it will be converted to unicode before stripping"); 7173 7174static PyObject * 7175unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7176{ 7177 if (PyTuple_GET_SIZE(args) == 0) 7178 return do_strip(self, LEFTSTRIP); /* Common case */ 7179 else 7180 return do_argstrip(self, LEFTSTRIP, args); 7181} 7182 7183 7184PyDoc_STRVAR(rstrip__doc__, 7185"S.rstrip([chars]) -> unicode\n\ 7186\n\ 7187Return a copy of the string S with trailing whitespace removed.\n\ 7188If chars is given and not None, remove characters in chars instead.\n\ 7189If chars is a str, it will be converted to unicode before stripping"); 7190 7191static PyObject * 7192unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7193{ 7194 if (PyTuple_GET_SIZE(args) == 0) 7195 return do_strip(self, RIGHTSTRIP); /* Common case */ 7196 else 7197 return do_argstrip(self, RIGHTSTRIP, args); 7198} 7199 7200 7201static PyObject* 7202unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7203{ 7204 PyUnicodeObject *u; 7205 Py_UNICODE *p; 7206 Py_ssize_t nchars; 7207 size_t nbytes; 7208 7209 if (len < 0) 7210 len = 0; 7211 7212 if (len == 1 && PyUnicode_CheckExact(str)) { 7213 /* no repeat, return original string */ 7214 Py_INCREF(str); 7215 return (PyObject*) str; 7216 } 7217 7218 /* ensure # of chars needed doesn't overflow int and # of bytes 7219 * needed doesn't overflow size_t 7220 */ 7221 nchars = len * str->length; 7222 if (len && nchars / len != str->length) { 7223 PyErr_SetString(PyExc_OverflowError, 7224 "repeated string is too long"); 7225 return NULL; 7226 } 7227 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7228 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7229 PyErr_SetString(PyExc_OverflowError, 7230 "repeated string is too long"); 7231 return NULL; 7232 } 7233 u = _PyUnicode_New(nchars); 7234 if (!u) 7235 return NULL; 7236 7237 p = u->str; 7238 7239 if (str->length == 1 && len > 0) { 7240 Py_UNICODE_FILL(p, str->str[0], len); 7241 } else { 7242 Py_ssize_t done = 0; /* number of characters copied this far */ 7243 if (done < nchars) { 7244 Py_UNICODE_COPY(p, str->str, str->length); 7245 done = str->length; 7246 } 7247 while (done < nchars) { 7248 int n = (done <= nchars-done) ? done : nchars-done; 7249 Py_UNICODE_COPY(p+done, p, n); 7250 done += n; 7251 } 7252 } 7253 7254 return (PyObject*) u; 7255} 7256 7257PyObject *PyUnicode_Replace(PyObject *obj, 7258 PyObject *subobj, 7259 PyObject *replobj, 7260 Py_ssize_t maxcount) 7261{ 7262 PyObject *self; 7263 PyObject *str1; 7264 PyObject *str2; 7265 PyObject *result; 7266 7267 self = PyUnicode_FromObject(obj); 7268 if (self == NULL) 7269 return NULL; 7270 str1 = PyUnicode_FromObject(subobj); 7271 if (str1 == NULL) { 7272 Py_DECREF(self); 7273 return NULL; 7274 } 7275 str2 = PyUnicode_FromObject(replobj); 7276 if (str2 == NULL) { 7277 Py_DECREF(self); 7278 Py_DECREF(str1); 7279 return NULL; 7280 } 7281 result = replace((PyUnicodeObject *)self, 7282 (PyUnicodeObject *)str1, 7283 (PyUnicodeObject *)str2, 7284 maxcount); 7285 Py_DECREF(self); 7286 Py_DECREF(str1); 7287 Py_DECREF(str2); 7288 return result; 7289} 7290 7291PyDoc_STRVAR(replace__doc__, 7292"S.replace (old, new[, maxsplit]) -> unicode\n\ 7293\n\ 7294Return a copy of S with all occurrences of substring\n\ 7295old replaced by new. If the optional argument maxsplit is\n\ 7296given, only the first maxsplit occurrences are replaced."); 7297 7298static PyObject* 7299unicode_replace(PyUnicodeObject *self, PyObject *args) 7300{ 7301 PyUnicodeObject *str1; 7302 PyUnicodeObject *str2; 7303 Py_ssize_t maxcount = -1; 7304 PyObject *result; 7305 7306 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7307 return NULL; 7308 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7309 if (str1 == NULL) 7310 return NULL; 7311 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7312 if (str2 == NULL) { 7313 Py_DECREF(str1); 7314 return NULL; 7315 } 7316 7317 result = replace(self, str1, str2, maxcount); 7318 7319 Py_DECREF(str1); 7320 Py_DECREF(str2); 7321 return result; 7322} 7323 7324static 7325PyObject *unicode_repr(PyObject *unicode) 7326{ 7327 PyObject *repr; 7328 Py_UNICODE *p; 7329 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7330 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7331 7332 /* XXX(nnorwitz): rather than over-allocating, it would be 7333 better to choose a different scheme. Perhaps scan the 7334 first N-chars of the string and allocate based on that size. 7335 */ 7336 /* Initial allocation is based on the longest-possible unichr 7337 escape. 7338 7339 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7340 unichr, so in this case it's the longest unichr escape. In 7341 narrow (UTF-16) builds this is five chars per source unichr 7342 since there are two unichrs in the surrogate pair, so in narrow 7343 (UTF-16) builds it's not the longest unichr escape. 7344 7345 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7346 so in the narrow (UTF-16) build case it's the longest unichr 7347 escape. 7348 */ 7349 7350 repr = PyUnicode_FromUnicode(NULL, 7351 2 /* quotes */ 7352#ifdef Py_UNICODE_WIDE 7353 + 10*size 7354#else 7355 + 6*size 7356#endif 7357 + 1); 7358 if (repr == NULL) 7359 return NULL; 7360 7361 p = PyUnicode_AS_UNICODE(repr); 7362 7363 /* Add quote */ 7364 *p++ = (findchar(s, size, '\'') && 7365 !findchar(s, size, '"')) ? '"' : '\''; 7366 while (size-- > 0) { 7367 Py_UNICODE ch = *s++; 7368 7369 /* Escape quotes and backslashes */ 7370 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7371 *p++ = '\\'; 7372 *p++ = ch; 7373 continue; 7374 } 7375 7376#ifdef Py_UNICODE_WIDE 7377 /* Map 21-bit characters to '\U00xxxxxx' */ 7378 else if (ch >= 0x10000) { 7379 *p++ = '\\'; 7380 *p++ = 'U'; 7381 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 7382 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 7383 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 7384 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 7385 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 7386 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 7387 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 7388 *p++ = hexdigits[ch & 0x0000000F]; 7389 continue; 7390 } 7391#else 7392 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 7393 else if (ch >= 0xD800 && ch < 0xDC00) { 7394 Py_UNICODE ch2; 7395 Py_UCS4 ucs; 7396 7397 ch2 = *s++; 7398 size--; 7399 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 7400 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 7401 *p++ = '\\'; 7402 *p++ = 'U'; 7403 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7404 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7405 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7406 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7407 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7408 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7409 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7410 *p++ = hexdigits[ucs & 0x0000000F]; 7411 continue; 7412 } 7413 /* Fall through: isolated surrogates are copied as-is */ 7414 s--; 7415 size++; 7416 } 7417#endif 7418 7419 /* Map 16-bit characters to '\uxxxx' */ 7420 if (ch >= 256) { 7421 *p++ = '\\'; 7422 *p++ = 'u'; 7423 *p++ = hexdigits[(ch >> 12) & 0x000F]; 7424 *p++ = hexdigits[(ch >> 8) & 0x000F]; 7425 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7426 *p++ = hexdigits[ch & 0x000F]; 7427 } 7428 7429 /* Map special whitespace to '\t', \n', '\r' */ 7430 else if (ch == '\t') { 7431 *p++ = '\\'; 7432 *p++ = 't'; 7433 } 7434 else if (ch == '\n') { 7435 *p++ = '\\'; 7436 *p++ = 'n'; 7437 } 7438 else if (ch == '\r') { 7439 *p++ = '\\'; 7440 *p++ = 'r'; 7441 } 7442 7443 /* Map non-printable US ASCII to '\xhh' */ 7444 else if (ch < ' ' || ch >= 0x7F) { 7445 *p++ = '\\'; 7446 *p++ = 'x'; 7447 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7448 *p++ = hexdigits[ch & 0x000F]; 7449 } 7450 7451 /* Copy everything else as-is */ 7452 else 7453 *p++ = (char) ch; 7454 } 7455 /* Add quote */ 7456 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7457 7458 *p = '\0'; 7459 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7460 return repr; 7461} 7462 7463PyDoc_STRVAR(rfind__doc__, 7464"S.rfind(sub [,start [,end]]) -> int\n\ 7465\n\ 7466Return the highest index in S where substring sub is found,\n\ 7467such that sub is contained within s[start:end]. Optional\n\ 7468arguments start and end are interpreted as in slice notation.\n\ 7469\n\ 7470Return -1 on failure."); 7471 7472static PyObject * 7473unicode_rfind(PyUnicodeObject *self, PyObject *args) 7474{ 7475 PyObject *substring; 7476 Py_ssize_t start = 0; 7477 Py_ssize_t end = PY_SSIZE_T_MAX; 7478 Py_ssize_t result; 7479 7480 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 7481 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7482 return NULL; 7483 substring = PyUnicode_FromObject(substring); 7484 if (!substring) 7485 return NULL; 7486 7487 result = stringlib_rfind_slice( 7488 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7489 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7490 start, end 7491 ); 7492 7493 Py_DECREF(substring); 7494 7495 return PyInt_FromSsize_t(result); 7496} 7497 7498PyDoc_STRVAR(rindex__doc__, 7499"S.rindex(sub [,start [,end]]) -> int\n\ 7500\n\ 7501Like S.rfind() but raise ValueError when the substring is not found."); 7502 7503static PyObject * 7504unicode_rindex(PyUnicodeObject *self, PyObject *args) 7505{ 7506 PyObject *substring; 7507 Py_ssize_t start = 0; 7508 Py_ssize_t end = PY_SSIZE_T_MAX; 7509 Py_ssize_t result; 7510 7511 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 7512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7513 return NULL; 7514 substring = PyUnicode_FromObject(substring); 7515 if (!substring) 7516 return NULL; 7517 7518 result = stringlib_rfind_slice( 7519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7521 start, end 7522 ); 7523 7524 Py_DECREF(substring); 7525 7526 if (result < 0) { 7527 PyErr_SetString(PyExc_ValueError, "substring not found"); 7528 return NULL; 7529 } 7530 return PyInt_FromSsize_t(result); 7531} 7532 7533PyDoc_STRVAR(rjust__doc__, 7534"S.rjust(width[, fillchar]) -> unicode\n\ 7535\n\ 7536Return S right justified in a Unicode string of length width. Padding is\n\ 7537done using the specified fill character (default is a space)."); 7538 7539static PyObject * 7540unicode_rjust(PyUnicodeObject *self, PyObject *args) 7541{ 7542 Py_ssize_t width; 7543 Py_UNICODE fillchar = ' '; 7544 7545 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7546 return NULL; 7547 7548 if (self->length >= width && PyUnicode_CheckExact(self)) { 7549 Py_INCREF(self); 7550 return (PyObject*) self; 7551 } 7552 7553 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7554} 7555 7556PyObject *PyUnicode_Split(PyObject *s, 7557 PyObject *sep, 7558 Py_ssize_t maxsplit) 7559{ 7560 PyObject *result; 7561 7562 s = PyUnicode_FromObject(s); 7563 if (s == NULL) 7564 return NULL; 7565 if (sep != NULL) { 7566 sep = PyUnicode_FromObject(sep); 7567 if (sep == NULL) { 7568 Py_DECREF(s); 7569 return NULL; 7570 } 7571 } 7572 7573 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7574 7575 Py_DECREF(s); 7576 Py_XDECREF(sep); 7577 return result; 7578} 7579 7580PyDoc_STRVAR(split__doc__, 7581"S.split([sep [,maxsplit]]) -> list of strings\n\ 7582\n\ 7583Return a list of the words in S, using sep as the\n\ 7584delimiter string. If maxsplit is given, at most maxsplit\n\ 7585splits are done. If sep is not specified or is None,\n\ 7586any whitespace string is a separator."); 7587 7588static PyObject* 7589unicode_split(PyUnicodeObject *self, PyObject *args) 7590{ 7591 PyObject *substring = Py_None; 7592 Py_ssize_t maxcount = -1; 7593 7594 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7595 return NULL; 7596 7597 if (substring == Py_None) 7598 return split(self, NULL, maxcount); 7599 else if (PyUnicode_Check(substring)) 7600 return split(self, (PyUnicodeObject *)substring, maxcount); 7601 else 7602 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7603} 7604 7605PyObject * 7606PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7607{ 7608 PyObject* str_obj; 7609 PyObject* sep_obj; 7610 PyObject* out; 7611 7612 str_obj = PyUnicode_FromObject(str_in); 7613 if (!str_obj) 7614 return NULL; 7615 sep_obj = PyUnicode_FromObject(sep_in); 7616 if (!sep_obj) { 7617 Py_DECREF(str_obj); 7618 return NULL; 7619 } 7620 7621 out = stringlib_partition( 7622 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7623 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7624 ); 7625 7626 Py_DECREF(sep_obj); 7627 Py_DECREF(str_obj); 7628 7629 return out; 7630} 7631 7632 7633PyObject * 7634PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7635{ 7636 PyObject* str_obj; 7637 PyObject* sep_obj; 7638 PyObject* out; 7639 7640 str_obj = PyUnicode_FromObject(str_in); 7641 if (!str_obj) 7642 return NULL; 7643 sep_obj = PyUnicode_FromObject(sep_in); 7644 if (!sep_obj) { 7645 Py_DECREF(str_obj); 7646 return NULL; 7647 } 7648 7649 out = stringlib_rpartition( 7650 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7651 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7652 ); 7653 7654 Py_DECREF(sep_obj); 7655 Py_DECREF(str_obj); 7656 7657 return out; 7658} 7659 7660PyDoc_STRVAR(partition__doc__, 7661"S.partition(sep) -> (head, sep, tail)\n\ 7662\n\ 7663Searches for the separator sep in S, and returns the part before it,\n\ 7664the separator itself, and the part after it. If the separator is not\n\ 7665found, returns S and two empty strings."); 7666 7667static PyObject* 7668unicode_partition(PyUnicodeObject *self, PyObject *separator) 7669{ 7670 return PyUnicode_Partition((PyObject *)self, separator); 7671} 7672 7673PyDoc_STRVAR(rpartition__doc__, 7674"S.rpartition(sep) -> (tail, sep, head)\n\ 7675\n\ 7676Searches for the separator sep in S, starting at the end of S, and returns\n\ 7677the part before it, the separator itself, and the part after it. If the\n\ 7678separator is not found, returns two empty strings and S."); 7679 7680static PyObject* 7681unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7682{ 7683 return PyUnicode_RPartition((PyObject *)self, separator); 7684} 7685 7686PyObject *PyUnicode_RSplit(PyObject *s, 7687 PyObject *sep, 7688 Py_ssize_t maxsplit) 7689{ 7690 PyObject *result; 7691 7692 s = PyUnicode_FromObject(s); 7693 if (s == NULL) 7694 return NULL; 7695 if (sep != NULL) { 7696 sep = PyUnicode_FromObject(sep); 7697 if (sep == NULL) { 7698 Py_DECREF(s); 7699 return NULL; 7700 } 7701 } 7702 7703 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7704 7705 Py_DECREF(s); 7706 Py_XDECREF(sep); 7707 return result; 7708} 7709 7710PyDoc_STRVAR(rsplit__doc__, 7711"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 7712\n\ 7713Return a list of the words in S, using sep as the\n\ 7714delimiter string, starting at the end of the string and\n\ 7715working to the front. If maxsplit is given, at most maxsplit\n\ 7716splits are done. If sep is not specified, any whitespace string\n\ 7717is a separator."); 7718 7719static PyObject* 7720unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7721{ 7722 PyObject *substring = Py_None; 7723 Py_ssize_t maxcount = -1; 7724 7725 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7726 return NULL; 7727 7728 if (substring == Py_None) 7729 return rsplit(self, NULL, maxcount); 7730 else if (PyUnicode_Check(substring)) 7731 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7732 else 7733 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7734} 7735 7736PyDoc_STRVAR(splitlines__doc__, 7737"S.splitlines([keepends]]) -> list of strings\n\ 7738\n\ 7739Return a list of the lines in S, breaking at line boundaries.\n\ 7740Line breaks are not included in the resulting list unless keepends\n\ 7741is given and true."); 7742 7743static PyObject* 7744unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7745{ 7746 int keepends = 0; 7747 7748 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7749 return NULL; 7750 7751 return PyUnicode_Splitlines((PyObject *)self, keepends); 7752} 7753 7754static 7755PyObject *unicode_str(PyObject *self) 7756{ 7757 if (PyUnicode_CheckExact(self)) { 7758 Py_INCREF(self); 7759 return self; 7760 } else 7761 /* Subtype -- return genuine unicode string with the same value. */ 7762 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 7763 PyUnicode_GET_SIZE(self)); 7764} 7765 7766PyDoc_STRVAR(swapcase__doc__, 7767"S.swapcase() -> unicode\n\ 7768\n\ 7769Return a copy of S with uppercase characters converted to lowercase\n\ 7770and vice versa."); 7771 7772static PyObject* 7773unicode_swapcase(PyUnicodeObject *self) 7774{ 7775 return fixup(self, fixswapcase); 7776} 7777 7778PyDoc_STRVAR(translate__doc__, 7779"S.translate(table) -> unicode\n\ 7780\n\ 7781Return a copy of the string S, where all characters have been mapped\n\ 7782through the given translation table, which must be a mapping of\n\ 7783Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 7784Unmapped characters are left untouched. Characters mapped to None\n\ 7785are deleted."); 7786 7787static PyObject* 7788unicode_translate(PyUnicodeObject *self, PyObject *table) 7789{ 7790 return PyUnicode_TranslateCharmap(self->str, 7791 self->length, 7792 table, 7793 "ignore"); 7794} 7795 7796PyDoc_STRVAR(upper__doc__, 7797"S.upper() -> unicode\n\ 7798\n\ 7799Return a copy of S converted to uppercase."); 7800 7801static PyObject* 7802unicode_upper(PyUnicodeObject *self) 7803{ 7804 return fixup(self, fixupper); 7805} 7806 7807PyDoc_STRVAR(zfill__doc__, 7808"S.zfill(width) -> unicode\n\ 7809\n\ 7810Pad a numeric string x with zeros on the left, to fill a field\n\ 7811of the specified width. The string x is never truncated."); 7812 7813static PyObject * 7814unicode_zfill(PyUnicodeObject *self, PyObject *args) 7815{ 7816 Py_ssize_t fill; 7817 PyUnicodeObject *u; 7818 7819 Py_ssize_t width; 7820 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 7821 return NULL; 7822 7823 if (self->length >= width) { 7824 if (PyUnicode_CheckExact(self)) { 7825 Py_INCREF(self); 7826 return (PyObject*) self; 7827 } 7828 else 7829 return PyUnicode_FromUnicode( 7830 PyUnicode_AS_UNICODE(self), 7831 PyUnicode_GET_SIZE(self) 7832 ); 7833 } 7834 7835 fill = width - self->length; 7836 7837 u = pad(self, fill, 0, '0'); 7838 7839 if (u == NULL) 7840 return NULL; 7841 7842 if (u->str[fill] == '+' || u->str[fill] == '-') { 7843 /* move sign to beginning of string */ 7844 u->str[0] = u->str[fill]; 7845 u->str[fill] = '0'; 7846 } 7847 7848 return (PyObject*) u; 7849} 7850 7851#if 0 7852static PyObject* 7853unicode_freelistsize(PyUnicodeObject *self) 7854{ 7855 return PyInt_FromLong(unicode_freelist_size); 7856} 7857#endif 7858 7859PyDoc_STRVAR(startswith__doc__, 7860"S.startswith(prefix[, start[, end]]) -> bool\n\ 7861\n\ 7862Return True if S starts with the specified prefix, False otherwise.\n\ 7863With optional start, test S beginning at that position.\n\ 7864With optional end, stop comparing S at that position.\n\ 7865prefix can also be a tuple of strings to try."); 7866 7867static PyObject * 7868unicode_startswith(PyUnicodeObject *self, 7869 PyObject *args) 7870{ 7871 PyObject *subobj; 7872 PyUnicodeObject *substring; 7873 Py_ssize_t start = 0; 7874 Py_ssize_t end = PY_SSIZE_T_MAX; 7875 int result; 7876 7877 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 7878 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7879 return NULL; 7880 if (PyTuple_Check(subobj)) { 7881 Py_ssize_t i; 7882 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7883 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7884 PyTuple_GET_ITEM(subobj, i)); 7885 if (substring == NULL) 7886 return NULL; 7887 result = tailmatch(self, substring, start, end, -1); 7888 Py_DECREF(substring); 7889 if (result) { 7890 Py_RETURN_TRUE; 7891 } 7892 } 7893 /* nothing matched */ 7894 Py_RETURN_FALSE; 7895 } 7896 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7897 if (substring == NULL) 7898 return NULL; 7899 result = tailmatch(self, substring, start, end, -1); 7900 Py_DECREF(substring); 7901 return PyBool_FromLong(result); 7902} 7903 7904 7905PyDoc_STRVAR(endswith__doc__, 7906"S.endswith(suffix[, start[, end]]) -> bool\n\ 7907\n\ 7908Return True if S ends with the specified suffix, False otherwise.\n\ 7909With optional start, test S beginning at that position.\n\ 7910With optional end, stop comparing S at that position.\n\ 7911suffix can also be a tuple of strings to try."); 7912 7913static PyObject * 7914unicode_endswith(PyUnicodeObject *self, 7915 PyObject *args) 7916{ 7917 PyObject *subobj; 7918 PyUnicodeObject *substring; 7919 Py_ssize_t start = 0; 7920 Py_ssize_t end = PY_SSIZE_T_MAX; 7921 int result; 7922 7923 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 7924 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7925 return NULL; 7926 if (PyTuple_Check(subobj)) { 7927 Py_ssize_t i; 7928 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7929 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7930 PyTuple_GET_ITEM(subobj, i)); 7931 if (substring == NULL) 7932 return NULL; 7933 result = tailmatch(self, substring, start, end, +1); 7934 Py_DECREF(substring); 7935 if (result) { 7936 Py_RETURN_TRUE; 7937 } 7938 } 7939 Py_RETURN_FALSE; 7940 } 7941 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7942 if (substring == NULL) 7943 return NULL; 7944 7945 result = tailmatch(self, substring, start, end, +1); 7946 Py_DECREF(substring); 7947 return PyBool_FromLong(result); 7948} 7949 7950#include "stringlib/string_format.h" 7951 7952PyDoc_STRVAR(format__doc__, 7953"S.format(*args, **kwargs) -> unicode\n\ 7954\n\ 7955"); 7956 7957PyDoc_STRVAR(p_format__doc__, 7958"S.__format__(format_spec) -> unicode\n\ 7959\n\ 7960"); 7961 7962static PyObject * 7963unicode_getnewargs(PyUnicodeObject *v) 7964{ 7965 return Py_BuildValue("(u#)", v->str, v->length); 7966} 7967 7968 7969static PyMethodDef unicode_methods[] = { 7970 7971 /* Order is according to common usage: often used methods should 7972 appear first, since lookup is done sequentially. */ 7973 7974 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 7975 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7976 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 7977 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 7978 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 7979 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 7980 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 7981 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 7982 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 7983 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 7984 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 7985 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 7986 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 7987 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 7988 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7989 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 7990 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 7991 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 7992 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 7993 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 7994 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 7995 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 7996 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 7997 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 7998 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 7999 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8000 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8001 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8002 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8003 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8004 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8005 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8006 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8007 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8008 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8009 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8010 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8011 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8012 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8013 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8014 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__}, 8015 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8016 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8017#if 0 8018 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8019#endif 8020 8021#if 0 8022 /* This one is just used for debugging the implementation. */ 8023 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8024#endif 8025 8026 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8027 {NULL, NULL} 8028}; 8029 8030static PyObject * 8031unicode_mod(PyObject *v, PyObject *w) 8032{ 8033 if (!PyUnicode_Check(v)) { 8034 Py_INCREF(Py_NotImplemented); 8035 return Py_NotImplemented; 8036 } 8037 return PyUnicode_Format(v, w); 8038} 8039 8040static PyNumberMethods unicode_as_number = { 8041 0, /*nb_add*/ 8042 0, /*nb_subtract*/ 8043 0, /*nb_multiply*/ 8044 unicode_mod, /*nb_remainder*/ 8045}; 8046 8047static PySequenceMethods unicode_as_sequence = { 8048 (lenfunc) unicode_length, /* sq_length */ 8049 PyUnicode_Concat, /* sq_concat */ 8050 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8051 (ssizeargfunc) unicode_getitem, /* sq_item */ 8052 0, /* sq_slice */ 8053 0, /* sq_ass_item */ 8054 0, /* sq_ass_slice */ 8055 PyUnicode_Contains, /* sq_contains */ 8056}; 8057 8058static PyObject* 8059unicode_subscript(PyUnicodeObject* self, PyObject* item) 8060{ 8061 if (PyIndex_Check(item)) { 8062 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8063 if (i == -1 && PyErr_Occurred()) 8064 return NULL; 8065 if (i < 0) 8066 i += PyUnicode_GET_SIZE(self); 8067 return unicode_getitem(self, i); 8068 } else if (PySlice_Check(item)) { 8069 Py_ssize_t start, stop, step, slicelength, cur, i; 8070 Py_UNICODE* source_buf; 8071 Py_UNICODE* result_buf; 8072 PyObject* result; 8073 8074 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8075 &start, &stop, &step, &slicelength) < 0) { 8076 return NULL; 8077 } 8078 8079 if (slicelength <= 0) { 8080 return PyUnicode_FromUnicode(NULL, 0); 8081 } else if (start == 0 && step == 1 && slicelength == self->length && 8082 PyUnicode_CheckExact(self)) { 8083 Py_INCREF(self); 8084 return (PyObject *)self; 8085 } else if (step == 1) { 8086 return PyUnicode_FromUnicode(self->str + start, slicelength); 8087 } else { 8088 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8089 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* 8090 sizeof(Py_UNICODE)); 8091 8092 if (result_buf == NULL) 8093 return PyErr_NoMemory(); 8094 8095 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8096 result_buf[i] = source_buf[cur]; 8097 } 8098 8099 result = PyUnicode_FromUnicode(result_buf, slicelength); 8100 PyMem_FREE(result_buf); 8101 return result; 8102 } 8103 } else { 8104 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8105 return NULL; 8106 } 8107} 8108 8109static PyMappingMethods unicode_as_mapping = { 8110 (lenfunc)unicode_length, /* mp_length */ 8111 (binaryfunc)unicode_subscript, /* mp_subscript */ 8112 (objobjargproc)0, /* mp_ass_subscript */ 8113}; 8114 8115 8116/* Helpers for PyUnicode_Format() */ 8117 8118static PyObject * 8119getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8120{ 8121 Py_ssize_t argidx = *p_argidx; 8122 if (argidx < arglen) { 8123 (*p_argidx)++; 8124 if (arglen < 0) 8125 return args; 8126 else 8127 return PyTuple_GetItem(args, argidx); 8128 } 8129 PyErr_SetString(PyExc_TypeError, 8130 "not enough arguments for format string"); 8131 return NULL; 8132} 8133 8134#define F_LJUST (1<<0) 8135#define F_SIGN (1<<1) 8136#define F_BLANK (1<<2) 8137#define F_ALT (1<<3) 8138#define F_ZERO (1<<4) 8139 8140static Py_ssize_t 8141strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8142{ 8143 register Py_ssize_t i; 8144 Py_ssize_t len = strlen(charbuffer); 8145 for (i = len - 1; i >= 0; i--) 8146 buffer[i] = (Py_UNICODE) charbuffer[i]; 8147 8148 return len; 8149} 8150 8151static int 8152doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8153{ 8154 Py_ssize_t result; 8155 8156 PyOS_ascii_formatd((char *)buffer, len, format, x); 8157 result = strtounicode(buffer, (char *)buffer); 8158 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8159} 8160 8161static int 8162longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8163{ 8164 Py_ssize_t result; 8165 8166 PyOS_snprintf((char *)buffer, len, format, x); 8167 result = strtounicode(buffer, (char *)buffer); 8168 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8169} 8170 8171/* XXX To save some code duplication, formatfloat/long/int could have been 8172 shared with stringobject.c, converting from 8-bit to Unicode after the 8173 formatting is done. */ 8174 8175static int 8176formatfloat(Py_UNICODE *buf, 8177 size_t buflen, 8178 int flags, 8179 int prec, 8180 int type, 8181 PyObject *v) 8182{ 8183 /* fmt = '%#.' + `prec` + `type` 8184 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8185 char fmt[20]; 8186 double x; 8187 8188 x = PyFloat_AsDouble(v); 8189 if (x == -1.0 && PyErr_Occurred()) 8190 return -1; 8191 if (prec < 0) 8192 prec = 6; 8193 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8194 type = 'g'; 8195 /* Worst case length calc to ensure no buffer overrun: 8196 8197 'g' formats: 8198 fmt = %#.<prec>g 8199 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8200 for any double rep.) 8201 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8202 8203 'f' formats: 8204 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8205 len = 1 + 50 + 1 + prec = 52 + prec 8206 8207 If prec=0 the effective precision is 1 (the leading digit is 8208 always given), therefore increase the length by one. 8209 8210 */ 8211 if (((type == 'g' || type == 'G') && 8212 buflen <= (size_t)10 + (size_t)prec) || 8213 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8214 PyErr_SetString(PyExc_OverflowError, 8215 "formatted float is too long (precision too large?)"); 8216 return -1; 8217 } 8218 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8219 (flags&F_ALT) ? "#" : "", 8220 prec, type); 8221 return doubletounicode(buf, buflen, fmt, x); 8222} 8223 8224static PyObject* 8225formatlong(PyObject *val, int flags, int prec, int type) 8226{ 8227 char *buf; 8228 int len; 8229 PyObject *str; /* temporary string object. */ 8230 PyObject *result; 8231 8232 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 8233 if (!str) 8234 return NULL; 8235 result = PyUnicode_FromStringAndSize(buf, len); 8236 Py_DECREF(str); 8237 return result; 8238} 8239 8240static int 8241formatint(Py_UNICODE *buf, 8242 size_t buflen, 8243 int flags, 8244 int prec, 8245 int type, 8246 PyObject *v) 8247{ 8248 /* fmt = '%#.' + `prec` + 'l' + `type` 8249 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8250 * + 1 + 1 8251 * = 24 8252 */ 8253 char fmt[64]; /* plenty big enough! */ 8254 char *sign; 8255 long x; 8256 8257 x = PyInt_AsLong(v); 8258 if (x == -1 && PyErr_Occurred()) 8259 return -1; 8260 if (x < 0 && type == 'u') { 8261 type = 'd'; 8262 } 8263 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8264 sign = "-"; 8265 else 8266 sign = ""; 8267 if (prec < 0) 8268 prec = 1; 8269 8270 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8271 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8272 */ 8273 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8274 PyErr_SetString(PyExc_OverflowError, 8275 "formatted integer is too long (precision too large?)"); 8276 return -1; 8277 } 8278 8279 if ((flags & F_ALT) && 8280 (type == 'x' || type == 'X' || type == 'o')) { 8281 /* When converting under %#o, %#x or %#X, there are a number 8282 * of issues that cause pain: 8283 * - for %#o, we want a different base marker than C 8284 * - when 0 is being converted, the C standard leaves off 8285 * the '0x' or '0X', which is inconsistent with other 8286 * %#x/%#X conversions and inconsistent with Python's 8287 * hex() function 8288 * - there are platforms that violate the standard and 8289 * convert 0 with the '0x' or '0X' 8290 * (Metrowerks, Compaq Tru64) 8291 * - there are platforms that give '0x' when converting 8292 * under %#X, but convert 0 in accordance with the 8293 * standard (OS/2 EMX) 8294 * 8295 * We can achieve the desired consistency by inserting our 8296 * own '0x' or '0X' prefix, and substituting %x/%X in place 8297 * of %#x/%#X. 8298 * 8299 * Note that this is the same approach as used in 8300 * formatint() in stringobject.c 8301 */ 8302 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8303 sign, type, prec, type); 8304 } 8305 else { 8306 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8307 sign, (flags&F_ALT) ? "#" : "", 8308 prec, type); 8309 } 8310 if (sign[0]) 8311 return longtounicode(buf, buflen, fmt, -x); 8312 else 8313 return longtounicode(buf, buflen, fmt, x); 8314} 8315 8316static int 8317formatchar(Py_UNICODE *buf, 8318 size_t buflen, 8319 PyObject *v) 8320{ 8321 /* presume that the buffer is at least 2 characters long */ 8322 if (PyUnicode_Check(v)) { 8323 if (PyUnicode_GET_SIZE(v) != 1) 8324 goto onError; 8325 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8326 } 8327 8328 else if (PyString_Check(v)) { 8329 if (PyString_GET_SIZE(v) != 1) 8330 goto onError; 8331 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 8332 } 8333 8334 else { 8335 /* Integer input truncated to a character */ 8336 long x; 8337 x = PyInt_AsLong(v); 8338 if (x == -1 && PyErr_Occurred()) 8339 goto onError; 8340#ifdef Py_UNICODE_WIDE 8341 if (x < 0 || x > 0x10ffff) { 8342 PyErr_SetString(PyExc_OverflowError, 8343 "%c arg not in range(0x110000) " 8344 "(wide Python build)"); 8345 return -1; 8346 } 8347#else 8348 if (x < 0 || x > 0xffff) { 8349 PyErr_SetString(PyExc_OverflowError, 8350 "%c arg not in range(0x10000) " 8351 "(narrow Python build)"); 8352 return -1; 8353 } 8354#endif 8355 buf[0] = (Py_UNICODE) x; 8356 } 8357 buf[1] = '\0'; 8358 return 1; 8359 8360 onError: 8361 PyErr_SetString(PyExc_TypeError, 8362 "%c requires int or char"); 8363 return -1; 8364} 8365 8366/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8367 8368 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8369 chars are formatted. XXX This is a magic number. Each formatting 8370 routine does bounds checking to ensure no overflow, but a better 8371 solution may be to malloc a buffer of appropriate size for each 8372 format. For now, the current solution is sufficient. 8373*/ 8374#define FORMATBUFLEN (size_t)120 8375 8376PyObject *PyUnicode_Format(PyObject *format, 8377 PyObject *args) 8378{ 8379 Py_UNICODE *fmt, *res; 8380 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8381 int args_owned = 0; 8382 PyUnicodeObject *result = NULL; 8383 PyObject *dict = NULL; 8384 PyObject *uformat; 8385 8386 if (format == NULL || args == NULL) { 8387 PyErr_BadInternalCall(); 8388 return NULL; 8389 } 8390 uformat = PyUnicode_FromObject(format); 8391 if (uformat == NULL) 8392 return NULL; 8393 fmt = PyUnicode_AS_UNICODE(uformat); 8394 fmtcnt = PyUnicode_GET_SIZE(uformat); 8395 8396 reslen = rescnt = fmtcnt + 100; 8397 result = _PyUnicode_New(reslen); 8398 if (result == NULL) 8399 goto onError; 8400 res = PyUnicode_AS_UNICODE(result); 8401 8402 if (PyTuple_Check(args)) { 8403 arglen = PyTuple_Size(args); 8404 argidx = 0; 8405 } 8406 else { 8407 arglen = -1; 8408 argidx = -2; 8409 } 8410 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) && 8411 !PyObject_TypeCheck(args, &PyBaseString_Type)) 8412 dict = args; 8413 8414 while (--fmtcnt >= 0) { 8415 if (*fmt != '%') { 8416 if (--rescnt < 0) { 8417 rescnt = fmtcnt + 100; 8418 reslen += rescnt; 8419 if (_PyUnicode_Resize(&result, reslen) < 0) 8420 goto onError; 8421 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8422 --rescnt; 8423 } 8424 *res++ = *fmt++; 8425 } 8426 else { 8427 /* Got a format specifier */ 8428 int flags = 0; 8429 Py_ssize_t width = -1; 8430 int prec = -1; 8431 Py_UNICODE c = '\0'; 8432 Py_UNICODE fill; 8433 PyObject *v = NULL; 8434 PyObject *temp = NULL; 8435 Py_UNICODE *pbuf; 8436 Py_UNICODE sign; 8437 Py_ssize_t len; 8438 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8439 8440 fmt++; 8441 if (*fmt == '(') { 8442 Py_UNICODE *keystart; 8443 Py_ssize_t keylen; 8444 PyObject *key; 8445 int pcount = 1; 8446 8447 if (dict == NULL) { 8448 PyErr_SetString(PyExc_TypeError, 8449 "format requires a mapping"); 8450 goto onError; 8451 } 8452 ++fmt; 8453 --fmtcnt; 8454 keystart = fmt; 8455 /* Skip over balanced parentheses */ 8456 while (pcount > 0 && --fmtcnt >= 0) { 8457 if (*fmt == ')') 8458 --pcount; 8459 else if (*fmt == '(') 8460 ++pcount; 8461 fmt++; 8462 } 8463 keylen = fmt - keystart - 1; 8464 if (fmtcnt < 0 || pcount > 0) { 8465 PyErr_SetString(PyExc_ValueError, 8466 "incomplete format key"); 8467 goto onError; 8468 } 8469#if 0 8470 /* keys are converted to strings using UTF-8 and 8471 then looked up since Python uses strings to hold 8472 variables names etc. in its namespaces and we 8473 wouldn't want to break common idioms. */ 8474 key = PyUnicode_EncodeUTF8(keystart, 8475 keylen, 8476 NULL); 8477#else 8478 key = PyUnicode_FromUnicode(keystart, keylen); 8479#endif 8480 if (key == NULL) 8481 goto onError; 8482 if (args_owned) { 8483 Py_DECREF(args); 8484 args_owned = 0; 8485 } 8486 args = PyObject_GetItem(dict, key); 8487 Py_DECREF(key); 8488 if (args == NULL) { 8489 goto onError; 8490 } 8491 args_owned = 1; 8492 arglen = -1; 8493 argidx = -2; 8494 } 8495 while (--fmtcnt >= 0) { 8496 switch (c = *fmt++) { 8497 case '-': flags |= F_LJUST; continue; 8498 case '+': flags |= F_SIGN; continue; 8499 case ' ': flags |= F_BLANK; continue; 8500 case '#': flags |= F_ALT; continue; 8501 case '0': flags |= F_ZERO; continue; 8502 } 8503 break; 8504 } 8505 if (c == '*') { 8506 v = getnextarg(args, arglen, &argidx); 8507 if (v == NULL) 8508 goto onError; 8509 if (!PyInt_Check(v)) { 8510 PyErr_SetString(PyExc_TypeError, 8511 "* wants int"); 8512 goto onError; 8513 } 8514 width = PyInt_AsLong(v); 8515 if (width == -1 && PyErr_Occurred()) 8516 goto onError; 8517 if (width < 0) { 8518 flags |= F_LJUST; 8519 width = -width; 8520 } 8521 if (--fmtcnt >= 0) 8522 c = *fmt++; 8523 } 8524 else if (c >= '0' && c <= '9') { 8525 width = c - '0'; 8526 while (--fmtcnt >= 0) { 8527 c = *fmt++; 8528 if (c < '0' || c > '9') 8529 break; 8530 if ((width*10) / 10 != width) { 8531 PyErr_SetString(PyExc_ValueError, 8532 "width too big"); 8533 goto onError; 8534 } 8535 width = width*10 + (c - '0'); 8536 } 8537 } 8538 if (c == '.') { 8539 prec = 0; 8540 if (--fmtcnt >= 0) 8541 c = *fmt++; 8542 if (c == '*') { 8543 v = getnextarg(args, arglen, &argidx); 8544 if (v == NULL) 8545 goto onError; 8546 if (!PyInt_Check(v)) { 8547 PyErr_SetString(PyExc_TypeError, 8548 "* wants int"); 8549 goto onError; 8550 } 8551 prec = PyInt_AsLong(v); 8552 if (prec == -1 && PyErr_Occurred()) 8553 goto onError; 8554 if (prec < 0) 8555 prec = 0; 8556 if (--fmtcnt >= 0) 8557 c = *fmt++; 8558 } 8559 else if (c >= '0' && c <= '9') { 8560 prec = c - '0'; 8561 while (--fmtcnt >= 0) { 8562 c = Py_CHARMASK(*fmt++); 8563 if (c < '0' || c > '9') 8564 break; 8565 if ((prec*10) / 10 != prec) { 8566 PyErr_SetString(PyExc_ValueError, 8567 "prec too big"); 8568 goto onError; 8569 } 8570 prec = prec*10 + (c - '0'); 8571 } 8572 } 8573 } /* prec */ 8574 if (fmtcnt >= 0) { 8575 if (c == 'h' || c == 'l' || c == 'L') { 8576 if (--fmtcnt >= 0) 8577 c = *fmt++; 8578 } 8579 } 8580 if (fmtcnt < 0) { 8581 PyErr_SetString(PyExc_ValueError, 8582 "incomplete format"); 8583 goto onError; 8584 } 8585 if (c != '%') { 8586 v = getnextarg(args, arglen, &argidx); 8587 if (v == NULL) 8588 goto onError; 8589 } 8590 sign = 0; 8591 fill = ' '; 8592 switch (c) { 8593 8594 case '%': 8595 pbuf = formatbuf; 8596 /* presume that buffer length is at least 1 */ 8597 pbuf[0] = '%'; 8598 len = 1; 8599 break; 8600 8601 case 's': 8602 case 'r': 8603 if (PyUnicode_Check(v) && c == 's') { 8604 temp = v; 8605 Py_INCREF(temp); 8606 } 8607 else { 8608 PyObject *unicode; 8609 if (c == 's') 8610 temp = PyObject_Unicode(v); 8611 else 8612 temp = PyObject_Repr(v); 8613 if (temp == NULL) 8614 goto onError; 8615 if (PyUnicode_Check(temp)) 8616 /* nothing to do */; 8617 else if (PyString_Check(temp)) { 8618 /* convert to string to Unicode */ 8619 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 8620 PyString_GET_SIZE(temp), 8621 NULL, 8622 "strict"); 8623 Py_DECREF(temp); 8624 temp = unicode; 8625 if (temp == NULL) 8626 goto onError; 8627 } 8628 else { 8629 Py_DECREF(temp); 8630 PyErr_SetString(PyExc_TypeError, 8631 "%s argument has non-string str()"); 8632 goto onError; 8633 } 8634 } 8635 pbuf = PyUnicode_AS_UNICODE(temp); 8636 len = PyUnicode_GET_SIZE(temp); 8637 if (prec >= 0 && len > prec) 8638 len = prec; 8639 break; 8640 8641 case 'i': 8642 case 'd': 8643 case 'u': 8644 case 'o': 8645 case 'x': 8646 case 'X': 8647 if (c == 'i') 8648 c = 'd'; 8649 if (PyLong_Check(v)) { 8650 temp = formatlong(v, flags, prec, c); 8651 if (!temp) 8652 goto onError; 8653 pbuf = PyUnicode_AS_UNICODE(temp); 8654 len = PyUnicode_GET_SIZE(temp); 8655 sign = 1; 8656 } 8657 else { 8658 pbuf = formatbuf; 8659 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8660 flags, prec, c, v); 8661 if (len < 0) 8662 goto onError; 8663 sign = 1; 8664 } 8665 if (flags & F_ZERO) 8666 fill = '0'; 8667 break; 8668 8669 case 'e': 8670 case 'E': 8671 case 'f': 8672 case 'F': 8673 case 'g': 8674 case 'G': 8675 if (c == 'F') 8676 c = 'f'; 8677 pbuf = formatbuf; 8678 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8679 flags, prec, c, v); 8680 if (len < 0) 8681 goto onError; 8682 sign = 1; 8683 if (flags & F_ZERO) 8684 fill = '0'; 8685 break; 8686 8687 case 'c': 8688 pbuf = formatbuf; 8689 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 8690 if (len < 0) 8691 goto onError; 8692 break; 8693 8694 default: 8695 PyErr_Format(PyExc_ValueError, 8696 "unsupported format character '%c' (0x%x) " 8697 "at index %zd", 8698 (31<=c && c<=126) ? (char)c : '?', 8699 (int)c, 8700 (Py_ssize_t)(fmt - 1 - 8701 PyUnicode_AS_UNICODE(uformat))); 8702 goto onError; 8703 } 8704 if (sign) { 8705 if (*pbuf == '-' || *pbuf == '+') { 8706 sign = *pbuf++; 8707 len--; 8708 } 8709 else if (flags & F_SIGN) 8710 sign = '+'; 8711 else if (flags & F_BLANK) 8712 sign = ' '; 8713 else 8714 sign = 0; 8715 } 8716 if (width < len) 8717 width = len; 8718 if (rescnt - (sign != 0) < width) { 8719 reslen -= rescnt; 8720 rescnt = width + fmtcnt + 100; 8721 reslen += rescnt; 8722 if (reslen < 0) { 8723 Py_XDECREF(temp); 8724 PyErr_NoMemory(); 8725 goto onError; 8726 } 8727 if (_PyUnicode_Resize(&result, reslen) < 0) { 8728 Py_XDECREF(temp); 8729 goto onError; 8730 } 8731 res = PyUnicode_AS_UNICODE(result) 8732 + reslen - rescnt; 8733 } 8734 if (sign) { 8735 if (fill != ' ') 8736 *res++ = sign; 8737 rescnt--; 8738 if (width > len) 8739 width--; 8740 } 8741 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8742 assert(pbuf[0] == '0'); 8743 assert(pbuf[1] == c); 8744 if (fill != ' ') { 8745 *res++ = *pbuf++; 8746 *res++ = *pbuf++; 8747 } 8748 rescnt -= 2; 8749 width -= 2; 8750 if (width < 0) 8751 width = 0; 8752 len -= 2; 8753 } 8754 if (width > len && !(flags & F_LJUST)) { 8755 do { 8756 --rescnt; 8757 *res++ = fill; 8758 } while (--width > len); 8759 } 8760 if (fill == ' ') { 8761 if (sign) 8762 *res++ = sign; 8763 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8764 assert(pbuf[0] == '0'); 8765 assert(pbuf[1] == c); 8766 *res++ = *pbuf++; 8767 *res++ = *pbuf++; 8768 } 8769 } 8770 Py_UNICODE_COPY(res, pbuf, len); 8771 res += len; 8772 rescnt -= len; 8773 while (--width >= len) { 8774 --rescnt; 8775 *res++ = ' '; 8776 } 8777 if (dict && (argidx < arglen) && c != '%') { 8778 PyErr_SetString(PyExc_TypeError, 8779 "not all arguments converted during string formatting"); 8780 Py_XDECREF(temp); 8781 goto onError; 8782 } 8783 Py_XDECREF(temp); 8784 } /* '%' */ 8785 } /* until end */ 8786 if (argidx < arglen && !dict) { 8787 PyErr_SetString(PyExc_TypeError, 8788 "not all arguments converted during string formatting"); 8789 goto onError; 8790 } 8791 8792 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 8793 goto onError; 8794 if (args_owned) { 8795 Py_DECREF(args); 8796 } 8797 Py_DECREF(uformat); 8798 return (PyObject *)result; 8799 8800 onError: 8801 Py_XDECREF(result); 8802 Py_DECREF(uformat); 8803 if (args_owned) { 8804 Py_DECREF(args); 8805 } 8806 return NULL; 8807} 8808 8809static PyObject * 8810unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 8811 8812static PyObject * 8813unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8814{ 8815 PyObject *x = NULL; 8816 static char *kwlist[] = {"object", "encoding", "errors", 0}; 8817 char *encoding = NULL; 8818 char *errors = NULL; 8819 8820 if (type != &PyUnicode_Type) 8821 return unicode_subtype_new(type, args, kwds); 8822 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 8823 kwlist, &x, &encoding, &errors)) 8824 return NULL; 8825 if (x == NULL) 8826 return (PyObject *)_PyUnicode_New(0); 8827 if (encoding == NULL && errors == NULL) 8828 return PyObject_Unicode(x); 8829 else 8830 return PyUnicode_FromEncodedObject(x, encoding, errors); 8831} 8832 8833static PyObject * 8834unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 8835{ 8836 PyUnicodeObject *tmp, *pnew; 8837 Py_ssize_t n; 8838 8839 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 8840 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 8841 if (tmp == NULL) 8842 return NULL; 8843 assert(PyUnicode_Check(tmp)); 8844 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 8845 if (pnew == NULL) { 8846 Py_DECREF(tmp); 8847 return NULL; 8848 } 8849 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 8850 if (pnew->str == NULL) { 8851 _Py_ForgetReference((PyObject *)pnew); 8852 PyObject_Del(pnew); 8853 Py_DECREF(tmp); 8854 return PyErr_NoMemory(); 8855 } 8856 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 8857 pnew->length = n; 8858 pnew->hash = tmp->hash; 8859 Py_DECREF(tmp); 8860 return (PyObject *)pnew; 8861} 8862 8863PyDoc_STRVAR(unicode_doc, 8864"str(string [, encoding[, errors]]) -> object\n\ 8865\n\ 8866Create a new string object from the given encoded string.\n\ 8867encoding defaults to the current default string encoding.\n\ 8868errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 8869 8870static PyObject *unicode_iter(PyObject *seq); 8871 8872PyTypeObject PyUnicode_Type = { 8873 PyVarObject_HEAD_INIT(&PyType_Type, 0) 8874 "str", /* tp_name */ 8875 sizeof(PyUnicodeObject), /* tp_size */ 8876 0, /* tp_itemsize */ 8877 /* Slots */ 8878 (destructor)unicode_dealloc, /* tp_dealloc */ 8879 0, /* tp_print */ 8880 0, /* tp_getattr */ 8881 0, /* tp_setattr */ 8882 0, /* tp_compare */ 8883 unicode_repr, /* tp_repr */ 8884 &unicode_as_number, /* tp_as_number */ 8885 &unicode_as_sequence, /* tp_as_sequence */ 8886 &unicode_as_mapping, /* tp_as_mapping */ 8887 (hashfunc) unicode_hash, /* tp_hash*/ 8888 0, /* tp_call*/ 8889 (reprfunc) unicode_str, /* tp_str */ 8890 PyObject_GenericGetAttr, /* tp_getattro */ 8891 0, /* tp_setattro */ 8892 0, /* tp_as_buffer */ 8893 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 8894 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 8895 unicode_doc, /* tp_doc */ 8896 0, /* tp_traverse */ 8897 0, /* tp_clear */ 8898 PyUnicode_RichCompare, /* tp_richcompare */ 8899 0, /* tp_weaklistoffset */ 8900 unicode_iter, /* tp_iter */ 8901 0, /* tp_iternext */ 8902 unicode_methods, /* tp_methods */ 8903 0, /* tp_members */ 8904 0, /* tp_getset */ 8905 &PyBaseString_Type, /* tp_base */ 8906 0, /* tp_dict */ 8907 0, /* tp_descr_get */ 8908 0, /* tp_descr_set */ 8909 0, /* tp_dictoffset */ 8910 0, /* tp_init */ 8911 0, /* tp_alloc */ 8912 unicode_new, /* tp_new */ 8913 PyObject_Del, /* tp_free */ 8914}; 8915 8916/* Initialize the Unicode implementation */ 8917 8918void _PyUnicode_Init(void) 8919{ 8920 int i; 8921 8922 /* XXX - move this array to unicodectype.c ? */ 8923 Py_UNICODE linebreak[] = { 8924 0x000A, /* LINE FEED */ 8925 0x000D, /* CARRIAGE RETURN */ 8926 0x001C, /* FILE SEPARATOR */ 8927 0x001D, /* GROUP SEPARATOR */ 8928 0x001E, /* RECORD SEPARATOR */ 8929 0x0085, /* NEXT LINE */ 8930 0x2028, /* LINE SEPARATOR */ 8931 0x2029, /* PARAGRAPH SEPARATOR */ 8932 }; 8933 8934 /* Init the implementation */ 8935 unicode_freelist = NULL; 8936 unicode_freelist_size = 0; 8937 unicode_empty = _PyUnicode_New(0); 8938 if (!unicode_empty) 8939 return; 8940 8941 for (i = 0; i < 256; i++) 8942 unicode_latin1[i] = NULL; 8943 if (PyType_Ready(&PyUnicode_Type) < 0) 8944 Py_FatalError("Can't initialize 'unicode'"); 8945 8946 /* initialize the linebreak bloom filter */ 8947 bloom_linebreak = make_bloom_mask( 8948 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 8949 ); 8950 8951 PyType_Ready(&EncodingMapType); 8952} 8953 8954/* Finalize the Unicode implementation */ 8955 8956void 8957_PyUnicode_Fini(void) 8958{ 8959 PyUnicodeObject *u; 8960 int i; 8961 8962 Py_XDECREF(unicode_empty); 8963 unicode_empty = NULL; 8964 8965 for (i = 0; i < 256; i++) { 8966 if (unicode_latin1[i]) { 8967 Py_DECREF(unicode_latin1[i]); 8968 unicode_latin1[i] = NULL; 8969 } 8970 } 8971 8972 for (u = unicode_freelist; u != NULL;) { 8973 PyUnicodeObject *v = u; 8974 u = *(PyUnicodeObject **)u; 8975 if (v->str) 8976 PyMem_DEL(v->str); 8977 Py_XDECREF(v->defenc); 8978 PyObject_Del(v); 8979 } 8980 unicode_freelist = NULL; 8981 unicode_freelist_size = 0; 8982} 8983 8984void 8985PyUnicode_InternInPlace(PyObject **p) 8986{ 8987 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 8988 PyObject *t; 8989 if (s == NULL || !PyUnicode_Check(s)) 8990 Py_FatalError( 8991 "PyUnicode_InternInPlace: unicode strings only please!"); 8992 /* If it's a subclass, we don't really know what putting 8993 it in the interned dict might do. */ 8994 if (!PyUnicode_CheckExact(s)) 8995 return; 8996 if (PyUnicode_CHECK_INTERNED(s)) 8997 return; 8998 if (interned == NULL) { 8999 interned = PyDict_New(); 9000 if (interned == NULL) { 9001 PyErr_Clear(); /* Don't leave an exception */ 9002 return; 9003 } 9004 } 9005 /* It might be that the GetItem call fails even 9006 though the key is present in the dictionary, 9007 namely when this happens during a stack overflow. */ 9008 Py_ALLOW_RECURSION 9009 t = PyDict_GetItem(interned, (PyObject *)s); 9010 Py_END_ALLOW_RECURSION 9011 9012 if (t) { 9013 Py_INCREF(t); 9014 Py_DECREF(*p); 9015 *p = t; 9016 return; 9017 } 9018 9019 PyThreadState_GET()->recursion_critical = 1; 9020 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9021 PyErr_Clear(); 9022 PyThreadState_GET()->recursion_critical = 0; 9023 return; 9024 } 9025 PyThreadState_GET()->recursion_critical = 0; 9026 /* The two references in interned are not counted by refcnt. 9027 The deallocator will take care of this */ 9028 Py_Refcnt(s) -= 2; 9029 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9030} 9031 9032void 9033PyUnicode_InternImmortal(PyObject **p) 9034{ 9035 PyUnicode_InternInPlace(p); 9036 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9037 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9038 Py_INCREF(*p); 9039 } 9040} 9041 9042PyObject * 9043PyUnicode_InternFromString(const char *cp) 9044{ 9045 PyObject *s = PyUnicode_FromString(cp); 9046 if (s == NULL) 9047 return NULL; 9048 PyUnicode_InternInPlace(&s); 9049 return s; 9050} 9051 9052void _Py_ReleaseInternedUnicodeStrings(void) 9053{ 9054 PyObject *keys; 9055 PyUnicodeObject *s; 9056 Py_ssize_t i, n; 9057 Py_ssize_t immortal_size = 0, mortal_size = 0; 9058 9059 if (interned == NULL || !PyDict_Check(interned)) 9060 return; 9061 keys = PyDict_Keys(interned); 9062 if (keys == NULL || !PyList_Check(keys)) { 9063 PyErr_Clear(); 9064 return; 9065 } 9066 9067 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9068 detector, interned unicode strings are not forcibly deallocated; 9069 rather, we give them their stolen references back, and then clear 9070 and DECREF the interned dict. */ 9071 9072 n = PyList_GET_SIZE(keys); 9073 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9074 n); 9075 for (i = 0; i < n; i++) { 9076 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9077 switch (s->state) { 9078 case SSTATE_NOT_INTERNED: 9079 /* XXX Shouldn't happen */ 9080 break; 9081 case SSTATE_INTERNED_IMMORTAL: 9082 Py_Refcnt(s) += 1; 9083 immortal_size += s->length; 9084 break; 9085 case SSTATE_INTERNED_MORTAL: 9086 Py_Refcnt(s) += 2; 9087 mortal_size += s->length; 9088 break; 9089 default: 9090 Py_FatalError("Inconsistent interned string state."); 9091 } 9092 s->state = SSTATE_NOT_INTERNED; 9093 } 9094 fprintf(stderr, "total size of all interned strings: " 9095 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9096 "mortal/immortal\n", mortal_size, immortal_size); 9097 Py_DECREF(keys); 9098 PyDict_Clear(interned); 9099 Py_DECREF(interned); 9100 interned = NULL; 9101} 9102 9103 9104/********************* Unicode Iterator **************************/ 9105 9106typedef struct { 9107 PyObject_HEAD 9108 Py_ssize_t it_index; 9109 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9110} unicodeiterobject; 9111 9112static void 9113unicodeiter_dealloc(unicodeiterobject *it) 9114{ 9115 _PyObject_GC_UNTRACK(it); 9116 Py_XDECREF(it->it_seq); 9117 PyObject_GC_Del(it); 9118} 9119 9120static int 9121unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9122{ 9123 Py_VISIT(it->it_seq); 9124 return 0; 9125} 9126 9127static PyObject * 9128unicodeiter_next(unicodeiterobject *it) 9129{ 9130 PyUnicodeObject *seq; 9131 PyObject *item; 9132 9133 assert(it != NULL); 9134 seq = it->it_seq; 9135 if (seq == NULL) 9136 return NULL; 9137 assert(PyUnicode_Check(seq)); 9138 9139 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9140 item = PyUnicode_FromUnicode( 9141 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9142 if (item != NULL) 9143 ++it->it_index; 9144 return item; 9145 } 9146 9147 Py_DECREF(seq); 9148 it->it_seq = NULL; 9149 return NULL; 9150} 9151 9152static PyObject * 9153unicodeiter_len(unicodeiterobject *it) 9154{ 9155 Py_ssize_t len = 0; 9156 if (it->it_seq) 9157 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9158 return PyInt_FromSsize_t(len); 9159} 9160 9161PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9162 9163static PyMethodDef unicodeiter_methods[] = { 9164 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9165 length_hint_doc}, 9166 {NULL, NULL} /* sentinel */ 9167}; 9168 9169PyTypeObject PyUnicodeIter_Type = { 9170 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9171 "unicodeiterator", /* tp_name */ 9172 sizeof(unicodeiterobject), /* tp_basicsize */ 9173 0, /* tp_itemsize */ 9174 /* methods */ 9175 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9176 0, /* tp_print */ 9177 0, /* tp_getattr */ 9178 0, /* tp_setattr */ 9179 0, /* tp_compare */ 9180 0, /* tp_repr */ 9181 0, /* tp_as_number */ 9182 0, /* tp_as_sequence */ 9183 0, /* tp_as_mapping */ 9184 0, /* tp_hash */ 9185 0, /* tp_call */ 9186 0, /* tp_str */ 9187 PyObject_GenericGetAttr, /* tp_getattro */ 9188 0, /* tp_setattro */ 9189 0, /* tp_as_buffer */ 9190 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9191 0, /* tp_doc */ 9192 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9193 0, /* tp_clear */ 9194 0, /* tp_richcompare */ 9195 0, /* tp_weaklistoffset */ 9196 PyObject_SelfIter, /* tp_iter */ 9197 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9198 unicodeiter_methods, /* tp_methods */ 9199 0, 9200}; 9201 9202static PyObject * 9203unicode_iter(PyObject *seq) 9204{ 9205 unicodeiterobject *it; 9206 9207 if (!PyUnicode_Check(seq)) { 9208 PyErr_BadInternalCall(); 9209 return NULL; 9210 } 9211 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9212 if (it == NULL) 9213 return NULL; 9214 it->it_index = 0; 9215 Py_INCREF(seq); 9216 it->it_seq = (PyUnicodeObject *)seq; 9217 _PyObject_GC_TRACK(it); 9218 return (PyObject *)it; 9219} 9220 9221size_t 9222Py_UNICODE_strlen(const Py_UNICODE *u) 9223{ 9224 int res = 0; 9225 while(*u++) 9226 res++; 9227 return res; 9228} 9229 9230Py_UNICODE* 9231Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9232{ 9233 Py_UNICODE *u = s1; 9234 while ((*u++ = *s2++)); 9235 return s1; 9236} 9237 9238Py_UNICODE* 9239Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9240{ 9241 Py_UNICODE *u = s1; 9242 while ((*u++ = *s2++)) 9243 if (n-- == 0) 9244 break; 9245 return s1; 9246} 9247 9248int 9249Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9250{ 9251 while (*s1 && *s2 && *s1 == *s2) 9252 s1++, s2++; 9253 if (*s1 && *s2) 9254 return (*s1 < *s2) ? -1 : +1; 9255 if (*s1) 9256 return 1; 9257 if (*s2) 9258 return -1; 9259 return 0; 9260} 9261 9262Py_UNICODE* 9263Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9264{ 9265 const Py_UNICODE *p; 9266 for (p = s; *p; p++) 9267 if (*p == c) 9268 return (Py_UNICODE*)p; 9269 return NULL; 9270} 9271 9272 9273#ifdef __cplusplus 9274} 9275#endif 9276 9277 9278/* 9279Local variables: 9280c-basic-offset: 4 9281indent-tabs-mode: nil 9282End: 9283*/ 9284