unicodeobject.c revision e94c679df0b632bc929936ca54f0de006e1a6dc2
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Default encoding to use and assume when NULL is passed as encoding 118 parameter; it is fixed to "utf-8". Always use the 119 PyUnicode_GetDefaultEncoding() API to access this global. 120 121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the 122 hard coded default! 123*/ 124static const char unicode_default_encoding[] = "utf-8"; 125 126/* Fast detection of the most frequent whitespace characters */ 127const unsigned char _Py_ascii_whitespace[] = { 128 0, 0, 0, 0, 0, 0, 0, 0, 129/* case 0x0009: * HORIZONTAL TABULATION */ 130/* case 0x000A: * LINE FEED */ 131/* case 0x000B: * VERTICAL TABULATION */ 132/* case 0x000C: * FORM FEED */ 133/* case 0x000D: * CARRIAGE RETURN */ 134 0, 1, 1, 1, 1, 1, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136/* case 0x001C: * FILE SEPARATOR */ 137/* case 0x001D: * GROUP SEPARATOR */ 138/* case 0x001E: * RECORD SEPARATOR */ 139/* case 0x001F: * UNIT SEPARATOR */ 140 0, 0, 0, 0, 1, 1, 1, 1, 141/* case 0x0020: * SPACE */ 142 1, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 147 0, 0, 0, 0, 0, 0, 0, 0, 148 0, 0, 0, 0, 0, 0, 0, 0, 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0 155}; 156 157/* Same for linebreaks */ 158static unsigned char ascii_linebreak[] = { 159 0, 0, 0, 0, 0, 0, 0, 0, 160/* 0x000A, * LINE FEED */ 161/* 0x000D, * CARRIAGE RETURN */ 162 0, 0, 1, 0, 0, 1, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164/* 0x001C, * FILE SEPARATOR */ 165/* 0x001D, * GROUP SEPARATOR */ 166/* 0x001E, * RECORD SEPARATOR */ 167 0, 0, 0, 0, 1, 1, 1, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0 181}; 182 183 184Py_UNICODE 185PyUnicode_GetMax(void) 186{ 187#ifdef Py_UNICODE_WIDE 188 return 0x10FFFF; 189#else 190 /* This is actually an illegal character, so it should 191 not be passed to unichr. */ 192 return 0xFFFF; 193#endif 194} 195 196/* --- Bloom Filters ----------------------------------------------------- */ 197 198/* stuff to implement simple "bloom filters" for Unicode characters. 199 to keep things simple, we use a single bitmask, using the least 5 200 bits from each unicode characters as the bit index. */ 201 202/* the linebreak mask is set up by Unicode_Init below */ 203 204#define BLOOM_MASK unsigned long 205 206static BLOOM_MASK bloom_linebreak; 207 208#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 209 210#define BLOOM_LINEBREAK(ch) \ 211 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 213 214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 215{ 216 /* calculate simple bloom-style bitmask for a given unicode string */ 217 218 long mask; 219 Py_ssize_t i; 220 221 mask = 0; 222 for (i = 0; i < len; i++) 223 mask |= (1 << (ptr[i] & 0x1F)); 224 225 return mask; 226} 227 228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 229{ 230 Py_ssize_t i; 231 232 for (i = 0; i < setlen; i++) 233 if (set[i] == chr) 234 return 1; 235 236 return 0; 237} 238 239#define BLOOM_MEMBER(mask, chr, set, setlen) \ 240 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 241 242/* --- Unicode Object ----------------------------------------------------- */ 243 244static 245int unicode_resize(register PyUnicodeObject *unicode, 246 Py_ssize_t length) 247{ 248 void *oldstr; 249 250 /* Shortcut if there's nothing much to do. */ 251 if (unicode->length == length) 252 goto reset; 253 254 /* Resizing shared object (unicode_empty or single character 255 objects) in-place is not allowed. Use PyUnicode_Resize() 256 instead ! */ 257 258 if (unicode == unicode_empty || 259 (unicode->length == 1 && 260 unicode->str[0] < 256U && 261 unicode_latin1[unicode->str[0]] == unicode)) { 262 PyErr_SetString(PyExc_SystemError, 263 "can't resize shared str objects"); 264 return -1; 265 } 266 267 /* We allocate one more byte to make sure the string is Ux0000 terminated. 268 The overallocation is also used by fastsearch, which assumes that it's 269 safe to look at str[length] (without making any assumptions about what 270 it contains). */ 271 272 oldstr = unicode->str; 273 unicode->str = PyObject_REALLOC(unicode->str, 274 sizeof(Py_UNICODE) * (length + 1)); 275 if (!unicode->str) { 276 unicode->str = (Py_UNICODE *)oldstr; 277 PyErr_NoMemory(); 278 return -1; 279 } 280 unicode->str[length] = 0; 281 unicode->length = length; 282 283 reset: 284 /* Reset the object caches */ 285 if (unicode->defenc) { 286 Py_DECREF(unicode->defenc); 287 unicode->defenc = NULL; 288 } 289 unicode->hash = -1; 290 291 return 0; 292} 293 294/* We allocate one more byte to make sure the string is 295 Ux0000 terminated; some code (e.g. new_identifier) 296 relies on that. 297 298 XXX This allocator could further be enhanced by assuring that the 299 free list never reduces its size below 1. 300 301*/ 302 303static 304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 305{ 306 register PyUnicodeObject *unicode; 307 308 /* Optimization for empty strings */ 309 if (length == 0 && unicode_empty != NULL) { 310 Py_INCREF(unicode_empty); 311 return unicode_empty; 312 } 313 314 /* Ensure we won't overflow the size. */ 315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 316 return (PyUnicodeObject *)PyErr_NoMemory(); 317 } 318 319 /* Unicode freelist & memory allocation */ 320 if (free_list) { 321 unicode = free_list; 322 free_list = *(PyUnicodeObject **)unicode; 323 numfree--; 324 if (unicode->str) { 325 /* Keep-Alive optimization: we only upsize the buffer, 326 never downsize it. */ 327 if ((unicode->length < length) && 328 unicode_resize(unicode, length) < 0) { 329 PyObject_DEL(unicode->str); 330 unicode->str = NULL; 331 } 332 } 333 else { 334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 336 } 337 PyObject_INIT(unicode, &PyUnicode_Type); 338 } 339 else { 340 size_t new_size; 341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 342 if (unicode == NULL) 343 return NULL; 344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 346 } 347 348 if (!unicode->str) { 349 PyErr_NoMemory(); 350 goto onError; 351 } 352 /* Initialize the first element to guard against cases where 353 * the caller fails before initializing str -- unicode_resize() 354 * reads str[0], and the Keep-Alive optimization can keep memory 355 * allocated for str alive across a call to unicode_dealloc(unicode). 356 * We don't want unicode_resize to read uninitialized memory in 357 * that case. 358 */ 359 unicode->str[0] = 0; 360 unicode->str[length] = 0; 361 unicode->length = length; 362 unicode->hash = -1; 363 unicode->state = 0; 364 unicode->defenc = NULL; 365 return unicode; 366 367 onError: 368 /* XXX UNREF/NEWREF interface should be more symmetrical */ 369 _Py_DEC_REFTOTAL; 370 _Py_ForgetReference((PyObject *)unicode); 371 PyObject_Del(unicode); 372 return NULL; 373} 374 375static 376void unicode_dealloc(register PyUnicodeObject *unicode) 377{ 378 switch (PyUnicode_CHECK_INTERNED(unicode)) { 379 case SSTATE_NOT_INTERNED: 380 break; 381 382 case SSTATE_INTERNED_MORTAL: 383 /* revive dead object temporarily for DelItem */ 384 Py_REFCNT(unicode) = 3; 385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 386 Py_FatalError( 387 "deletion of interned string failed"); 388 break; 389 390 case SSTATE_INTERNED_IMMORTAL: 391 Py_FatalError("Immortal interned string died."); 392 393 default: 394 Py_FatalError("Inconsistent interned string state."); 395 } 396 397 if (PyUnicode_CheckExact(unicode) && 398 numfree < PyUnicode_MAXFREELIST) { 399 /* Keep-Alive optimization */ 400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 401 PyObject_DEL(unicode->str); 402 unicode->str = NULL; 403 unicode->length = 0; 404 } 405 if (unicode->defenc) { 406 Py_DECREF(unicode->defenc); 407 unicode->defenc = NULL; 408 } 409 /* Add to free list */ 410 *(PyUnicodeObject **)unicode = free_list; 411 free_list = unicode; 412 numfree++; 413 } 414 else { 415 PyObject_DEL(unicode->str); 416 Py_XDECREF(unicode->defenc); 417 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 418 } 419} 420 421static 422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 423{ 424 register PyUnicodeObject *v; 425 426 /* Argument checks */ 427 if (unicode == NULL) { 428 PyErr_BadInternalCall(); 429 return -1; 430 } 431 v = *unicode; 432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 433 PyErr_BadInternalCall(); 434 return -1; 435 } 436 437 /* Resizing unicode_empty and single character objects is not 438 possible since these are being shared. We simply return a fresh 439 copy with the same Unicode content. */ 440 if (v->length != length && 441 (v == unicode_empty || v->length == 1)) { 442 PyUnicodeObject *w = _PyUnicode_New(length); 443 if (w == NULL) 444 return -1; 445 Py_UNICODE_COPY(w->str, v->str, 446 length < v->length ? length : v->length); 447 Py_DECREF(*unicode); 448 *unicode = w; 449 return 0; 450 } 451 452 /* Note that we don't have to modify *unicode for unshared Unicode 453 objects, since we can modify them in-place. */ 454 return unicode_resize(v, length); 455} 456 457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 458{ 459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 460} 461 462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 463 Py_ssize_t size) 464{ 465 PyUnicodeObject *unicode; 466 467 /* If the Unicode data is known at construction time, we can apply 468 some optimizations which share commonly used objects. */ 469 if (u != NULL) { 470 471 /* Optimization for empty strings */ 472 if (size == 0 && unicode_empty != NULL) { 473 Py_INCREF(unicode_empty); 474 return (PyObject *)unicode_empty; 475 } 476 477 /* Single character Unicode objects in the Latin-1 range are 478 shared when using this constructor */ 479 if (size == 1 && *u < 256) { 480 unicode = unicode_latin1[*u]; 481 if (!unicode) { 482 unicode = _PyUnicode_New(1); 483 if (!unicode) 484 return NULL; 485 unicode->str[0] = *u; 486 unicode_latin1[*u] = unicode; 487 } 488 Py_INCREF(unicode); 489 return (PyObject *)unicode; 490 } 491 } 492 493 unicode = _PyUnicode_New(size); 494 if (!unicode) 495 return NULL; 496 497 /* Copy the Unicode data into the new object */ 498 if (u != NULL) 499 Py_UNICODE_COPY(unicode->str, u, size); 500 501 return (PyObject *)unicode; 502} 503 504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 505{ 506 PyUnicodeObject *unicode; 507 508 if (size < 0) { 509 PyErr_SetString(PyExc_SystemError, 510 "Negative size passed to PyUnicode_FromStringAndSize"); 511 return NULL; 512 } 513 514 /* If the Unicode data is known at construction time, we can apply 515 some optimizations which share commonly used objects. 516 Also, this means the input must be UTF-8, so fall back to the 517 UTF-8 decoder at the end. */ 518 if (u != NULL) { 519 520 /* Optimization for empty strings */ 521 if (size == 0 && unicode_empty != NULL) { 522 Py_INCREF(unicode_empty); 523 return (PyObject *)unicode_empty; 524 } 525 526 /* Single characters are shared when using this constructor. 527 Restrict to ASCII, since the input must be UTF-8. */ 528 if (size == 1 && Py_CHARMASK(*u) < 128) { 529 unicode = unicode_latin1[Py_CHARMASK(*u)]; 530 if (!unicode) { 531 unicode = _PyUnicode_New(1); 532 if (!unicode) 533 return NULL; 534 unicode->str[0] = Py_CHARMASK(*u); 535 unicode_latin1[Py_CHARMASK(*u)] = unicode; 536 } 537 Py_INCREF(unicode); 538 return (PyObject *)unicode; 539 } 540 541 return PyUnicode_DecodeUTF8(u, size, NULL); 542 } 543 544 unicode = _PyUnicode_New(size); 545 if (!unicode) 546 return NULL; 547 548 return (PyObject *)unicode; 549} 550 551PyObject *PyUnicode_FromString(const char *u) 552{ 553 size_t size = strlen(u); 554 if (size > PY_SSIZE_T_MAX) { 555 PyErr_SetString(PyExc_OverflowError, "input too long"); 556 return NULL; 557 } 558 559 return PyUnicode_FromStringAndSize(u, size); 560} 561 562#ifdef HAVE_WCHAR_H 563 564PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 565 Py_ssize_t size) 566{ 567 PyUnicodeObject *unicode; 568 569 if (w == NULL) { 570 if (size == 0) 571 return PyUnicode_FromStringAndSize(NULL, 0); 572 PyErr_BadInternalCall(); 573 return NULL; 574 } 575 576 if (size == -1) { 577 size = wcslen(w); 578 } 579 580 unicode = _PyUnicode_New(size); 581 if (!unicode) 582 return NULL; 583 584 /* Copy the wchar_t data into the new object */ 585#ifdef HAVE_USABLE_WCHAR_T 586 memcpy(unicode->str, w, size * sizeof(wchar_t)); 587#else 588 { 589 register Py_UNICODE *u; 590 register Py_ssize_t i; 591 u = PyUnicode_AS_UNICODE(unicode); 592 for (i = size; i > 0; i--) 593 *u++ = *w++; 594 } 595#endif 596 597 return (PyObject *)unicode; 598} 599 600static void 601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 602{ 603 *fmt++ = '%'; 604 if (width) { 605 if (zeropad) 606 *fmt++ = '0'; 607 fmt += sprintf(fmt, "%d", width); 608 } 609 if (precision) 610 fmt += sprintf(fmt, ".%d", precision); 611 if (longflag) 612 *fmt++ = 'l'; 613 else if (size_tflag) { 614 char *f = PY_FORMAT_SIZE_T; 615 while (*f) 616 *fmt++ = *f++; 617 } 618 *fmt++ = c; 619 *fmt = '\0'; 620} 621 622#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 623 624PyObject * 625PyUnicode_FromFormatV(const char *format, va_list vargs) 626{ 627 va_list count; 628 Py_ssize_t callcount = 0; 629 PyObject **callresults = NULL; 630 PyObject **callresult = NULL; 631 Py_ssize_t n = 0; 632 int width = 0; 633 int precision = 0; 634 int zeropad; 635 const char* f; 636 Py_UNICODE *s; 637 PyObject *string; 638 /* used by sprintf */ 639 char buffer[21]; 640 /* use abuffer instead of buffer, if we need more space 641 * (which can happen if there's a format specifier with width). */ 642 char *abuffer = NULL; 643 char *realbuffer; 644 Py_ssize_t abuffersize = 0; 645 char fmt[60]; /* should be enough for %0width.precisionld */ 646 const char *copy; 647 648#ifdef VA_LIST_IS_ARRAY 649 Py_MEMCPY(count, vargs, sizeof(va_list)); 650#else 651#ifdef __va_copy 652 __va_copy(count, vargs); 653#else 654 count = vargs; 655#endif 656#endif 657 /* step 1: count the number of %S/%R/%A format specifications 658 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for 659 * these objects once during step 3 and put the result in 660 an array) */ 661 for (f = format; *f; f++) { 662 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')) 663 ++callcount; 664 } 665 /* step 2: allocate memory for the results of 666 * PyObject_Str()/PyObject_Repr() calls */ 667 if (callcount) { 668 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 669 if (!callresults) { 670 PyErr_NoMemory(); 671 return NULL; 672 } 673 callresult = callresults; 674 } 675 /* step 3: figure out how large a buffer we need */ 676 for (f = format; *f; f++) { 677 if (*f == '%') { 678 const char* p = f; 679 width = 0; 680 while (ISDIGIT((unsigned)*f)) 681 width = (width*10) + *f++ - '0'; 682 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 683 ; 684 685 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 686 * they don't affect the amount of space we reserve. 687 */ 688 if ((*f == 'l' || *f == 'z') && 689 (f[1] == 'd' || f[1] == 'u')) 690 ++f; 691 692 switch (*f) { 693 case 'c': 694 (void)va_arg(count, int); 695 /* fall through... */ 696 case '%': 697 n++; 698 break; 699 case 'd': case 'u': case 'i': case 'x': 700 (void) va_arg(count, int); 701 /* 20 bytes is enough to hold a 64-bit 702 integer. Decimal takes the most space. 703 This isn't enough for octal. 704 If a width is specified we need more 705 (which we allocate later). */ 706 if (width < 20) 707 width = 20; 708 n += width; 709 if (abuffersize < width) 710 abuffersize = width; 711 break; 712 case 's': 713 { 714 /* UTF-8 */ 715 unsigned char*s; 716 s = va_arg(count, unsigned char*); 717 while (*s) { 718 if (*s < 128) { 719 n++; s++; 720 } else if (*s < 0xc0) { 721 /* invalid UTF-8 */ 722 n++; s++; 723 } else if (*s < 0xc0) { 724 n++; 725 s++; if(!*s)break; 726 s++; 727 } else if (*s < 0xe0) { 728 n++; 729 s++; if(!*s)break; 730 s++; if(!*s)break; 731 s++; 732 } else { 733#ifdef Py_UNICODE_WIDE 734 n++; 735#else 736 n+=2; 737#endif 738 s++; if(!*s)break; 739 s++; if(!*s)break; 740 s++; if(!*s)break; 741 s++; 742 } 743 } 744 break; 745 } 746 case 'U': 747 { 748 PyObject *obj = va_arg(count, PyObject *); 749 assert(obj && PyUnicode_Check(obj)); 750 n += PyUnicode_GET_SIZE(obj); 751 break; 752 } 753 case 'V': 754 { 755 PyObject *obj = va_arg(count, PyObject *); 756 const char *str = va_arg(count, const char *); 757 assert(obj || str); 758 assert(!obj || PyUnicode_Check(obj)); 759 if (obj) 760 n += PyUnicode_GET_SIZE(obj); 761 else 762 n += strlen(str); 763 break; 764 } 765 case 'S': 766 { 767 PyObject *obj = va_arg(count, PyObject *); 768 PyObject *str; 769 assert(obj); 770 str = PyObject_Str(obj); 771 if (!str) 772 goto fail; 773 n += PyUnicode_GET_SIZE(str); 774 /* Remember the str and switch to the next slot */ 775 *callresult++ = str; 776 break; 777 } 778 case 'R': 779 { 780 PyObject *obj = va_arg(count, PyObject *); 781 PyObject *repr; 782 assert(obj); 783 repr = PyObject_Repr(obj); 784 if (!repr) 785 goto fail; 786 n += PyUnicode_GET_SIZE(repr); 787 /* Remember the repr and switch to the next slot */ 788 *callresult++ = repr; 789 break; 790 } 791 case 'A': 792 { 793 PyObject *obj = va_arg(count, PyObject *); 794 PyObject *ascii; 795 assert(obj); 796 ascii = PyObject_ASCII(obj); 797 if (!ascii) 798 goto fail; 799 n += PyUnicode_GET_SIZE(ascii); 800 /* Remember the repr and switch to the next slot */ 801 *callresult++ = ascii; 802 break; 803 } 804 case 'p': 805 (void) va_arg(count, int); 806 /* maximum 64-bit pointer representation: 807 * 0xffffffffffffffff 808 * so 19 characters is enough. 809 * XXX I count 18 -- what's the extra for? 810 */ 811 n += 19; 812 break; 813 default: 814 /* if we stumble upon an unknown 815 formatting code, copy the rest of 816 the format string to the output 817 string. (we cannot just skip the 818 code, since there's no way to know 819 what's in the argument list) */ 820 n += strlen(p); 821 goto expand; 822 } 823 } else 824 n++; 825 } 826 expand: 827 if (abuffersize > 20) { 828 abuffer = PyObject_Malloc(abuffersize); 829 if (!abuffer) { 830 PyErr_NoMemory(); 831 goto fail; 832 } 833 realbuffer = abuffer; 834 } 835 else 836 realbuffer = buffer; 837 /* step 4: fill the buffer */ 838 /* Since we've analyzed how much space we need for the worst case, 839 we don't have to resize the string. 840 There can be no errors beyond this point. */ 841 string = PyUnicode_FromUnicode(NULL, n); 842 if (!string) 843 goto fail; 844 845 s = PyUnicode_AS_UNICODE(string); 846 callresult = callresults; 847 848 for (f = format; *f; f++) { 849 if (*f == '%') { 850 const char* p = f++; 851 int longflag = 0; 852 int size_tflag = 0; 853 zeropad = (*f == '0'); 854 /* parse the width.precision part */ 855 width = 0; 856 while (ISDIGIT((unsigned)*f)) 857 width = (width*10) + *f++ - '0'; 858 precision = 0; 859 if (*f == '.') { 860 f++; 861 while (ISDIGIT((unsigned)*f)) 862 precision = (precision*10) + *f++ - '0'; 863 } 864 /* handle the long flag, but only for %ld and %lu. 865 others can be added when necessary. */ 866 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 867 longflag = 1; 868 ++f; 869 } 870 /* handle the size_t flag. */ 871 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 872 size_tflag = 1; 873 ++f; 874 } 875 876 switch (*f) { 877 case 'c': 878 *s++ = va_arg(vargs, int); 879 break; 880 case 'd': 881 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 882 if (longflag) 883 sprintf(realbuffer, fmt, va_arg(vargs, long)); 884 else if (size_tflag) 885 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 886 else 887 sprintf(realbuffer, fmt, va_arg(vargs, int)); 888 appendstring(realbuffer); 889 break; 890 case 'u': 891 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 892 if (longflag) 893 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 894 else if (size_tflag) 895 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 896 else 897 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 898 appendstring(realbuffer); 899 break; 900 case 'i': 901 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 902 sprintf(realbuffer, fmt, va_arg(vargs, int)); 903 appendstring(realbuffer); 904 break; 905 case 'x': 906 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 907 sprintf(realbuffer, fmt, va_arg(vargs, int)); 908 appendstring(realbuffer); 909 break; 910 case 's': 911 { 912 /* Parameter must be UTF-8 encoded. 913 In case of encoding errors, use 914 the replacement character. */ 915 PyObject *u; 916 p = va_arg(vargs, char*); 917 u = PyUnicode_DecodeUTF8(p, strlen(p), 918 "replace"); 919 if (!u) 920 goto fail; 921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 922 PyUnicode_GET_SIZE(u)); 923 s += PyUnicode_GET_SIZE(u); 924 Py_DECREF(u); 925 break; 926 } 927 case 'U': 928 { 929 PyObject *obj = va_arg(vargs, PyObject *); 930 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 932 s += size; 933 break; 934 } 935 case 'V': 936 { 937 PyObject *obj = va_arg(vargs, PyObject *); 938 const char *str = va_arg(vargs, const char *); 939 if (obj) { 940 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 942 s += size; 943 } else { 944 appendstring(str); 945 } 946 break; 947 } 948 case 'S': 949 case 'R': 950 { 951 Py_UNICODE *ucopy; 952 Py_ssize_t usize; 953 Py_ssize_t upos; 954 /* unused, since we already have the result */ 955 (void) va_arg(vargs, PyObject *); 956 ucopy = PyUnicode_AS_UNICODE(*callresult); 957 usize = PyUnicode_GET_SIZE(*callresult); 958 for (upos = 0; upos<usize;) 959 *s++ = ucopy[upos++]; 960 /* We're done with the unicode()/repr() => forget it */ 961 Py_DECREF(*callresult); 962 /* switch to next unicode()/repr() result */ 963 ++callresult; 964 break; 965 } 966 case 'p': 967 sprintf(buffer, "%p", va_arg(vargs, void*)); 968 /* %p is ill-defined: ensure leading 0x. */ 969 if (buffer[1] == 'X') 970 buffer[1] = 'x'; 971 else if (buffer[1] != 'x') { 972 memmove(buffer+2, buffer, strlen(buffer)+1); 973 buffer[0] = '0'; 974 buffer[1] = 'x'; 975 } 976 appendstring(buffer); 977 break; 978 case '%': 979 *s++ = '%'; 980 break; 981 default: 982 appendstring(p); 983 goto end; 984 } 985 } else 986 *s++ = *f; 987 } 988 989 end: 990 if (callresults) 991 PyObject_Free(callresults); 992 if (abuffer) 993 PyObject_Free(abuffer); 994 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 995 return string; 996 fail: 997 if (callresults) { 998 PyObject **callresult2 = callresults; 999 while (callresult2 < callresult) { 1000 Py_DECREF(*callresult2); 1001 ++callresult2; 1002 } 1003 PyObject_Free(callresults); 1004 } 1005 if (abuffer) 1006 PyObject_Free(abuffer); 1007 return NULL; 1008} 1009 1010#undef appendstring 1011 1012PyObject * 1013PyUnicode_FromFormat(const char *format, ...) 1014{ 1015 PyObject* ret; 1016 va_list vargs; 1017 1018#ifdef HAVE_STDARG_PROTOTYPES 1019 va_start(vargs, format); 1020#else 1021 va_start(vargs); 1022#endif 1023 ret = PyUnicode_FromFormatV(format, vargs); 1024 va_end(vargs); 1025 return ret; 1026} 1027 1028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1029 wchar_t *w, 1030 Py_ssize_t size) 1031{ 1032 if (unicode == NULL) { 1033 PyErr_BadInternalCall(); 1034 return -1; 1035 } 1036 1037 /* If possible, try to copy the 0-termination as well */ 1038 if (size > PyUnicode_GET_SIZE(unicode)) 1039 size = PyUnicode_GET_SIZE(unicode) + 1; 1040 1041#ifdef HAVE_USABLE_WCHAR_T 1042 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1043#else 1044 { 1045 register Py_UNICODE *u; 1046 register Py_ssize_t i; 1047 u = PyUnicode_AS_UNICODE(unicode); 1048 for (i = size; i > 0; i--) 1049 *w++ = *u++; 1050 } 1051#endif 1052 1053 if (size > PyUnicode_GET_SIZE(unicode)) 1054 return PyUnicode_GET_SIZE(unicode); 1055 else 1056 return size; 1057} 1058 1059#endif 1060 1061PyObject *PyUnicode_FromOrdinal(int ordinal) 1062{ 1063 Py_UNICODE s[2]; 1064 1065 if (ordinal < 0 || ordinal > 0x10ffff) { 1066 PyErr_SetString(PyExc_ValueError, 1067 "chr() arg not in range(0x110000)"); 1068 return NULL; 1069 } 1070 1071#ifndef Py_UNICODE_WIDE 1072 if (ordinal > 0xffff) { 1073 ordinal -= 0x10000; 1074 s[0] = 0xD800 | (ordinal >> 10); 1075 s[1] = 0xDC00 | (ordinal & 0x3FF); 1076 return PyUnicode_FromUnicode(s, 2); 1077 } 1078#endif 1079 1080 s[0] = (Py_UNICODE)ordinal; 1081 return PyUnicode_FromUnicode(s, 1); 1082} 1083 1084PyObject *PyUnicode_FromObject(register PyObject *obj) 1085{ 1086 /* XXX Perhaps we should make this API an alias of 1087 PyObject_Str() instead ?! */ 1088 if (PyUnicode_CheckExact(obj)) { 1089 Py_INCREF(obj); 1090 return obj; 1091 } 1092 if (PyUnicode_Check(obj)) { 1093 /* For a Unicode subtype that's not a Unicode object, 1094 return a true Unicode object with the same data. */ 1095 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1096 PyUnicode_GET_SIZE(obj)); 1097 } 1098 PyErr_Format(PyExc_TypeError, 1099 "Can't convert '%.100s' object to str implicitly", 1100 Py_TYPE(obj)->tp_name); 1101 return NULL; 1102} 1103 1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1105 const char *encoding, 1106 const char *errors) 1107{ 1108 const char *s = NULL; 1109 Py_ssize_t len; 1110 PyObject *v; 1111 1112 if (obj == NULL) { 1113 PyErr_BadInternalCall(); 1114 return NULL; 1115 } 1116 1117 if (PyUnicode_Check(obj)) { 1118 PyErr_SetString(PyExc_TypeError, 1119 "decoding str is not supported"); 1120 return NULL; 1121 } 1122 1123 /* Coerce object */ 1124 if (PyBytes_Check(obj)) { 1125 s = PyBytes_AS_STRING(obj); 1126 len = PyBytes_GET_SIZE(obj); 1127 } 1128 else if (PyByteArray_Check(obj)) { 1129 s = PyByteArray_AS_STRING(obj); 1130 len = PyByteArray_GET_SIZE(obj); 1131 } 1132 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1133 /* Overwrite the error message with something more useful in 1134 case of a TypeError. */ 1135 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1136 PyErr_Format(PyExc_TypeError, 1137 "coercing to str: need string or buffer, " 1138 "%.80s found", 1139 Py_TYPE(obj)->tp_name); 1140 goto onError; 1141 } 1142 1143 /* Convert to Unicode */ 1144 if (len == 0) { 1145 Py_INCREF(unicode_empty); 1146 v = (PyObject *)unicode_empty; 1147 } 1148 else 1149 v = PyUnicode_Decode(s, len, encoding, errors); 1150 1151 return v; 1152 1153 onError: 1154 return NULL; 1155} 1156 1157PyObject *PyUnicode_Decode(const char *s, 1158 Py_ssize_t size, 1159 const char *encoding, 1160 const char *errors) 1161{ 1162 PyObject *buffer = NULL, *unicode; 1163 Py_buffer info; 1164 char lower[20]; /* Enough for any encoding name we recognize */ 1165 char *l; 1166 const char *e; 1167 1168 if (encoding == NULL) 1169 encoding = PyUnicode_GetDefaultEncoding(); 1170 1171 /* Convert encoding to lower case and replace '_' with '-' in order to 1172 catch e.g. UTF_8 */ 1173 e = encoding; 1174 l = lower; 1175 while (*e && l < &lower[(sizeof lower) - 2]) { 1176 if (ISUPPER(*e)) { 1177 *l++ = TOLOWER(*e++); 1178 } 1179 else if (*e == '_') { 1180 *l++ = '-'; 1181 e++; 1182 } 1183 else { 1184 *l++ = *e++; 1185 } 1186 } 1187 *l = '\0'; 1188 1189 /* Shortcuts for common default encodings */ 1190 if (strcmp(lower, "utf-8") == 0) 1191 return PyUnicode_DecodeUTF8(s, size, errors); 1192 else if ((strcmp(lower, "latin-1") == 0) || 1193 (strcmp(lower, "iso-8859-1") == 0)) 1194 return PyUnicode_DecodeLatin1(s, size, errors); 1195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1196 else if (strcmp(lower, "mbcs") == 0) 1197 return PyUnicode_DecodeMBCS(s, size, errors); 1198#endif 1199 else if (strcmp(lower, "ascii") == 0) 1200 return PyUnicode_DecodeASCII(s, size, errors); 1201 else if (strcmp(lower, "utf-16") == 0) 1202 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1203 else if (strcmp(lower, "utf-32") == 0) 1204 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1205 1206 /* Decode via the codec registry */ 1207 buffer = NULL; 1208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1209 goto onError; 1210 buffer = PyMemoryView_FromBuffer(&info); 1211 if (buffer == NULL) 1212 goto onError; 1213 unicode = PyCodec_Decode(buffer, encoding, errors); 1214 if (unicode == NULL) 1215 goto onError; 1216 if (!PyUnicode_Check(unicode)) { 1217 PyErr_Format(PyExc_TypeError, 1218 "decoder did not return a str object (type=%.400s)", 1219 Py_TYPE(unicode)->tp_name); 1220 Py_DECREF(unicode); 1221 goto onError; 1222 } 1223 Py_DECREF(buffer); 1224 return unicode; 1225 1226 onError: 1227 Py_XDECREF(buffer); 1228 return NULL; 1229} 1230 1231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1232 const char *encoding, 1233 const char *errors) 1234{ 1235 PyObject *v; 1236 1237 if (!PyUnicode_Check(unicode)) { 1238 PyErr_BadArgument(); 1239 goto onError; 1240 } 1241 1242 if (encoding == NULL) 1243 encoding = PyUnicode_GetDefaultEncoding(); 1244 1245 /* Decode via the codec registry */ 1246 v = PyCodec_Decode(unicode, encoding, errors); 1247 if (v == NULL) 1248 goto onError; 1249 return v; 1250 1251 onError: 1252 return NULL; 1253} 1254 1255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1256 const char *encoding, 1257 const char *errors) 1258{ 1259 PyObject *v; 1260 1261 if (!PyUnicode_Check(unicode)) { 1262 PyErr_BadArgument(); 1263 goto onError; 1264 } 1265 1266 if (encoding == NULL) 1267 encoding = PyUnicode_GetDefaultEncoding(); 1268 1269 /* Decode via the codec registry */ 1270 v = PyCodec_Decode(unicode, encoding, errors); 1271 if (v == NULL) 1272 goto onError; 1273 if (!PyUnicode_Check(v)) { 1274 PyErr_Format(PyExc_TypeError, 1275 "decoder did not return a str object (type=%.400s)", 1276 Py_TYPE(v)->tp_name); 1277 Py_DECREF(v); 1278 goto onError; 1279 } 1280 return v; 1281 1282 onError: 1283 return NULL; 1284} 1285 1286PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1287 Py_ssize_t size, 1288 const char *encoding, 1289 const char *errors) 1290{ 1291 PyObject *v, *unicode; 1292 1293 unicode = PyUnicode_FromUnicode(s, size); 1294 if (unicode == NULL) 1295 return NULL; 1296 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1297 Py_DECREF(unicode); 1298 return v; 1299} 1300 1301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1302 const char *encoding, 1303 const char *errors) 1304{ 1305 PyObject *v; 1306 1307 if (!PyUnicode_Check(unicode)) { 1308 PyErr_BadArgument(); 1309 goto onError; 1310 } 1311 1312 if (encoding == NULL) 1313 encoding = PyUnicode_GetDefaultEncoding(); 1314 1315 /* Encode via the codec registry */ 1316 v = PyCodec_Encode(unicode, encoding, errors); 1317 if (v == NULL) 1318 goto onError; 1319 return v; 1320 1321 onError: 1322 return NULL; 1323} 1324 1325PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1326 const char *encoding, 1327 const char *errors) 1328{ 1329 PyObject *v; 1330 1331 if (!PyUnicode_Check(unicode)) { 1332 PyErr_BadArgument(); 1333 return NULL; 1334 } 1335 1336 if (encoding == NULL) 1337 encoding = PyUnicode_GetDefaultEncoding(); 1338 1339 /* Shortcuts for common default encodings */ 1340 if (errors == NULL) { 1341 if (strcmp(encoding, "utf-8") == 0) 1342 return PyUnicode_AsUTF8String(unicode); 1343 else if (strcmp(encoding, "latin-1") == 0) 1344 return PyUnicode_AsLatin1String(unicode); 1345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1346 else if (strcmp(encoding, "mbcs") == 0) 1347 return PyUnicode_AsMBCSString(unicode); 1348#endif 1349 else if (strcmp(encoding, "ascii") == 0) 1350 return PyUnicode_AsASCIIString(unicode); 1351 /* During bootstrap, we may need to find the encodings 1352 package, to load the file system encoding, and require the 1353 file system encoding in order to load the encodings 1354 package. 1355 1356 Break out of this dependency by assuming that the path to 1357 the encodings module is ASCII-only. XXX could try wcstombs 1358 instead, if the file system encoding is the locale's 1359 encoding. */ 1360 else if (Py_FileSystemDefaultEncoding && 1361 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && 1362 !PyThreadState_GET()->interp->codecs_initialized) 1363 return PyUnicode_AsASCIIString(unicode); 1364 } 1365 1366 /* Encode via the codec registry */ 1367 v = PyCodec_Encode(unicode, encoding, errors); 1368 if (v == NULL) 1369 return NULL; 1370 1371 /* The normal path */ 1372 if (PyBytes_Check(v)) 1373 return v; 1374 1375 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1376 if (PyByteArray_Check(v)) { 1377 char msg[100]; 1378 PyObject *b; 1379 PyOS_snprintf(msg, sizeof(msg), 1380 "encoder %s returned buffer instead of bytes", 1381 encoding); 1382 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { 1383 Py_DECREF(v); 1384 return NULL; 1385 } 1386 1387 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1388 Py_DECREF(v); 1389 return b; 1390 } 1391 1392 PyErr_Format(PyExc_TypeError, 1393 "encoder did not return a bytes object (type=%.400s)", 1394 Py_TYPE(v)->tp_name); 1395 Py_DECREF(v); 1396 return NULL; 1397} 1398 1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1400 const char *encoding, 1401 const char *errors) 1402{ 1403 PyObject *v; 1404 1405 if (!PyUnicode_Check(unicode)) { 1406 PyErr_BadArgument(); 1407 goto onError; 1408 } 1409 1410 if (encoding == NULL) 1411 encoding = PyUnicode_GetDefaultEncoding(); 1412 1413 /* Encode via the codec registry */ 1414 v = PyCodec_Encode(unicode, encoding, errors); 1415 if (v == NULL) 1416 goto onError; 1417 if (!PyUnicode_Check(v)) { 1418 PyErr_Format(PyExc_TypeError, 1419 "encoder did not return an str object (type=%.400s)", 1420 Py_TYPE(v)->tp_name); 1421 Py_DECREF(v); 1422 goto onError; 1423 } 1424 return v; 1425 1426 onError: 1427 return NULL; 1428} 1429 1430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1431 const char *errors) 1432{ 1433 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1434 if (v) 1435 return v; 1436 if (errors != NULL) 1437 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1438 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1439 PyUnicode_GET_SIZE(unicode), 1440 NULL); 1441 if (!v) 1442 return NULL; 1443 ((PyUnicodeObject *)unicode)->defenc = v; 1444 return v; 1445} 1446 1447PyObject* 1448PyUnicode_DecodeFSDefault(const char *s) { 1449 Py_ssize_t size = (Py_ssize_t)strlen(s); 1450 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1451} 1452 1453PyObject* 1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1455{ 1456 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1457 can be undefined. If it is case, decode using UTF-8. The following assumes 1458 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1459 bootstrapping process where the codecs aren't ready yet. 1460 */ 1461 if (Py_FileSystemDefaultEncoding) { 1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1464 return PyUnicode_DecodeMBCS(s, size, "replace"); 1465 } 1466#elif defined(__APPLE__) 1467 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1468 return PyUnicode_DecodeUTF8(s, size, "replace"); 1469 } 1470#endif 1471 return PyUnicode_Decode(s, size, 1472 Py_FileSystemDefaultEncoding, 1473 "replace"); 1474 } 1475 else { 1476 return PyUnicode_DecodeUTF8(s, size, "replace"); 1477 } 1478} 1479 1480char* 1481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1482{ 1483 PyObject *bytes; 1484 if (!PyUnicode_Check(unicode)) { 1485 PyErr_BadArgument(); 1486 return NULL; 1487 } 1488 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1489 if (bytes == NULL) 1490 return NULL; 1491 if (psize != NULL) 1492 *psize = PyBytes_GET_SIZE(bytes); 1493 return PyBytes_AS_STRING(bytes); 1494} 1495 1496char* 1497_PyUnicode_AsString(PyObject *unicode) 1498{ 1499 return _PyUnicode_AsStringAndSize(unicode, NULL); 1500} 1501 1502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1503{ 1504 if (!PyUnicode_Check(unicode)) { 1505 PyErr_BadArgument(); 1506 goto onError; 1507 } 1508 return PyUnicode_AS_UNICODE(unicode); 1509 1510 onError: 1511 return NULL; 1512} 1513 1514Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1515{ 1516 if (!PyUnicode_Check(unicode)) { 1517 PyErr_BadArgument(); 1518 goto onError; 1519 } 1520 return PyUnicode_GET_SIZE(unicode); 1521 1522 onError: 1523 return -1; 1524} 1525 1526const char *PyUnicode_GetDefaultEncoding(void) 1527{ 1528 return unicode_default_encoding; 1529} 1530 1531int PyUnicode_SetDefaultEncoding(const char *encoding) 1532{ 1533 if (strcmp(encoding, unicode_default_encoding) != 0) { 1534 PyErr_Format(PyExc_ValueError, 1535 "Can only set default encoding to %s", 1536 unicode_default_encoding); 1537 return -1; 1538 } 1539 return 0; 1540} 1541 1542/* error handling callback helper: 1543 build arguments, call the callback and check the arguments, 1544 if no exception occurred, copy the replacement to the output 1545 and adjust various state variables. 1546 return 0 on success, -1 on error 1547*/ 1548 1549static 1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1551 const char *encoding, const char *reason, 1552 const char **input, const char **inend, Py_ssize_t *startinpos, 1553 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1554 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1555{ 1556 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 1557 1558 PyObject *restuple = NULL; 1559 PyObject *repunicode = NULL; 1560 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1561 Py_ssize_t insize; 1562 Py_ssize_t requiredsize; 1563 Py_ssize_t newpos; 1564 Py_UNICODE *repptr; 1565 PyObject *inputobj = NULL; 1566 Py_ssize_t repsize; 1567 int res = -1; 1568 1569 if (*errorHandler == NULL) { 1570 *errorHandler = PyCodec_LookupError(errors); 1571 if (*errorHandler == NULL) 1572 goto onError; 1573 } 1574 1575 if (*exceptionObject == NULL) { 1576 *exceptionObject = PyUnicodeDecodeError_Create( 1577 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1578 if (*exceptionObject == NULL) 1579 goto onError; 1580 } 1581 else { 1582 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1583 goto onError; 1584 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1585 goto onError; 1586 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1587 goto onError; 1588 } 1589 1590 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1591 if (restuple == NULL) 1592 goto onError; 1593 if (!PyTuple_Check(restuple)) { 1594 PyErr_Format(PyExc_TypeError, &argparse[4]); 1595 goto onError; 1596 } 1597 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1598 goto onError; 1599 1600 /* Copy back the bytes variables, which might have been modified by the 1601 callback */ 1602 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1603 if (!inputobj) 1604 goto onError; 1605 if (!PyBytes_Check(inputobj)) { 1606 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1607 } 1608 *input = PyBytes_AS_STRING(inputobj); 1609 insize = PyBytes_GET_SIZE(inputobj); 1610 *inend = *input + insize; 1611 /* we can DECREF safely, as the exception has another reference, 1612 so the object won't go away. */ 1613 Py_DECREF(inputobj); 1614 1615 if (newpos<0) 1616 newpos = insize+newpos; 1617 if (newpos<0 || newpos>insize) { 1618 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1619 goto onError; 1620 } 1621 1622 /* need more space? (at least enough for what we 1623 have+the replacement+the rest of the string (starting 1624 at the new input position), so we won't have to check space 1625 when there are no errors in the rest of the string) */ 1626 repptr = PyUnicode_AS_UNICODE(repunicode); 1627 repsize = PyUnicode_GET_SIZE(repunicode); 1628 requiredsize = *outpos + repsize + insize-newpos; 1629 if (requiredsize > outsize) { 1630 if (requiredsize<2*outsize) 1631 requiredsize = 2*outsize; 1632 if (_PyUnicode_Resize(output, requiredsize) < 0) 1633 goto onError; 1634 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1635 } 1636 *endinpos = newpos; 1637 *inptr = *input + newpos; 1638 Py_UNICODE_COPY(*outptr, repptr, repsize); 1639 *outptr += repsize; 1640 *outpos += repsize; 1641 1642 /* we made it! */ 1643 res = 0; 1644 1645 onError: 1646 Py_XDECREF(restuple); 1647 return res; 1648} 1649 1650/* --- UTF-7 Codec -------------------------------------------------------- */ 1651 1652/* see RFC2152 for details */ 1653 1654static 1655char utf7_special[128] = { 1656 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1657 encoded: 1658 0 - not special 1659 1 - special 1660 2 - whitespace (optional) 1661 3 - RFC2152 Set O (optional) */ 1662 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1663 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1664 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1666 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1668 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1670 1671}; 1672 1673/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1674 warnings about the comparison always being false; since 1675 utf7_special[0] is 1, we can safely make that one comparison 1676 true */ 1677 1678#define SPECIAL(c, encodeO, encodeWS) \ 1679 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1680 (encodeWS && (utf7_special[(c)] == 2)) || \ 1681 (encodeO && (utf7_special[(c)] == 3))) 1682 1683#define B64(n) \ 1684 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1685#define B64CHAR(c) \ 1686 (ISALNUM(c) || (c) == '+' || (c) == '/') 1687#define UB64(c) \ 1688 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1689 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 1690 1691#define ENCODE(out, ch, bits) \ 1692 while (bits >= 6) { \ 1693 *out++ = B64(ch >> (bits-6)); \ 1694 bits -= 6; \ 1695 } 1696 1697#define DECODE(out, ch, bits, surrogate) \ 1698 while (bits >= 16) { \ 1699 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1700 bits -= 16; \ 1701 if (surrogate) { \ 1702 /* We have already generated an error for the high surrogate \ 1703 so let's not bother seeing if the low surrogate is correct or not */ \ 1704 surrogate = 0; \ 1705 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1706 /* This is a surrogate pair. Unfortunately we can't represent \ 1707 it in a 16-bit character */ \ 1708 surrogate = 1; \ 1709 errmsg = "code pairs are not supported"; \ 1710 goto utf7Error; \ 1711 } else { \ 1712 *out++ = outCh; \ 1713 } \ 1714 } 1715 1716PyObject *PyUnicode_DecodeUTF7(const char *s, 1717 Py_ssize_t size, 1718 const char *errors) 1719{ 1720 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1721} 1722 1723PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1724 Py_ssize_t size, 1725 const char *errors, 1726 Py_ssize_t *consumed) 1727{ 1728 const char *starts = s; 1729 Py_ssize_t startinpos; 1730 Py_ssize_t endinpos; 1731 Py_ssize_t outpos; 1732 const char *e; 1733 PyUnicodeObject *unicode; 1734 Py_UNICODE *p; 1735 const char *errmsg = ""; 1736 int inShift = 0; 1737 unsigned int bitsleft = 0; 1738 unsigned long charsleft = 0; 1739 int surrogate = 0; 1740 PyObject *errorHandler = NULL; 1741 PyObject *exc = NULL; 1742 1743 unicode = _PyUnicode_New(size); 1744 if (!unicode) 1745 return NULL; 1746 if (size == 0) { 1747 if (consumed) 1748 *consumed = 0; 1749 return (PyObject *)unicode; 1750 } 1751 1752 p = unicode->str; 1753 e = s + size; 1754 1755 while (s < e) { 1756 Py_UNICODE ch; 1757 restart: 1758 ch = (unsigned char) *s; 1759 1760 if (inShift) { 1761 if ((ch == '-') || !B64CHAR(ch)) { 1762 inShift = 0; 1763 s++; 1764 1765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1766 if (bitsleft >= 6) { 1767 /* The shift sequence has a partial character in it. If 1768 bitsleft < 6 then we could just classify it as padding 1769 but that is not the case here */ 1770 1771 errmsg = "partial character in shift sequence"; 1772 goto utf7Error; 1773 } 1774 /* According to RFC2152 the remaining bits should be zero. We 1775 choose to signal an error/insert a replacement character 1776 here so indicate the potential of a misencoded character. */ 1777 1778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1780 errmsg = "non-zero padding bits in shift sequence"; 1781 goto utf7Error; 1782 } 1783 1784 if (ch == '-') { 1785 if ((s < e) && (*(s) == '-')) { 1786 *p++ = '-'; 1787 inShift = 1; 1788 } 1789 } else if (SPECIAL(ch,0,0)) { 1790 errmsg = "unexpected special character"; 1791 goto utf7Error; 1792 } else { 1793 *p++ = ch; 1794 } 1795 } else { 1796 charsleft = (charsleft << 6) | UB64(ch); 1797 bitsleft += 6; 1798 s++; 1799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1800 } 1801 } 1802 else if ( ch == '+' ) { 1803 startinpos = s-starts; 1804 s++; 1805 if (s < e && *s == '-') { 1806 s++; 1807 *p++ = '+'; 1808 } else 1809 { 1810 inShift = 1; 1811 bitsleft = 0; 1812 } 1813 } 1814 else if (SPECIAL(ch,0,0)) { 1815 startinpos = s-starts; 1816 errmsg = "unexpected special character"; 1817 s++; 1818 goto utf7Error; 1819 } 1820 else { 1821 *p++ = ch; 1822 s++; 1823 } 1824 continue; 1825 utf7Error: 1826 outpos = p-PyUnicode_AS_UNICODE(unicode); 1827 endinpos = s-starts; 1828 if (unicode_decode_call_errorhandler( 1829 errors, &errorHandler, 1830 "utf7", errmsg, 1831 &starts, &e, &startinpos, &endinpos, &exc, &s, 1832 &unicode, &outpos, &p)) 1833 goto onError; 1834 } 1835 1836 if (inShift && !consumed) { 1837 outpos = p-PyUnicode_AS_UNICODE(unicode); 1838 endinpos = size; 1839 if (unicode_decode_call_errorhandler( 1840 errors, &errorHandler, 1841 "utf7", "unterminated shift sequence", 1842 &starts, &e, &startinpos, &endinpos, &exc, &s, 1843 &unicode, &outpos, &p)) 1844 goto onError; 1845 if (s < e) 1846 goto restart; 1847 } 1848 if (consumed) { 1849 if(inShift) 1850 *consumed = startinpos; 1851 else 1852 *consumed = s-starts; 1853 } 1854 1855 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1856 goto onError; 1857 1858 Py_XDECREF(errorHandler); 1859 Py_XDECREF(exc); 1860 return (PyObject *)unicode; 1861 1862 onError: 1863 Py_XDECREF(errorHandler); 1864 Py_XDECREF(exc); 1865 Py_DECREF(unicode); 1866 return NULL; 1867} 1868 1869 1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1871 Py_ssize_t size, 1872 int encodeSetO, 1873 int encodeWhiteSpace, 1874 const char *errors) 1875{ 1876 PyObject *v; 1877 /* It might be possible to tighten this worst case */ 1878 Py_ssize_t cbAllocated = 5 * size; 1879 int inShift = 0; 1880 Py_ssize_t i = 0; 1881 unsigned int bitsleft = 0; 1882 unsigned long charsleft = 0; 1883 char * out; 1884 char * start; 1885 1886 if (size == 0) 1887 return PyBytes_FromStringAndSize(NULL, 0); 1888 1889 if (cbAllocated / 5 != size) 1890 return PyErr_NoMemory(); 1891 1892 v = PyBytes_FromStringAndSize(NULL, cbAllocated); 1893 if (v == NULL) 1894 return NULL; 1895 1896 start = out = PyBytes_AS_STRING(v); 1897 for (;i < size; ++i) { 1898 Py_UNICODE ch = s[i]; 1899 1900 if (!inShift) { 1901 if (ch == '+') { 1902 *out++ = '+'; 1903 *out++ = '-'; 1904 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1905 charsleft = ch; 1906 bitsleft = 16; 1907 *out++ = '+'; 1908 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1909 inShift = bitsleft > 0; 1910 } else { 1911 *out++ = (char) ch; 1912 } 1913 } else { 1914 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1915 *out++ = B64(charsleft << (6-bitsleft)); 1916 charsleft = 0; 1917 bitsleft = 0; 1918 /* Characters not in the BASE64 set implicitly unshift the sequence 1919 so no '-' is required, except if the character is itself a '-' */ 1920 if (B64CHAR(ch) || ch == '-') { 1921 *out++ = '-'; 1922 } 1923 inShift = 0; 1924 *out++ = (char) ch; 1925 } else { 1926 bitsleft += 16; 1927 charsleft = (charsleft << 16) | ch; 1928 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1929 1930 /* If the next character is special then we dont' need to terminate 1931 the shift sequence. If the next character is not a BASE64 character 1932 or '-' then the shift sequence will be terminated implicitly and we 1933 don't have to insert a '-'. */ 1934 1935 if (bitsleft == 0) { 1936 if (i + 1 < size) { 1937 Py_UNICODE ch2 = s[i+1]; 1938 1939 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1940 1941 } else if (B64CHAR(ch2) || ch2 == '-') { 1942 *out++ = '-'; 1943 inShift = 0; 1944 } else { 1945 inShift = 0; 1946 } 1947 1948 } 1949 else { 1950 *out++ = '-'; 1951 inShift = 0; 1952 } 1953 } 1954 } 1955 } 1956 } 1957 if (bitsleft) { 1958 *out++= B64(charsleft << (6-bitsleft) ); 1959 *out++ = '-'; 1960 } 1961 if (_PyBytes_Resize(&v, out - start) < 0) 1962 return NULL; 1963 return v; 1964} 1965 1966#undef SPECIAL 1967#undef B64 1968#undef B64CHAR 1969#undef UB64 1970#undef ENCODE 1971#undef DECODE 1972 1973/* --- UTF-8 Codec -------------------------------------------------------- */ 1974 1975static 1976char utf8_code_length[256] = { 1977 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1978 illegal prefix. see RFC 2279 for details */ 1979 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1980 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1981 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1982 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1983 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1984 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1985 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1986 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1987 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1988 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1989 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1990 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1991 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1992 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1993 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1994 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1995}; 1996 1997PyObject *PyUnicode_DecodeUTF8(const char *s, 1998 Py_ssize_t size, 1999 const char *errors) 2000{ 2001 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2002} 2003 2004/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2005#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2006 2007/* Mask to quickly check whether a C 'long' contains a 2008 non-ASCII, UTF8-encoded char. */ 2009#if (SIZEOF_LONG == 8) 2010# define ASCII_CHAR_MASK 0x8080808080808080L 2011#elif (SIZEOF_LONG == 4) 2012# define ASCII_CHAR_MASK 0x80808080L 2013#else 2014# error C 'long' size should be either 4 or 8! 2015#endif 2016 2017PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2018 Py_ssize_t size, 2019 const char *errors, 2020 Py_ssize_t *consumed) 2021{ 2022 const char *starts = s; 2023 int n; 2024 Py_ssize_t startinpos; 2025 Py_ssize_t endinpos; 2026 Py_ssize_t outpos; 2027 const char *e, *aligned_end; 2028 PyUnicodeObject *unicode; 2029 Py_UNICODE *p; 2030 const char *errmsg = ""; 2031 PyObject *errorHandler = NULL; 2032 PyObject *exc = NULL; 2033 2034 /* Note: size will always be longer than the resulting Unicode 2035 character count */ 2036 unicode = _PyUnicode_New(size); 2037 if (!unicode) 2038 return NULL; 2039 if (size == 0) { 2040 if (consumed) 2041 *consumed = 0; 2042 return (PyObject *)unicode; 2043 } 2044 2045 /* Unpack UTF-8 encoded data */ 2046 p = unicode->str; 2047 e = s + size; 2048 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2049 2050 while (s < e) { 2051 Py_UCS4 ch = (unsigned char)*s; 2052 2053 if (ch < 0x80) { 2054 /* Fast path for runs of ASCII characters. Given that common UTF-8 2055 input will consist of an overwhelming majority of ASCII 2056 characters, we try to optimize for this case by checking 2057 as many characters as a C 'long' can contain. 2058 First, check if we can do an aligned read, as most CPUs have 2059 a penalty for unaligned reads. 2060 */ 2061 if (!((size_t) s & LONG_PTR_MASK)) { 2062 /* Help register allocation */ 2063 register const char *_s = s; 2064 register Py_UNICODE *_p = p; 2065 while (_s < aligned_end) { 2066 /* Read a whole long at a time (either 4 or 8 bytes), 2067 and do a fast unrolled copy if it only contains ASCII 2068 characters. */ 2069 unsigned long data = *(unsigned long *) _s; 2070 if (data & ASCII_CHAR_MASK) 2071 break; 2072 _p[0] = (unsigned char) _s[0]; 2073 _p[1] = (unsigned char) _s[1]; 2074 _p[2] = (unsigned char) _s[2]; 2075 _p[3] = (unsigned char) _s[3]; 2076#if (SIZEOF_LONG == 8) 2077 _p[4] = (unsigned char) _s[4]; 2078 _p[5] = (unsigned char) _s[5]; 2079 _p[6] = (unsigned char) _s[6]; 2080 _p[7] = (unsigned char) _s[7]; 2081#endif 2082 _s += SIZEOF_LONG; 2083 _p += SIZEOF_LONG; 2084 } 2085 s = _s; 2086 p = _p; 2087 if (s == e) 2088 break; 2089 ch = (unsigned char)*s; 2090 } 2091 } 2092 2093 if (ch < 0x80) { 2094 *p++ = (Py_UNICODE)ch; 2095 s++; 2096 continue; 2097 } 2098 2099 n = utf8_code_length[ch]; 2100 2101 if (s + n > e) { 2102 if (consumed) 2103 break; 2104 else { 2105 errmsg = "unexpected end of data"; 2106 startinpos = s-starts; 2107 endinpos = size; 2108 goto utf8Error; 2109 } 2110 } 2111 2112 switch (n) { 2113 2114 case 0: 2115 errmsg = "unexpected code byte"; 2116 startinpos = s-starts; 2117 endinpos = startinpos+1; 2118 goto utf8Error; 2119 2120 case 1: 2121 errmsg = "internal error"; 2122 startinpos = s-starts; 2123 endinpos = startinpos+1; 2124 goto utf8Error; 2125 2126 case 2: 2127 if ((s[1] & 0xc0) != 0x80) { 2128 errmsg = "invalid data"; 2129 startinpos = s-starts; 2130 endinpos = startinpos+2; 2131 goto utf8Error; 2132 } 2133 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2134 if (ch < 0x80) { 2135 startinpos = s-starts; 2136 endinpos = startinpos+2; 2137 errmsg = "illegal encoding"; 2138 goto utf8Error; 2139 } 2140 else 2141 *p++ = (Py_UNICODE)ch; 2142 break; 2143 2144 case 3: 2145 if ((s[1] & 0xc0) != 0x80 || 2146 (s[2] & 0xc0) != 0x80) { 2147 errmsg = "invalid data"; 2148 startinpos = s-starts; 2149 endinpos = startinpos+3; 2150 goto utf8Error; 2151 } 2152 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2153 if (ch < 0x0800) { 2154 /* Note: UTF-8 encodings of surrogates are considered 2155 legal UTF-8 sequences; 2156 2157 XXX For wide builds (UCS-4) we should probably try 2158 to recombine the surrogates into a single code 2159 unit. 2160 */ 2161 errmsg = "illegal encoding"; 2162 startinpos = s-starts; 2163 endinpos = startinpos+3; 2164 goto utf8Error; 2165 } 2166 else 2167 *p++ = (Py_UNICODE)ch; 2168 break; 2169 2170 case 4: 2171 if ((s[1] & 0xc0) != 0x80 || 2172 (s[2] & 0xc0) != 0x80 || 2173 (s[3] & 0xc0) != 0x80) { 2174 errmsg = "invalid data"; 2175 startinpos = s-starts; 2176 endinpos = startinpos+4; 2177 goto utf8Error; 2178 } 2179 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2180 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2181 /* validate and convert to UTF-16 */ 2182 if ((ch < 0x10000) /* minimum value allowed for 4 2183 byte encoding */ 2184 || (ch > 0x10ffff)) /* maximum value allowed for 2185 UTF-16 */ 2186 { 2187 errmsg = "illegal encoding"; 2188 startinpos = s-starts; 2189 endinpos = startinpos+4; 2190 goto utf8Error; 2191 } 2192#ifdef Py_UNICODE_WIDE 2193 *p++ = (Py_UNICODE)ch; 2194#else 2195 /* compute and append the two surrogates: */ 2196 2197 /* translate from 10000..10FFFF to 0..FFFF */ 2198 ch -= 0x10000; 2199 2200 /* high surrogate = top 10 bits added to D800 */ 2201 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2202 2203 /* low surrogate = bottom 10 bits added to DC00 */ 2204 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2205#endif 2206 break; 2207 2208 default: 2209 /* Other sizes are only needed for UCS-4 */ 2210 errmsg = "unsupported Unicode code range"; 2211 startinpos = s-starts; 2212 endinpos = startinpos+n; 2213 goto utf8Error; 2214 } 2215 s += n; 2216 continue; 2217 2218 utf8Error: 2219 outpos = p-PyUnicode_AS_UNICODE(unicode); 2220 if (unicode_decode_call_errorhandler( 2221 errors, &errorHandler, 2222 "utf8", errmsg, 2223 &starts, &e, &startinpos, &endinpos, &exc, &s, 2224 &unicode, &outpos, &p)) 2225 goto onError; 2226 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2227 } 2228 if (consumed) 2229 *consumed = s-starts; 2230 2231 /* Adjust length */ 2232 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2233 goto onError; 2234 2235 Py_XDECREF(errorHandler); 2236 Py_XDECREF(exc); 2237 return (PyObject *)unicode; 2238 2239 onError: 2240 Py_XDECREF(errorHandler); 2241 Py_XDECREF(exc); 2242 Py_DECREF(unicode); 2243 return NULL; 2244} 2245 2246#undef ASCII_CHAR_MASK 2247 2248 2249/* Allocation strategy: if the string is short, convert into a stack buffer 2250 and allocate exactly as much space needed at the end. Else allocate the 2251 maximum possible needed (4 result bytes per Unicode character), and return 2252 the excess memory at the end. 2253*/ 2254PyObject * 2255PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2256 Py_ssize_t size, 2257 const char *errors) 2258{ 2259#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2260 2261 Py_ssize_t i; /* index into s of next input byte */ 2262 PyObject *result; /* result string object */ 2263 char *p; /* next free byte in output buffer */ 2264 Py_ssize_t nallocated; /* number of result bytes allocated */ 2265 Py_ssize_t nneeded; /* number of result bytes needed */ 2266 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2267 2268 assert(s != NULL); 2269 assert(size >= 0); 2270 2271 if (size <= MAX_SHORT_UNICHARS) { 2272 /* Write into the stack buffer; nallocated can't overflow. 2273 * At the end, we'll allocate exactly as much heap space as it 2274 * turns out we need. 2275 */ 2276 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2277 result = NULL; /* will allocate after we're done */ 2278 p = stackbuf; 2279 } 2280 else { 2281 /* Overallocate on the heap, and give the excess back at the end. */ 2282 nallocated = size * 4; 2283 if (nallocated / 4 != size) /* overflow! */ 2284 return PyErr_NoMemory(); 2285 result = PyBytes_FromStringAndSize(NULL, nallocated); 2286 if (result == NULL) 2287 return NULL; 2288 p = PyBytes_AS_STRING(result); 2289 } 2290 2291 for (i = 0; i < size;) { 2292 Py_UCS4 ch = s[i++]; 2293 2294 if (ch < 0x80) 2295 /* Encode ASCII */ 2296 *p++ = (char) ch; 2297 2298 else if (ch < 0x0800) { 2299 /* Encode Latin-1 */ 2300 *p++ = (char)(0xc0 | (ch >> 6)); 2301 *p++ = (char)(0x80 | (ch & 0x3f)); 2302 } 2303 else { 2304 /* Encode UCS2 Unicode ordinals */ 2305 if (ch < 0x10000) { 2306 /* Special case: check for high surrogate */ 2307 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2308 Py_UCS4 ch2 = s[i]; 2309 /* Check for low surrogate and combine the two to 2310 form a UCS4 value */ 2311 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2312 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2313 i++; 2314 goto encodeUCS4; 2315 } 2316 /* Fall through: handles isolated high surrogates */ 2317 } 2318 *p++ = (char)(0xe0 | (ch >> 12)); 2319 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2320 *p++ = (char)(0x80 | (ch & 0x3f)); 2321 continue; 2322 } 2323 encodeUCS4: 2324 /* Encode UCS4 Unicode ordinals */ 2325 *p++ = (char)(0xf0 | (ch >> 18)); 2326 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2327 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2328 *p++ = (char)(0x80 | (ch & 0x3f)); 2329 } 2330 } 2331 2332 if (result == NULL) { 2333 /* This was stack allocated. */ 2334 nneeded = p - stackbuf; 2335 assert(nneeded <= nallocated); 2336 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2337 } 2338 else { 2339 /* Cut back to size actually needed. */ 2340 nneeded = p - PyBytes_AS_STRING(result); 2341 assert(nneeded <= nallocated); 2342 _PyBytes_Resize(&result, nneeded); 2343 } 2344 return result; 2345 2346#undef MAX_SHORT_UNICHARS 2347} 2348 2349PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2350{ 2351 if (!PyUnicode_Check(unicode)) { 2352 PyErr_BadArgument(); 2353 return NULL; 2354 } 2355 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2356 PyUnicode_GET_SIZE(unicode), 2357 NULL); 2358} 2359 2360/* --- UTF-32 Codec ------------------------------------------------------- */ 2361 2362PyObject * 2363PyUnicode_DecodeUTF32(const char *s, 2364 Py_ssize_t size, 2365 const char *errors, 2366 int *byteorder) 2367{ 2368 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2369} 2370 2371PyObject * 2372PyUnicode_DecodeUTF32Stateful(const char *s, 2373 Py_ssize_t size, 2374 const char *errors, 2375 int *byteorder, 2376 Py_ssize_t *consumed) 2377{ 2378 const char *starts = s; 2379 Py_ssize_t startinpos; 2380 Py_ssize_t endinpos; 2381 Py_ssize_t outpos; 2382 PyUnicodeObject *unicode; 2383 Py_UNICODE *p; 2384#ifndef Py_UNICODE_WIDE 2385 int i, pairs; 2386#else 2387 const int pairs = 0; 2388#endif 2389 const unsigned char *q, *e; 2390 int bo = 0; /* assume native ordering by default */ 2391 const char *errmsg = ""; 2392 /* Offsets from q for retrieving bytes in the right order. */ 2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2394 int iorder[] = {0, 1, 2, 3}; 2395#else 2396 int iorder[] = {3, 2, 1, 0}; 2397#endif 2398 PyObject *errorHandler = NULL; 2399 PyObject *exc = NULL; 2400 /* On narrow builds we split characters outside the BMP into two 2401 codepoints => count how much extra space we need. */ 2402#ifndef Py_UNICODE_WIDE 2403 for (i = pairs = 0; i < size/4; i++) 2404 if (((Py_UCS4 *)s)[i] >= 0x10000) 2405 pairs++; 2406#endif 2407 2408 /* This might be one to much, because of a BOM */ 2409 unicode = _PyUnicode_New((size+3)/4+pairs); 2410 if (!unicode) 2411 return NULL; 2412 if (size == 0) 2413 return (PyObject *)unicode; 2414 2415 /* Unpack UTF-32 encoded data */ 2416 p = unicode->str; 2417 q = (unsigned char *)s; 2418 e = q + size; 2419 2420 if (byteorder) 2421 bo = *byteorder; 2422 2423 /* Check for BOM marks (U+FEFF) in the input and adjust current 2424 byte order setting accordingly. In native mode, the leading BOM 2425 mark is skipped, in all other modes, it is copied to the output 2426 stream as-is (giving a ZWNBSP character). */ 2427 if (bo == 0) { 2428 if (size >= 4) { 2429 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2430 (q[iorder[1]] << 8) | q[iorder[0]]; 2431#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2432 if (bom == 0x0000FEFF) { 2433 q += 4; 2434 bo = -1; 2435 } 2436 else if (bom == 0xFFFE0000) { 2437 q += 4; 2438 bo = 1; 2439 } 2440#else 2441 if (bom == 0x0000FEFF) { 2442 q += 4; 2443 bo = 1; 2444 } 2445 else if (bom == 0xFFFE0000) { 2446 q += 4; 2447 bo = -1; 2448 } 2449#endif 2450 } 2451 } 2452 2453 if (bo == -1) { 2454 /* force LE */ 2455 iorder[0] = 0; 2456 iorder[1] = 1; 2457 iorder[2] = 2; 2458 iorder[3] = 3; 2459 } 2460 else if (bo == 1) { 2461 /* force BE */ 2462 iorder[0] = 3; 2463 iorder[1] = 2; 2464 iorder[2] = 1; 2465 iorder[3] = 0; 2466 } 2467 2468 while (q < e) { 2469 Py_UCS4 ch; 2470 /* remaining bytes at the end? (size should be divisible by 4) */ 2471 if (e-q<4) { 2472 if (consumed) 2473 break; 2474 errmsg = "truncated data"; 2475 startinpos = ((const char *)q)-starts; 2476 endinpos = ((const char *)e)-starts; 2477 goto utf32Error; 2478 /* The remaining input chars are ignored if the callback 2479 chooses to skip the input */ 2480 } 2481 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2482 (q[iorder[1]] << 8) | q[iorder[0]]; 2483 2484 if (ch >= 0x110000) 2485 { 2486 errmsg = "codepoint not in range(0x110000)"; 2487 startinpos = ((const char *)q)-starts; 2488 endinpos = startinpos+4; 2489 goto utf32Error; 2490 } 2491#ifndef Py_UNICODE_WIDE 2492 if (ch >= 0x10000) 2493 { 2494 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2495 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2496 } 2497 else 2498#endif 2499 *p++ = ch; 2500 q += 4; 2501 continue; 2502 utf32Error: 2503 outpos = p-PyUnicode_AS_UNICODE(unicode); 2504 if (unicode_decode_call_errorhandler( 2505 errors, &errorHandler, 2506 "utf32", errmsg, 2507 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2508 &unicode, &outpos, &p)) 2509 goto onError; 2510 } 2511 2512 if (byteorder) 2513 *byteorder = bo; 2514 2515 if (consumed) 2516 *consumed = (const char *)q-starts; 2517 2518 /* Adjust length */ 2519 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2520 goto onError; 2521 2522 Py_XDECREF(errorHandler); 2523 Py_XDECREF(exc); 2524 return (PyObject *)unicode; 2525 2526 onError: 2527 Py_DECREF(unicode); 2528 Py_XDECREF(errorHandler); 2529 Py_XDECREF(exc); 2530 return NULL; 2531} 2532 2533PyObject * 2534PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2535 Py_ssize_t size, 2536 const char *errors, 2537 int byteorder) 2538{ 2539 PyObject *v; 2540 unsigned char *p; 2541 Py_ssize_t nsize, bytesize; 2542#ifndef Py_UNICODE_WIDE 2543 Py_ssize_t i, pairs; 2544#else 2545 const int pairs = 0; 2546#endif 2547 /* Offsets from p for storing byte pairs in the right order. */ 2548#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2549 int iorder[] = {0, 1, 2, 3}; 2550#else 2551 int iorder[] = {3, 2, 1, 0}; 2552#endif 2553 2554#define STORECHAR(CH) \ 2555 do { \ 2556 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2557 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2558 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2559 p[iorder[0]] = (CH) & 0xff; \ 2560 p += 4; \ 2561 } while(0) 2562 2563 /* In narrow builds we can output surrogate pairs as one codepoint, 2564 so we need less space. */ 2565#ifndef Py_UNICODE_WIDE 2566 for (i = pairs = 0; i < size-1; i++) 2567 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2568 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2569 pairs++; 2570#endif 2571 nsize = (size - pairs + (byteorder == 0)); 2572 bytesize = nsize * 4; 2573 if (bytesize / 4 != nsize) 2574 return PyErr_NoMemory(); 2575 v = PyBytes_FromStringAndSize(NULL, bytesize); 2576 if (v == NULL) 2577 return NULL; 2578 2579 p = (unsigned char *)PyBytes_AS_STRING(v); 2580 if (byteorder == 0) 2581 STORECHAR(0xFEFF); 2582 if (size == 0) 2583 goto done; 2584 2585 if (byteorder == -1) { 2586 /* force LE */ 2587 iorder[0] = 0; 2588 iorder[1] = 1; 2589 iorder[2] = 2; 2590 iorder[3] = 3; 2591 } 2592 else if (byteorder == 1) { 2593 /* force BE */ 2594 iorder[0] = 3; 2595 iorder[1] = 2; 2596 iorder[2] = 1; 2597 iorder[3] = 0; 2598 } 2599 2600 while (size-- > 0) { 2601 Py_UCS4 ch = *s++; 2602#ifndef Py_UNICODE_WIDE 2603 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2604 Py_UCS4 ch2 = *s; 2605 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2606 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2607 s++; 2608 size--; 2609 } 2610 } 2611#endif 2612 STORECHAR(ch); 2613 } 2614 2615 done: 2616 return v; 2617#undef STORECHAR 2618} 2619 2620PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2621{ 2622 if (!PyUnicode_Check(unicode)) { 2623 PyErr_BadArgument(); 2624 return NULL; 2625 } 2626 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2627 PyUnicode_GET_SIZE(unicode), 2628 NULL, 2629 0); 2630} 2631 2632/* --- UTF-16 Codec ------------------------------------------------------- */ 2633 2634PyObject * 2635PyUnicode_DecodeUTF16(const char *s, 2636 Py_ssize_t size, 2637 const char *errors, 2638 int *byteorder) 2639{ 2640 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2641} 2642 2643/* Two masks for fast checking of whether a C 'long' may contain 2644 UTF16-encoded surrogate characters. This is an efficient heuristic, 2645 assuming that non-surrogate characters with a code point >= 0x8000 are 2646 rare in most input. 2647 FAST_CHAR_MASK is used when the input is in native byte ordering, 2648 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 2649*/ 2650#if (SIZEOF_LONG == 8) 2651# define FAST_CHAR_MASK 0x8000800080008000L 2652# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 2653#elif (SIZEOF_LONG == 4) 2654# define FAST_CHAR_MASK 0x80008000L 2655# define SWAPPED_FAST_CHAR_MASK 0x00800080L 2656#else 2657# error C 'long' size should be either 4 or 8! 2658#endif 2659 2660PyObject * 2661PyUnicode_DecodeUTF16Stateful(const char *s, 2662 Py_ssize_t size, 2663 const char *errors, 2664 int *byteorder, 2665 Py_ssize_t *consumed) 2666{ 2667 const char *starts = s; 2668 Py_ssize_t startinpos; 2669 Py_ssize_t endinpos; 2670 Py_ssize_t outpos; 2671 PyUnicodeObject *unicode; 2672 Py_UNICODE *p; 2673 const unsigned char *q, *e, *aligned_end; 2674 int bo = 0; /* assume native ordering by default */ 2675 int native_ordering = 0; 2676 const char *errmsg = ""; 2677 /* Offsets from q for retrieving byte pairs in the right order. */ 2678#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2679 int ihi = 1, ilo = 0; 2680#else 2681 int ihi = 0, ilo = 1; 2682#endif 2683 PyObject *errorHandler = NULL; 2684 PyObject *exc = NULL; 2685 2686 /* Note: size will always be longer than the resulting Unicode 2687 character count */ 2688 unicode = _PyUnicode_New(size); 2689 if (!unicode) 2690 return NULL; 2691 if (size == 0) 2692 return (PyObject *)unicode; 2693 2694 /* Unpack UTF-16 encoded data */ 2695 p = unicode->str; 2696 q = (unsigned char *)s; 2697 e = q + size - 1; 2698 2699 if (byteorder) 2700 bo = *byteorder; 2701 2702 /* Check for BOM marks (U+FEFF) in the input and adjust current 2703 byte order setting accordingly. In native mode, the leading BOM 2704 mark is skipped, in all other modes, it is copied to the output 2705 stream as-is (giving a ZWNBSP character). */ 2706 if (bo == 0) { 2707 if (size >= 2) { 2708 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2709#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2710 if (bom == 0xFEFF) { 2711 q += 2; 2712 bo = -1; 2713 } 2714 else if (bom == 0xFFFE) { 2715 q += 2; 2716 bo = 1; 2717 } 2718#else 2719 if (bom == 0xFEFF) { 2720 q += 2; 2721 bo = 1; 2722 } 2723 else if (bom == 0xFFFE) { 2724 q += 2; 2725 bo = -1; 2726 } 2727#endif 2728 } 2729 } 2730 2731 if (bo == -1) { 2732 /* force LE */ 2733 ihi = 1; 2734 ilo = 0; 2735 } 2736 else if (bo == 1) { 2737 /* force BE */ 2738 ihi = 0; 2739 ilo = 1; 2740 } 2741#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2742 native_ordering = ilo < ihi; 2743#else 2744 native_ordering = ilo > ihi; 2745#endif 2746 2747 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 2748 while (q < e) { 2749 Py_UNICODE ch; 2750 /* First check for possible aligned read of a C 'long'. Unaligned 2751 reads are more expensive, better to defer to another iteration. */ 2752 if (!((size_t) q & LONG_PTR_MASK)) { 2753 /* Fast path for runs of non-surrogate chars. */ 2754 register const unsigned char *_q = q; 2755 Py_UNICODE *_p = p; 2756 if (native_ordering) { 2757 /* Native ordering is simple: as long as the input cannot 2758 possibly contain a surrogate char, do an unrolled copy 2759 of several 16-bit code points to the target object. 2760 The non-surrogate check is done on several input bytes 2761 at a time (as many as a C 'long' can contain). */ 2762 while (_q < aligned_end) { 2763 unsigned long data = * (unsigned long *) _q; 2764 if (data & FAST_CHAR_MASK) 2765 break; 2766 _p[0] = ((unsigned short *) _q)[0]; 2767 _p[1] = ((unsigned short *) _q)[1]; 2768#if (SIZEOF_LONG == 8) 2769 _p[2] = ((unsigned short *) _q)[2]; 2770 _p[3] = ((unsigned short *) _q)[3]; 2771#endif 2772 _q += SIZEOF_LONG; 2773 _p += SIZEOF_LONG / 2; 2774 } 2775 } 2776 else { 2777 /* Byteswapped ordering is similar, but we must decompose 2778 the copy bytewise, and take care of zero'ing out the 2779 upper bytes if the target object is in 32-bit units 2780 (that is, in UCS-4 builds). */ 2781 while (_q < aligned_end) { 2782 unsigned long data = * (unsigned long *) _q; 2783 if (data & SWAPPED_FAST_CHAR_MASK) 2784 break; 2785 /* Zero upper bytes in UCS-4 builds */ 2786#if (Py_UNICODE_SIZE > 2) 2787 _p[0] = 0; 2788 _p[1] = 0; 2789#if (SIZEOF_LONG == 8) 2790 _p[2] = 0; 2791 _p[3] = 0; 2792#endif 2793#endif 2794 /* Issue #4916; UCS-4 builds on big endian machines must 2795 fill the two last bytes of each 4-byte unit. */ 2796#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 2797# define OFF 2 2798#else 2799# define OFF 0 2800#endif 2801 ((unsigned char *) _p)[OFF + 1] = _q[0]; 2802 ((unsigned char *) _p)[OFF + 0] = _q[1]; 2803 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 2804 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 2805#if (SIZEOF_LONG == 8) 2806 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 2807 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 2808 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 2809 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 2810#endif 2811#undef OFF 2812 _q += SIZEOF_LONG; 2813 _p += SIZEOF_LONG / 2; 2814 } 2815 } 2816 p = _p; 2817 q = _q; 2818 if (q >= e) 2819 break; 2820 } 2821 ch = (q[ihi] << 8) | q[ilo]; 2822 2823 q += 2; 2824 2825 if (ch < 0xD800 || ch > 0xDFFF) { 2826 *p++ = ch; 2827 continue; 2828 } 2829 2830 /* UTF-16 code pair: */ 2831 if (q > e) { 2832 errmsg = "unexpected end of data"; 2833 startinpos = (((const char *)q) - 2) - starts; 2834 endinpos = ((const char *)e) + 1 - starts; 2835 goto utf16Error; 2836 } 2837 if (0xD800 <= ch && ch <= 0xDBFF) { 2838 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2839 q += 2; 2840 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2841#ifndef Py_UNICODE_WIDE 2842 *p++ = ch; 2843 *p++ = ch2; 2844#else 2845 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2846#endif 2847 continue; 2848 } 2849 else { 2850 errmsg = "illegal UTF-16 surrogate"; 2851 startinpos = (((const char *)q)-4)-starts; 2852 endinpos = startinpos+2; 2853 goto utf16Error; 2854 } 2855 2856 } 2857 errmsg = "illegal encoding"; 2858 startinpos = (((const char *)q)-2)-starts; 2859 endinpos = startinpos+2; 2860 /* Fall through to report the error */ 2861 2862 utf16Error: 2863 outpos = p - PyUnicode_AS_UNICODE(unicode); 2864 if (unicode_decode_call_errorhandler( 2865 errors, 2866 &errorHandler, 2867 "utf16", errmsg, 2868 &starts, 2869 (const char **)&e, 2870 &startinpos, 2871 &endinpos, 2872 &exc, 2873 (const char **)&q, 2874 &unicode, 2875 &outpos, 2876 &p)) 2877 goto onError; 2878 } 2879 /* remaining byte at the end? (size should be even) */ 2880 if (e == q) { 2881 if (!consumed) { 2882 errmsg = "truncated data"; 2883 startinpos = ((const char *)q) - starts; 2884 endinpos = ((const char *)e) + 1 - starts; 2885 outpos = p - PyUnicode_AS_UNICODE(unicode); 2886 if (unicode_decode_call_errorhandler( 2887 errors, 2888 &errorHandler, 2889 "utf16", errmsg, 2890 &starts, 2891 (const char **)&e, 2892 &startinpos, 2893 &endinpos, 2894 &exc, 2895 (const char **)&q, 2896 &unicode, 2897 &outpos, 2898 &p)) 2899 goto onError; 2900 /* The remaining input chars are ignored if the callback 2901 chooses to skip the input */ 2902 } 2903 } 2904 2905 if (byteorder) 2906 *byteorder = bo; 2907 2908 if (consumed) 2909 *consumed = (const char *)q-starts; 2910 2911 /* Adjust length */ 2912 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2913 goto onError; 2914 2915 Py_XDECREF(errorHandler); 2916 Py_XDECREF(exc); 2917 return (PyObject *)unicode; 2918 2919 onError: 2920 Py_DECREF(unicode); 2921 Py_XDECREF(errorHandler); 2922 Py_XDECREF(exc); 2923 return NULL; 2924} 2925 2926#undef FAST_CHAR_MASK 2927#undef SWAPPED_FAST_CHAR_MASK 2928 2929PyObject * 2930PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2931 Py_ssize_t size, 2932 const char *errors, 2933 int byteorder) 2934{ 2935 PyObject *v; 2936 unsigned char *p; 2937 Py_ssize_t nsize, bytesize; 2938#ifdef Py_UNICODE_WIDE 2939 Py_ssize_t i, pairs; 2940#else 2941 const int pairs = 0; 2942#endif 2943 /* Offsets from p for storing byte pairs in the right order. */ 2944#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2945 int ihi = 1, ilo = 0; 2946#else 2947 int ihi = 0, ilo = 1; 2948#endif 2949 2950#define STORECHAR(CH) \ 2951 do { \ 2952 p[ihi] = ((CH) >> 8) & 0xff; \ 2953 p[ilo] = (CH) & 0xff; \ 2954 p += 2; \ 2955 } while(0) 2956 2957#ifdef Py_UNICODE_WIDE 2958 for (i = pairs = 0; i < size; i++) 2959 if (s[i] >= 0x10000) 2960 pairs++; 2961#endif 2962 /* 2 * (size + pairs + (byteorder == 0)) */ 2963 if (size > PY_SSIZE_T_MAX || 2964 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 2965 return PyErr_NoMemory(); 2966 nsize = size + pairs + (byteorder == 0); 2967 bytesize = nsize * 2; 2968 if (bytesize / 2 != nsize) 2969 return PyErr_NoMemory(); 2970 v = PyBytes_FromStringAndSize(NULL, bytesize); 2971 if (v == NULL) 2972 return NULL; 2973 2974 p = (unsigned char *)PyBytes_AS_STRING(v); 2975 if (byteorder == 0) 2976 STORECHAR(0xFEFF); 2977 if (size == 0) 2978 goto done; 2979 2980 if (byteorder == -1) { 2981 /* force LE */ 2982 ihi = 1; 2983 ilo = 0; 2984 } 2985 else if (byteorder == 1) { 2986 /* force BE */ 2987 ihi = 0; 2988 ilo = 1; 2989 } 2990 2991 while (size-- > 0) { 2992 Py_UNICODE ch = *s++; 2993 Py_UNICODE ch2 = 0; 2994#ifdef Py_UNICODE_WIDE 2995 if (ch >= 0x10000) { 2996 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2997 ch = 0xD800 | ((ch-0x10000) >> 10); 2998 } 2999#endif 3000 STORECHAR(ch); 3001 if (ch2) 3002 STORECHAR(ch2); 3003 } 3004 3005 done: 3006 return v; 3007#undef STORECHAR 3008} 3009 3010PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3011{ 3012 if (!PyUnicode_Check(unicode)) { 3013 PyErr_BadArgument(); 3014 return NULL; 3015 } 3016 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3017 PyUnicode_GET_SIZE(unicode), 3018 NULL, 3019 0); 3020} 3021 3022/* --- Unicode Escape Codec ----------------------------------------------- */ 3023 3024static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3025 3026PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 3027 Py_ssize_t size, 3028 const char *errors) 3029{ 3030 const char *starts = s; 3031 Py_ssize_t startinpos; 3032 Py_ssize_t endinpos; 3033 Py_ssize_t outpos; 3034 int i; 3035 PyUnicodeObject *v; 3036 Py_UNICODE *p; 3037 const char *end; 3038 char* message; 3039 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3040 PyObject *errorHandler = NULL; 3041 PyObject *exc = NULL; 3042 3043 /* Escaped strings will always be longer than the resulting 3044 Unicode string, so we start with size here and then reduce the 3045 length after conversion to the true value. 3046 (but if the error callback returns a long replacement string 3047 we'll have to allocate more space) */ 3048 v = _PyUnicode_New(size); 3049 if (v == NULL) 3050 goto onError; 3051 if (size == 0) 3052 return (PyObject *)v; 3053 3054 p = PyUnicode_AS_UNICODE(v); 3055 end = s + size; 3056 3057 while (s < end) { 3058 unsigned char c; 3059 Py_UNICODE x; 3060 int digits; 3061 3062 /* Non-escape characters are interpreted as Unicode ordinals */ 3063 if (*s != '\\') { 3064 *p++ = (unsigned char) *s++; 3065 continue; 3066 } 3067 3068 startinpos = s-starts; 3069 /* \ - Escapes */ 3070 s++; 3071 c = *s++; 3072 if (s > end) 3073 c = '\0'; /* Invalid after \ */ 3074 switch (c) { 3075 3076 /* \x escapes */ 3077 case '\n': break; 3078 case '\\': *p++ = '\\'; break; 3079 case '\'': *p++ = '\''; break; 3080 case '\"': *p++ = '\"'; break; 3081 case 'b': *p++ = '\b'; break; 3082 case 'f': *p++ = '\014'; break; /* FF */ 3083 case 't': *p++ = '\t'; break; 3084 case 'n': *p++ = '\n'; break; 3085 case 'r': *p++ = '\r'; break; 3086 case 'v': *p++ = '\013'; break; /* VT */ 3087 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3088 3089 /* \OOO (octal) escapes */ 3090 case '0': case '1': case '2': case '3': 3091 case '4': case '5': case '6': case '7': 3092 x = s[-1] - '0'; 3093 if (s < end && '0' <= *s && *s <= '7') { 3094 x = (x<<3) + *s++ - '0'; 3095 if (s < end && '0' <= *s && *s <= '7') 3096 x = (x<<3) + *s++ - '0'; 3097 } 3098 *p++ = x; 3099 break; 3100 3101 /* hex escapes */ 3102 /* \xXX */ 3103 case 'x': 3104 digits = 2; 3105 message = "truncated \\xXX escape"; 3106 goto hexescape; 3107 3108 /* \uXXXX */ 3109 case 'u': 3110 digits = 4; 3111 message = "truncated \\uXXXX escape"; 3112 goto hexescape; 3113 3114 /* \UXXXXXXXX */ 3115 case 'U': 3116 digits = 8; 3117 message = "truncated \\UXXXXXXXX escape"; 3118 hexescape: 3119 chr = 0; 3120 outpos = p-PyUnicode_AS_UNICODE(v); 3121 if (s+digits>end) { 3122 endinpos = size; 3123 if (unicode_decode_call_errorhandler( 3124 errors, &errorHandler, 3125 "unicodeescape", "end of string in escape sequence", 3126 &starts, &end, &startinpos, &endinpos, &exc, &s, 3127 &v, &outpos, &p)) 3128 goto onError; 3129 goto nextByte; 3130 } 3131 for (i = 0; i < digits; ++i) { 3132 c = (unsigned char) s[i]; 3133 if (!ISXDIGIT(c)) { 3134 endinpos = (s+i+1)-starts; 3135 if (unicode_decode_call_errorhandler( 3136 errors, &errorHandler, 3137 "unicodeescape", message, 3138 &starts, &end, &startinpos, &endinpos, &exc, &s, 3139 &v, &outpos, &p)) 3140 goto onError; 3141 goto nextByte; 3142 } 3143 chr = (chr<<4) & ~0xF; 3144 if (c >= '0' && c <= '9') 3145 chr += c - '0'; 3146 else if (c >= 'a' && c <= 'f') 3147 chr += 10 + c - 'a'; 3148 else 3149 chr += 10 + c - 'A'; 3150 } 3151 s += i; 3152 if (chr == 0xffffffff && PyErr_Occurred()) 3153 /* _decoding_error will have already written into the 3154 target buffer. */ 3155 break; 3156 store: 3157 /* when we get here, chr is a 32-bit unicode character */ 3158 if (chr <= 0xffff) 3159 /* UCS-2 character */ 3160 *p++ = (Py_UNICODE) chr; 3161 else if (chr <= 0x10ffff) { 3162 /* UCS-4 character. Either store directly, or as 3163 surrogate pair. */ 3164#ifdef Py_UNICODE_WIDE 3165 *p++ = chr; 3166#else 3167 chr -= 0x10000L; 3168 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3169 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3170#endif 3171 } else { 3172 endinpos = s-starts; 3173 outpos = p-PyUnicode_AS_UNICODE(v); 3174 if (unicode_decode_call_errorhandler( 3175 errors, &errorHandler, 3176 "unicodeescape", "illegal Unicode character", 3177 &starts, &end, &startinpos, &endinpos, &exc, &s, 3178 &v, &outpos, &p)) 3179 goto onError; 3180 } 3181 break; 3182 3183 /* \N{name} */ 3184 case 'N': 3185 message = "malformed \\N character escape"; 3186 if (ucnhash_CAPI == NULL) { 3187 /* load the unicode data module */ 3188 PyObject *m, *api; 3189 m = PyImport_ImportModuleNoBlock("unicodedata"); 3190 if (m == NULL) 3191 goto ucnhashError; 3192 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 3193 Py_DECREF(m); 3194 if (api == NULL) 3195 goto ucnhashError; 3196 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 3197 Py_DECREF(api); 3198 if (ucnhash_CAPI == NULL) 3199 goto ucnhashError; 3200 } 3201 if (*s == '{') { 3202 const char *start = s+1; 3203 /* look for the closing brace */ 3204 while (*s != '}' && s < end) 3205 s++; 3206 if (s > start && s < end && *s == '}') { 3207 /* found a name. look it up in the unicode database */ 3208 message = "unknown Unicode character name"; 3209 s++; 3210 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3211 goto store; 3212 } 3213 } 3214 endinpos = s-starts; 3215 outpos = p-PyUnicode_AS_UNICODE(v); 3216 if (unicode_decode_call_errorhandler( 3217 errors, &errorHandler, 3218 "unicodeescape", message, 3219 &starts, &end, &startinpos, &endinpos, &exc, &s, 3220 &v, &outpos, &p)) 3221 goto onError; 3222 break; 3223 3224 default: 3225 if (s > end) { 3226 message = "\\ at end of string"; 3227 s--; 3228 endinpos = s-starts; 3229 outpos = p-PyUnicode_AS_UNICODE(v); 3230 if (unicode_decode_call_errorhandler( 3231 errors, &errorHandler, 3232 "unicodeescape", message, 3233 &starts, &end, &startinpos, &endinpos, &exc, &s, 3234 &v, &outpos, &p)) 3235 goto onError; 3236 } 3237 else { 3238 *p++ = '\\'; 3239 *p++ = (unsigned char)s[-1]; 3240 } 3241 break; 3242 } 3243 nextByte: 3244 ; 3245 } 3246 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3247 goto onError; 3248 Py_XDECREF(errorHandler); 3249 Py_XDECREF(exc); 3250 return (PyObject *)v; 3251 3252 ucnhashError: 3253 PyErr_SetString( 3254 PyExc_UnicodeError, 3255 "\\N escapes not supported (can't load unicodedata module)" 3256 ); 3257 Py_XDECREF(v); 3258 Py_XDECREF(errorHandler); 3259 Py_XDECREF(exc); 3260 return NULL; 3261 3262 onError: 3263 Py_XDECREF(v); 3264 Py_XDECREF(errorHandler); 3265 Py_XDECREF(exc); 3266 return NULL; 3267} 3268 3269/* Return a Unicode-Escape string version of the Unicode object. 3270 3271 If quotes is true, the string is enclosed in u"" or u'' quotes as 3272 appropriate. 3273 3274*/ 3275 3276Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3277 Py_ssize_t size, 3278 Py_UNICODE ch) 3279{ 3280 /* like wcschr, but doesn't stop at NULL characters */ 3281 3282 while (size-- > 0) { 3283 if (*s == ch) 3284 return s; 3285 s++; 3286 } 3287 3288 return NULL; 3289} 3290 3291static const char *hexdigits = "0123456789abcdef"; 3292 3293PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3294 Py_ssize_t size) 3295{ 3296 PyObject *repr; 3297 char *p; 3298 3299#ifdef Py_UNICODE_WIDE 3300 const Py_ssize_t expandsize = 10; 3301#else 3302 const Py_ssize_t expandsize = 6; 3303#endif 3304 3305 /* XXX(nnorwitz): rather than over-allocating, it would be 3306 better to choose a different scheme. Perhaps scan the 3307 first N-chars of the string and allocate based on that size. 3308 */ 3309 /* Initial allocation is based on the longest-possible unichr 3310 escape. 3311 3312 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3313 unichr, so in this case it's the longest unichr escape. In 3314 narrow (UTF-16) builds this is five chars per source unichr 3315 since there are two unichrs in the surrogate pair, so in narrow 3316 (UTF-16) builds it's not the longest unichr escape. 3317 3318 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3319 so in the narrow (UTF-16) build case it's the longest unichr 3320 escape. 3321 */ 3322 3323 if (size == 0) 3324 return PyBytes_FromStringAndSize(NULL, 0); 3325 3326 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3327 return PyErr_NoMemory(); 3328 3329 repr = PyBytes_FromStringAndSize(NULL, 3330 2 3331 + expandsize*size 3332 + 1); 3333 if (repr == NULL) 3334 return NULL; 3335 3336 p = PyBytes_AS_STRING(repr); 3337 3338 while (size-- > 0) { 3339 Py_UNICODE ch = *s++; 3340 3341 /* Escape backslashes */ 3342 if (ch == '\\') { 3343 *p++ = '\\'; 3344 *p++ = (char) ch; 3345 continue; 3346 } 3347 3348#ifdef Py_UNICODE_WIDE 3349 /* Map 21-bit characters to '\U00xxxxxx' */ 3350 else if (ch >= 0x10000) { 3351 *p++ = '\\'; 3352 *p++ = 'U'; 3353 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3354 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3355 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3356 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3357 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3358 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3359 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3360 *p++ = hexdigits[ch & 0x0000000F]; 3361 continue; 3362 } 3363#else 3364 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3365 else if (ch >= 0xD800 && ch < 0xDC00) { 3366 Py_UNICODE ch2; 3367 Py_UCS4 ucs; 3368 3369 ch2 = *s++; 3370 size--; 3371 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3372 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3373 *p++ = '\\'; 3374 *p++ = 'U'; 3375 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3376 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3377 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3378 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3379 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3380 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3381 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3382 *p++ = hexdigits[ucs & 0x0000000F]; 3383 continue; 3384 } 3385 /* Fall through: isolated surrogates are copied as-is */ 3386 s--; 3387 size++; 3388 } 3389#endif 3390 3391 /* Map 16-bit characters to '\uxxxx' */ 3392 if (ch >= 256) { 3393 *p++ = '\\'; 3394 *p++ = 'u'; 3395 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3396 *p++ = hexdigits[(ch >> 8) & 0x000F]; 3397 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3398 *p++ = hexdigits[ch & 0x000F]; 3399 } 3400 3401 /* Map special whitespace to '\t', \n', '\r' */ 3402 else if (ch == '\t') { 3403 *p++ = '\\'; 3404 *p++ = 't'; 3405 } 3406 else if (ch == '\n') { 3407 *p++ = '\\'; 3408 *p++ = 'n'; 3409 } 3410 else if (ch == '\r') { 3411 *p++ = '\\'; 3412 *p++ = 'r'; 3413 } 3414 3415 /* Map non-printable US ASCII to '\xhh' */ 3416 else if (ch < ' ' || ch >= 0x7F) { 3417 *p++ = '\\'; 3418 *p++ = 'x'; 3419 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3420 *p++ = hexdigits[ch & 0x000F]; 3421 } 3422 3423 /* Copy everything else as-is */ 3424 else 3425 *p++ = (char) ch; 3426 } 3427 3428 assert(p - PyBytes_AS_STRING(repr) > 0); 3429 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 3430 return NULL; 3431 return repr; 3432} 3433 3434PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3435{ 3436 PyObject *s; 3437 if (!PyUnicode_Check(unicode)) { 3438 PyErr_BadArgument(); 3439 return NULL; 3440 } 3441 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3442 PyUnicode_GET_SIZE(unicode)); 3443 return s; 3444} 3445 3446/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3447 3448PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3449 Py_ssize_t size, 3450 const char *errors) 3451{ 3452 const char *starts = s; 3453 Py_ssize_t startinpos; 3454 Py_ssize_t endinpos; 3455 Py_ssize_t outpos; 3456 PyUnicodeObject *v; 3457 Py_UNICODE *p; 3458 const char *end; 3459 const char *bs; 3460 PyObject *errorHandler = NULL; 3461 PyObject *exc = NULL; 3462 3463 /* Escaped strings will always be longer than the resulting 3464 Unicode string, so we start with size here and then reduce the 3465 length after conversion to the true value. (But decoding error 3466 handler might have to resize the string) */ 3467 v = _PyUnicode_New(size); 3468 if (v == NULL) 3469 goto onError; 3470 if (size == 0) 3471 return (PyObject *)v; 3472 p = PyUnicode_AS_UNICODE(v); 3473 end = s + size; 3474 while (s < end) { 3475 unsigned char c; 3476 Py_UCS4 x; 3477 int i; 3478 int count; 3479 3480 /* Non-escape characters are interpreted as Unicode ordinals */ 3481 if (*s != '\\') { 3482 *p++ = (unsigned char)*s++; 3483 continue; 3484 } 3485 startinpos = s-starts; 3486 3487 /* \u-escapes are only interpreted iff the number of leading 3488 backslashes if odd */ 3489 bs = s; 3490 for (;s < end;) { 3491 if (*s != '\\') 3492 break; 3493 *p++ = (unsigned char)*s++; 3494 } 3495 if (((s - bs) & 1) == 0 || 3496 s >= end || 3497 (*s != 'u' && *s != 'U')) { 3498 continue; 3499 } 3500 p--; 3501 count = *s=='u' ? 4 : 8; 3502 s++; 3503 3504 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3505 outpos = p-PyUnicode_AS_UNICODE(v); 3506 for (x = 0, i = 0; i < count; ++i, ++s) { 3507 c = (unsigned char)*s; 3508 if (!ISXDIGIT(c)) { 3509 endinpos = s-starts; 3510 if (unicode_decode_call_errorhandler( 3511 errors, &errorHandler, 3512 "rawunicodeescape", "truncated \\uXXXX", 3513 &starts, &end, &startinpos, &endinpos, &exc, &s, 3514 &v, &outpos, &p)) 3515 goto onError; 3516 goto nextByte; 3517 } 3518 x = (x<<4) & ~0xF; 3519 if (c >= '0' && c <= '9') 3520 x += c - '0'; 3521 else if (c >= 'a' && c <= 'f') 3522 x += 10 + c - 'a'; 3523 else 3524 x += 10 + c - 'A'; 3525 } 3526 if (x <= 0xffff) 3527 /* UCS-2 character */ 3528 *p++ = (Py_UNICODE) x; 3529 else if (x <= 0x10ffff) { 3530 /* UCS-4 character. Either store directly, or as 3531 surrogate pair. */ 3532#ifdef Py_UNICODE_WIDE 3533 *p++ = (Py_UNICODE) x; 3534#else 3535 x -= 0x10000L; 3536 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3537 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3538#endif 3539 } else { 3540 endinpos = s-starts; 3541 outpos = p-PyUnicode_AS_UNICODE(v); 3542 if (unicode_decode_call_errorhandler( 3543 errors, &errorHandler, 3544 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3545 &starts, &end, &startinpos, &endinpos, &exc, &s, 3546 &v, &outpos, &p)) 3547 goto onError; 3548 } 3549 nextByte: 3550 ; 3551 } 3552 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3553 goto onError; 3554 Py_XDECREF(errorHandler); 3555 Py_XDECREF(exc); 3556 return (PyObject *)v; 3557 3558 onError: 3559 Py_XDECREF(v); 3560 Py_XDECREF(errorHandler); 3561 Py_XDECREF(exc); 3562 return NULL; 3563} 3564 3565PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3566 Py_ssize_t size) 3567{ 3568 PyObject *repr; 3569 char *p; 3570 char *q; 3571 3572#ifdef Py_UNICODE_WIDE 3573 const Py_ssize_t expandsize = 10; 3574#else 3575 const Py_ssize_t expandsize = 6; 3576#endif 3577 3578 if (size > PY_SSIZE_T_MAX / expandsize) 3579 return PyErr_NoMemory(); 3580 3581 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 3582 if (repr == NULL) 3583 return NULL; 3584 if (size == 0) 3585 return repr; 3586 3587 p = q = PyBytes_AS_STRING(repr); 3588 while (size-- > 0) { 3589 Py_UNICODE ch = *s++; 3590#ifdef Py_UNICODE_WIDE 3591 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3592 if (ch >= 0x10000) { 3593 *p++ = '\\'; 3594 *p++ = 'U'; 3595 *p++ = hexdigits[(ch >> 28) & 0xf]; 3596 *p++ = hexdigits[(ch >> 24) & 0xf]; 3597 *p++ = hexdigits[(ch >> 20) & 0xf]; 3598 *p++ = hexdigits[(ch >> 16) & 0xf]; 3599 *p++ = hexdigits[(ch >> 12) & 0xf]; 3600 *p++ = hexdigits[(ch >> 8) & 0xf]; 3601 *p++ = hexdigits[(ch >> 4) & 0xf]; 3602 *p++ = hexdigits[ch & 15]; 3603 } 3604 else 3605#else 3606 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3607 if (ch >= 0xD800 && ch < 0xDC00) { 3608 Py_UNICODE ch2; 3609 Py_UCS4 ucs; 3610 3611 ch2 = *s++; 3612 size--; 3613 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3614 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3615 *p++ = '\\'; 3616 *p++ = 'U'; 3617 *p++ = hexdigits[(ucs >> 28) & 0xf]; 3618 *p++ = hexdigits[(ucs >> 24) & 0xf]; 3619 *p++ = hexdigits[(ucs >> 20) & 0xf]; 3620 *p++ = hexdigits[(ucs >> 16) & 0xf]; 3621 *p++ = hexdigits[(ucs >> 12) & 0xf]; 3622 *p++ = hexdigits[(ucs >> 8) & 0xf]; 3623 *p++ = hexdigits[(ucs >> 4) & 0xf]; 3624 *p++ = hexdigits[ucs & 0xf]; 3625 continue; 3626 } 3627 /* Fall through: isolated surrogates are copied as-is */ 3628 s--; 3629 size++; 3630 } 3631#endif 3632 /* Map 16-bit characters to '\uxxxx' */ 3633 if (ch >= 256) { 3634 *p++ = '\\'; 3635 *p++ = 'u'; 3636 *p++ = hexdigits[(ch >> 12) & 0xf]; 3637 *p++ = hexdigits[(ch >> 8) & 0xf]; 3638 *p++ = hexdigits[(ch >> 4) & 0xf]; 3639 *p++ = hexdigits[ch & 15]; 3640 } 3641 /* Copy everything else as-is */ 3642 else 3643 *p++ = (char) ch; 3644 } 3645 size = p - q; 3646 3647 assert(size > 0); 3648 if (_PyBytes_Resize(&repr, size) < 0) 3649 return NULL; 3650 return repr; 3651} 3652 3653PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3654{ 3655 PyObject *s; 3656 if (!PyUnicode_Check(unicode)) { 3657 PyErr_BadArgument(); 3658 return NULL; 3659 } 3660 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3661 PyUnicode_GET_SIZE(unicode)); 3662 3663 return s; 3664} 3665 3666/* --- Unicode Internal Codec ------------------------------------------- */ 3667 3668PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3669 Py_ssize_t size, 3670 const char *errors) 3671{ 3672 const char *starts = s; 3673 Py_ssize_t startinpos; 3674 Py_ssize_t endinpos; 3675 Py_ssize_t outpos; 3676 PyUnicodeObject *v; 3677 Py_UNICODE *p; 3678 const char *end; 3679 const char *reason; 3680 PyObject *errorHandler = NULL; 3681 PyObject *exc = NULL; 3682 3683#ifdef Py_UNICODE_WIDE 3684 Py_UNICODE unimax = PyUnicode_GetMax(); 3685#endif 3686 3687 /* XXX overflow detection missing */ 3688 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3689 if (v == NULL) 3690 goto onError; 3691 if (PyUnicode_GetSize((PyObject *)v) == 0) 3692 return (PyObject *)v; 3693 p = PyUnicode_AS_UNICODE(v); 3694 end = s + size; 3695 3696 while (s < end) { 3697 memcpy(p, s, sizeof(Py_UNICODE)); 3698 /* We have to sanity check the raw data, otherwise doom looms for 3699 some malformed UCS-4 data. */ 3700 if ( 3701#ifdef Py_UNICODE_WIDE 3702 *p > unimax || *p < 0 || 3703#endif 3704 end-s < Py_UNICODE_SIZE 3705 ) 3706 { 3707 startinpos = s - starts; 3708 if (end-s < Py_UNICODE_SIZE) { 3709 endinpos = end-starts; 3710 reason = "truncated input"; 3711 } 3712 else { 3713 endinpos = s - starts + Py_UNICODE_SIZE; 3714 reason = "illegal code point (> 0x10FFFF)"; 3715 } 3716 outpos = p - PyUnicode_AS_UNICODE(v); 3717 if (unicode_decode_call_errorhandler( 3718 errors, &errorHandler, 3719 "unicode_internal", reason, 3720 &starts, &end, &startinpos, &endinpos, &exc, &s, 3721 &v, &outpos, &p)) { 3722 goto onError; 3723 } 3724 } 3725 else { 3726 p++; 3727 s += Py_UNICODE_SIZE; 3728 } 3729 } 3730 3731 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3732 goto onError; 3733 Py_XDECREF(errorHandler); 3734 Py_XDECREF(exc); 3735 return (PyObject *)v; 3736 3737 onError: 3738 Py_XDECREF(v); 3739 Py_XDECREF(errorHandler); 3740 Py_XDECREF(exc); 3741 return NULL; 3742} 3743 3744/* --- Latin-1 Codec ------------------------------------------------------ */ 3745 3746PyObject *PyUnicode_DecodeLatin1(const char *s, 3747 Py_ssize_t size, 3748 const char *errors) 3749{ 3750 PyUnicodeObject *v; 3751 Py_UNICODE *p; 3752 const char *e, *unrolled_end; 3753 3754 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 3755 if (size == 1) { 3756 Py_UNICODE r = *(unsigned char*)s; 3757 return PyUnicode_FromUnicode(&r, 1); 3758 } 3759 3760 v = _PyUnicode_New(size); 3761 if (v == NULL) 3762 goto onError; 3763 if (size == 0) 3764 return (PyObject *)v; 3765 p = PyUnicode_AS_UNICODE(v); 3766 e = s + size; 3767 /* Unrolling the copy makes it much faster by reducing the looping 3768 overhead. This is similar to what many memcpy() implementations do. */ 3769 unrolled_end = e - 4; 3770 while (s < unrolled_end) { 3771 p[0] = (unsigned char) s[0]; 3772 p[1] = (unsigned char) s[1]; 3773 p[2] = (unsigned char) s[2]; 3774 p[3] = (unsigned char) s[3]; 3775 s += 4; 3776 p += 4; 3777 } 3778 while (s < e) 3779 *p++ = (unsigned char) *s++; 3780 return (PyObject *)v; 3781 3782 onError: 3783 Py_XDECREF(v); 3784 return NULL; 3785} 3786 3787/* create or adjust a UnicodeEncodeError */ 3788static void make_encode_exception(PyObject **exceptionObject, 3789 const char *encoding, 3790 const Py_UNICODE *unicode, Py_ssize_t size, 3791 Py_ssize_t startpos, Py_ssize_t endpos, 3792 const char *reason) 3793{ 3794 if (*exceptionObject == NULL) { 3795 *exceptionObject = PyUnicodeEncodeError_Create( 3796 encoding, unicode, size, startpos, endpos, reason); 3797 } 3798 else { 3799 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3800 goto onError; 3801 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3802 goto onError; 3803 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3804 goto onError; 3805 return; 3806 onError: 3807 Py_DECREF(*exceptionObject); 3808 *exceptionObject = NULL; 3809 } 3810} 3811 3812/* raises a UnicodeEncodeError */ 3813static void raise_encode_exception(PyObject **exceptionObject, 3814 const char *encoding, 3815 const Py_UNICODE *unicode, Py_ssize_t size, 3816 Py_ssize_t startpos, Py_ssize_t endpos, 3817 const char *reason) 3818{ 3819 make_encode_exception(exceptionObject, 3820 encoding, unicode, size, startpos, endpos, reason); 3821 if (*exceptionObject != NULL) 3822 PyCodec_StrictErrors(*exceptionObject); 3823} 3824 3825/* error handling callback helper: 3826 build arguments, call the callback and check the arguments, 3827 put the result into newpos and return the replacement string, which 3828 has to be freed by the caller */ 3829static PyObject *unicode_encode_call_errorhandler(const char *errors, 3830 PyObject **errorHandler, 3831 const char *encoding, const char *reason, 3832 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3833 Py_ssize_t startpos, Py_ssize_t endpos, 3834 Py_ssize_t *newpos) 3835{ 3836 static char *argparse = "O!n;encoding error handler must return (str, int) tuple"; 3837 3838 PyObject *restuple; 3839 PyObject *resunicode; 3840 3841 if (*errorHandler == NULL) { 3842 *errorHandler = PyCodec_LookupError(errors); 3843 if (*errorHandler == NULL) 3844 return NULL; 3845 } 3846 3847 make_encode_exception(exceptionObject, 3848 encoding, unicode, size, startpos, endpos, reason); 3849 if (*exceptionObject == NULL) 3850 return NULL; 3851 3852 restuple = PyObject_CallFunctionObjArgs( 3853 *errorHandler, *exceptionObject, NULL); 3854 if (restuple == NULL) 3855 return NULL; 3856 if (!PyTuple_Check(restuple)) { 3857 PyErr_Format(PyExc_TypeError, &argparse[4]); 3858 Py_DECREF(restuple); 3859 return NULL; 3860 } 3861 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3862 &resunicode, newpos)) { 3863 Py_DECREF(restuple); 3864 return NULL; 3865 } 3866 if (*newpos<0) 3867 *newpos = size+*newpos; 3868 if (*newpos<0 || *newpos>size) { 3869 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3870 Py_DECREF(restuple); 3871 return NULL; 3872 } 3873 Py_INCREF(resunicode); 3874 Py_DECREF(restuple); 3875 return resunicode; 3876} 3877 3878static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3879 Py_ssize_t size, 3880 const char *errors, 3881 int limit) 3882{ 3883 /* output object */ 3884 PyObject *res; 3885 /* pointers to the beginning and end+1 of input */ 3886 const Py_UNICODE *startp = p; 3887 const Py_UNICODE *endp = p + size; 3888 /* pointer to the beginning of the unencodable characters */ 3889 /* const Py_UNICODE *badp = NULL; */ 3890 /* pointer into the output */ 3891 char *str; 3892 /* current output position */ 3893 Py_ssize_t ressize; 3894 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 3895 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3896 PyObject *errorHandler = NULL; 3897 PyObject *exc = NULL; 3898 /* the following variable is used for caching string comparisons 3899 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3900 int known_errorHandler = -1; 3901 3902 /* allocate enough for a simple encoding without 3903 replacements, if we need more, we'll resize */ 3904 if (size == 0) 3905 return PyBytes_FromStringAndSize(NULL, 0); 3906 res = PyBytes_FromStringAndSize(NULL, size); 3907 if (res == NULL) 3908 return NULL; 3909 str = PyBytes_AS_STRING(res); 3910 ressize = size; 3911 3912 while (p<endp) { 3913 Py_UNICODE c = *p; 3914 3915 /* can we encode this? */ 3916 if (c<limit) { 3917 /* no overflow check, because we know that the space is enough */ 3918 *str++ = (char)c; 3919 ++p; 3920 } 3921 else { 3922 Py_ssize_t unicodepos = p-startp; 3923 Py_ssize_t requiredsize; 3924 PyObject *repunicode; 3925 Py_ssize_t repsize; 3926 Py_ssize_t newpos; 3927 Py_ssize_t respos; 3928 Py_UNICODE *uni2; 3929 /* startpos for collecting unencodable chars */ 3930 const Py_UNICODE *collstart = p; 3931 const Py_UNICODE *collend = p; 3932 /* find all unecodable characters */ 3933 while ((collend < endp) && ((*collend)>=limit)) 3934 ++collend; 3935 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 3936 if (known_errorHandler==-1) { 3937 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3938 known_errorHandler = 1; 3939 else if (!strcmp(errors, "replace")) 3940 known_errorHandler = 2; 3941 else if (!strcmp(errors, "ignore")) 3942 known_errorHandler = 3; 3943 else if (!strcmp(errors, "xmlcharrefreplace")) 3944 known_errorHandler = 4; 3945 else 3946 known_errorHandler = 0; 3947 } 3948 switch (known_errorHandler) { 3949 case 1: /* strict */ 3950 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3951 goto onError; 3952 case 2: /* replace */ 3953 while (collstart++<collend) 3954 *str++ = '?'; /* fall through */ 3955 case 3: /* ignore */ 3956 p = collend; 3957 break; 3958 case 4: /* xmlcharrefreplace */ 3959 respos = str - PyBytes_AS_STRING(res); 3960 /* determine replacement size (temporarily (mis)uses p) */ 3961 for (p = collstart, repsize = 0; p < collend; ++p) { 3962 if (*p<10) 3963 repsize += 2+1+1; 3964 else if (*p<100) 3965 repsize += 2+2+1; 3966 else if (*p<1000) 3967 repsize += 2+3+1; 3968 else if (*p<10000) 3969 repsize += 2+4+1; 3970#ifndef Py_UNICODE_WIDE 3971 else 3972 repsize += 2+5+1; 3973#else 3974 else if (*p<100000) 3975 repsize += 2+5+1; 3976 else if (*p<1000000) 3977 repsize += 2+6+1; 3978 else 3979 repsize += 2+7+1; 3980#endif 3981 } 3982 requiredsize = respos+repsize+(endp-collend); 3983 if (requiredsize > ressize) { 3984 if (requiredsize<2*ressize) 3985 requiredsize = 2*ressize; 3986 if (_PyBytes_Resize(&res, requiredsize)) 3987 goto onError; 3988 str = PyBytes_AS_STRING(res) + respos; 3989 ressize = requiredsize; 3990 } 3991 /* generate replacement (temporarily (mis)uses p) */ 3992 for (p = collstart; p < collend; ++p) { 3993 str += sprintf(str, "&#%d;", (int)*p); 3994 } 3995 p = collend; 3996 break; 3997 default: 3998 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3999 encoding, reason, startp, size, &exc, 4000 collstart-startp, collend-startp, &newpos); 4001 if (repunicode == NULL) 4002 goto onError; 4003 /* need more space? (at least enough for what we 4004 have+the replacement+the rest of the string, so 4005 we won't have to check space for encodable characters) */ 4006 respos = str - PyBytes_AS_STRING(res); 4007 repsize = PyUnicode_GET_SIZE(repunicode); 4008 requiredsize = respos+repsize+(endp-collend); 4009 if (requiredsize > ressize) { 4010 if (requiredsize<2*ressize) 4011 requiredsize = 2*ressize; 4012 if (_PyBytes_Resize(&res, requiredsize)) { 4013 Py_DECREF(repunicode); 4014 goto onError; 4015 } 4016 str = PyBytes_AS_STRING(res) + respos; 4017 ressize = requiredsize; 4018 } 4019 /* check if there is anything unencodable in the replacement 4020 and copy it to the output */ 4021 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4022 c = *uni2; 4023 if (c >= limit) { 4024 raise_encode_exception(&exc, encoding, startp, size, 4025 unicodepos, unicodepos+1, reason); 4026 Py_DECREF(repunicode); 4027 goto onError; 4028 } 4029 *str = (char)c; 4030 } 4031 p = startp + newpos; 4032 Py_DECREF(repunicode); 4033 } 4034 } 4035 } 4036 /* Resize if we allocated to much */ 4037 size = str - PyBytes_AS_STRING(res); 4038 if (size < ressize) { /* If this falls res will be NULL */ 4039 assert(size >= 0); 4040 if (_PyBytes_Resize(&res, size) < 0) 4041 goto onError; 4042 } 4043 4044 Py_XDECREF(errorHandler); 4045 Py_XDECREF(exc); 4046 return res; 4047 4048 onError: 4049 Py_XDECREF(res); 4050 Py_XDECREF(errorHandler); 4051 Py_XDECREF(exc); 4052 return NULL; 4053} 4054 4055PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4056 Py_ssize_t size, 4057 const char *errors) 4058{ 4059 return unicode_encode_ucs1(p, size, errors, 256); 4060} 4061 4062PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 4063{ 4064 if (!PyUnicode_Check(unicode)) { 4065 PyErr_BadArgument(); 4066 return NULL; 4067 } 4068 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4069 PyUnicode_GET_SIZE(unicode), 4070 NULL); 4071} 4072 4073/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4074 4075PyObject *PyUnicode_DecodeASCII(const char *s, 4076 Py_ssize_t size, 4077 const char *errors) 4078{ 4079 const char *starts = s; 4080 PyUnicodeObject *v; 4081 Py_UNICODE *p; 4082 Py_ssize_t startinpos; 4083 Py_ssize_t endinpos; 4084 Py_ssize_t outpos; 4085 const char *e; 4086 PyObject *errorHandler = NULL; 4087 PyObject *exc = NULL; 4088 4089 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 4090 if (size == 1 && *(unsigned char*)s < 128) { 4091 Py_UNICODE r = *(unsigned char*)s; 4092 return PyUnicode_FromUnicode(&r, 1); 4093 } 4094 4095 v = _PyUnicode_New(size); 4096 if (v == NULL) 4097 goto onError; 4098 if (size == 0) 4099 return (PyObject *)v; 4100 p = PyUnicode_AS_UNICODE(v); 4101 e = s + size; 4102 while (s < e) { 4103 register unsigned char c = (unsigned char)*s; 4104 if (c < 128) { 4105 *p++ = c; 4106 ++s; 4107 } 4108 else { 4109 startinpos = s-starts; 4110 endinpos = startinpos + 1; 4111 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4112 if (unicode_decode_call_errorhandler( 4113 errors, &errorHandler, 4114 "ascii", "ordinal not in range(128)", 4115 &starts, &e, &startinpos, &endinpos, &exc, &s, 4116 &v, &outpos, &p)) 4117 goto onError; 4118 } 4119 } 4120 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4121 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4122 goto onError; 4123 Py_XDECREF(errorHandler); 4124 Py_XDECREF(exc); 4125 return (PyObject *)v; 4126 4127 onError: 4128 Py_XDECREF(v); 4129 Py_XDECREF(errorHandler); 4130 Py_XDECREF(exc); 4131 return NULL; 4132} 4133 4134PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 4135 Py_ssize_t size, 4136 const char *errors) 4137{ 4138 return unicode_encode_ucs1(p, size, errors, 128); 4139} 4140 4141PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 4142{ 4143 if (!PyUnicode_Check(unicode)) { 4144 PyErr_BadArgument(); 4145 return NULL; 4146 } 4147 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4148 PyUnicode_GET_SIZE(unicode), 4149 NULL); 4150} 4151 4152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4153 4154/* --- MBCS codecs for Windows -------------------------------------------- */ 4155 4156#if SIZEOF_INT < SIZEOF_SSIZE_T 4157#define NEED_RETRY 4158#endif 4159 4160/* XXX This code is limited to "true" double-byte encodings, as 4161 a) it assumes an incomplete character consists of a single byte, and 4162 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 4163 encodings, see IsDBCSLeadByteEx documentation. */ 4164 4165static int is_dbcs_lead_byte(const char *s, int offset) 4166{ 4167 const char *curr = s + offset; 4168 4169 if (IsDBCSLeadByte(*curr)) { 4170 const char *prev = CharPrev(s, curr); 4171 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4172 } 4173 return 0; 4174} 4175 4176/* 4177 * Decode MBCS string into unicode object. If 'final' is set, converts 4178 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4179 */ 4180static int decode_mbcs(PyUnicodeObject **v, 4181 const char *s, /* MBCS string */ 4182 int size, /* sizeof MBCS string */ 4183 int final) 4184{ 4185 Py_UNICODE *p; 4186 Py_ssize_t n = 0; 4187 int usize = 0; 4188 4189 assert(size >= 0); 4190 4191 /* Skip trailing lead-byte unless 'final' is set */ 4192 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4193 --size; 4194 4195 /* First get the size of the result */ 4196 if (size > 0) { 4197 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 4198 if (usize == 0) { 4199 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4200 return -1; 4201 } 4202 } 4203 4204 if (*v == NULL) { 4205 /* Create unicode object */ 4206 *v = _PyUnicode_New(usize); 4207 if (*v == NULL) 4208 return -1; 4209 } 4210 else { 4211 /* Extend unicode object */ 4212 n = PyUnicode_GET_SIZE(*v); 4213 if (_PyUnicode_Resize(v, n + usize) < 0) 4214 return -1; 4215 } 4216 4217 /* Do the conversion */ 4218 if (size > 0) { 4219 p = PyUnicode_AS_UNICODE(*v) + n; 4220 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 4221 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4222 return -1; 4223 } 4224 } 4225 4226 return size; 4227} 4228 4229PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 4230 Py_ssize_t size, 4231 const char *errors, 4232 Py_ssize_t *consumed) 4233{ 4234 PyUnicodeObject *v = NULL; 4235 int done; 4236 4237 if (consumed) 4238 *consumed = 0; 4239 4240#ifdef NEED_RETRY 4241 retry: 4242 if (size > INT_MAX) 4243 done = decode_mbcs(&v, s, INT_MAX, 0); 4244 else 4245#endif 4246 done = decode_mbcs(&v, s, (int)size, !consumed); 4247 4248 if (done < 0) { 4249 Py_XDECREF(v); 4250 return NULL; 4251 } 4252 4253 if (consumed) 4254 *consumed += done; 4255 4256#ifdef NEED_RETRY 4257 if (size > INT_MAX) { 4258 s += done; 4259 size -= done; 4260 goto retry; 4261 } 4262#endif 4263 4264 return (PyObject *)v; 4265} 4266 4267PyObject *PyUnicode_DecodeMBCS(const char *s, 4268 Py_ssize_t size, 4269 const char *errors) 4270{ 4271 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4272} 4273 4274/* 4275 * Convert unicode into string object (MBCS). 4276 * Returns 0 if succeed, -1 otherwise. 4277 */ 4278static int encode_mbcs(PyObject **repr, 4279 const Py_UNICODE *p, /* unicode */ 4280 int size) /* size of unicode */ 4281{ 4282 int mbcssize = 0; 4283 Py_ssize_t n = 0; 4284 4285 assert(size >= 0); 4286 4287 /* First get the size of the result */ 4288 if (size > 0) { 4289 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 4290 if (mbcssize == 0) { 4291 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4292 return -1; 4293 } 4294 } 4295 4296 if (*repr == NULL) { 4297 /* Create string object */ 4298 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 4299 if (*repr == NULL) 4300 return -1; 4301 } 4302 else { 4303 /* Extend string object */ 4304 n = PyBytes_Size(*repr); 4305 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 4306 return -1; 4307 } 4308 4309 /* Do the conversion */ 4310 if (size > 0) { 4311 char *s = PyBytes_AS_STRING(*repr) + n; 4312 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 4313 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4314 return -1; 4315 } 4316 } 4317 4318 return 0; 4319} 4320 4321PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4322 Py_ssize_t size, 4323 const char *errors) 4324{ 4325 PyObject *repr = NULL; 4326 int ret; 4327 4328#ifdef NEED_RETRY 4329 retry: 4330 if (size > INT_MAX) 4331 ret = encode_mbcs(&repr, p, INT_MAX); 4332 else 4333#endif 4334 ret = encode_mbcs(&repr, p, (int)size); 4335 4336 if (ret < 0) { 4337 Py_XDECREF(repr); 4338 return NULL; 4339 } 4340 4341#ifdef NEED_RETRY 4342 if (size > INT_MAX) { 4343 p += INT_MAX; 4344 size -= INT_MAX; 4345 goto retry; 4346 } 4347#endif 4348 4349 return repr; 4350} 4351 4352PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4353{ 4354 if (!PyUnicode_Check(unicode)) { 4355 PyErr_BadArgument(); 4356 return NULL; 4357 } 4358 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4359 PyUnicode_GET_SIZE(unicode), 4360 NULL); 4361} 4362 4363#undef NEED_RETRY 4364 4365#endif /* MS_WINDOWS */ 4366 4367/* --- Character Mapping Codec -------------------------------------------- */ 4368 4369PyObject *PyUnicode_DecodeCharmap(const char *s, 4370 Py_ssize_t size, 4371 PyObject *mapping, 4372 const char *errors) 4373{ 4374 const char *starts = s; 4375 Py_ssize_t startinpos; 4376 Py_ssize_t endinpos; 4377 Py_ssize_t outpos; 4378 const char *e; 4379 PyUnicodeObject *v; 4380 Py_UNICODE *p; 4381 Py_ssize_t extrachars = 0; 4382 PyObject *errorHandler = NULL; 4383 PyObject *exc = NULL; 4384 Py_UNICODE *mapstring = NULL; 4385 Py_ssize_t maplen = 0; 4386 4387 /* Default to Latin-1 */ 4388 if (mapping == NULL) 4389 return PyUnicode_DecodeLatin1(s, size, errors); 4390 4391 v = _PyUnicode_New(size); 4392 if (v == NULL) 4393 goto onError; 4394 if (size == 0) 4395 return (PyObject *)v; 4396 p = PyUnicode_AS_UNICODE(v); 4397 e = s + size; 4398 if (PyUnicode_CheckExact(mapping)) { 4399 mapstring = PyUnicode_AS_UNICODE(mapping); 4400 maplen = PyUnicode_GET_SIZE(mapping); 4401 while (s < e) { 4402 unsigned char ch = *s; 4403 Py_UNICODE x = 0xfffe; /* illegal value */ 4404 4405 if (ch < maplen) 4406 x = mapstring[ch]; 4407 4408 if (x == 0xfffe) { 4409 /* undefined mapping */ 4410 outpos = p-PyUnicode_AS_UNICODE(v); 4411 startinpos = s-starts; 4412 endinpos = startinpos+1; 4413 if (unicode_decode_call_errorhandler( 4414 errors, &errorHandler, 4415 "charmap", "character maps to <undefined>", 4416 &starts, &e, &startinpos, &endinpos, &exc, &s, 4417 &v, &outpos, &p)) { 4418 goto onError; 4419 } 4420 continue; 4421 } 4422 *p++ = x; 4423 ++s; 4424 } 4425 } 4426 else { 4427 while (s < e) { 4428 unsigned char ch = *s; 4429 PyObject *w, *x; 4430 4431 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 4432 w = PyLong_FromLong((long)ch); 4433 if (w == NULL) 4434 goto onError; 4435 x = PyObject_GetItem(mapping, w); 4436 Py_DECREF(w); 4437 if (x == NULL) { 4438 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4439 /* No mapping found means: mapping is undefined. */ 4440 PyErr_Clear(); 4441 x = Py_None; 4442 Py_INCREF(x); 4443 } else 4444 goto onError; 4445 } 4446 4447 /* Apply mapping */ 4448 if (PyLong_Check(x)) { 4449 long value = PyLong_AS_LONG(x); 4450 if (value < 0 || value > 65535) { 4451 PyErr_SetString(PyExc_TypeError, 4452 "character mapping must be in range(65536)"); 4453 Py_DECREF(x); 4454 goto onError; 4455 } 4456 *p++ = (Py_UNICODE)value; 4457 } 4458 else if (x == Py_None) { 4459 /* undefined mapping */ 4460 outpos = p-PyUnicode_AS_UNICODE(v); 4461 startinpos = s-starts; 4462 endinpos = startinpos+1; 4463 if (unicode_decode_call_errorhandler( 4464 errors, &errorHandler, 4465 "charmap", "character maps to <undefined>", 4466 &starts, &e, &startinpos, &endinpos, &exc, &s, 4467 &v, &outpos, &p)) { 4468 Py_DECREF(x); 4469 goto onError; 4470 } 4471 Py_DECREF(x); 4472 continue; 4473 } 4474 else if (PyUnicode_Check(x)) { 4475 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4476 4477 if (targetsize == 1) 4478 /* 1-1 mapping */ 4479 *p++ = *PyUnicode_AS_UNICODE(x); 4480 4481 else if (targetsize > 1) { 4482 /* 1-n mapping */ 4483 if (targetsize > extrachars) { 4484 /* resize first */ 4485 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4486 Py_ssize_t needed = (targetsize - extrachars) + \ 4487 (targetsize << 2); 4488 extrachars += needed; 4489 /* XXX overflow detection missing */ 4490 if (_PyUnicode_Resize(&v, 4491 PyUnicode_GET_SIZE(v) + needed) < 0) { 4492 Py_DECREF(x); 4493 goto onError; 4494 } 4495 p = PyUnicode_AS_UNICODE(v) + oldpos; 4496 } 4497 Py_UNICODE_COPY(p, 4498 PyUnicode_AS_UNICODE(x), 4499 targetsize); 4500 p += targetsize; 4501 extrachars -= targetsize; 4502 } 4503 /* 1-0 mapping: skip the character */ 4504 } 4505 else { 4506 /* wrong return value */ 4507 PyErr_SetString(PyExc_TypeError, 4508 "character mapping must return integer, None or str"); 4509 Py_DECREF(x); 4510 goto onError; 4511 } 4512 Py_DECREF(x); 4513 ++s; 4514 } 4515 } 4516 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4517 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4518 goto onError; 4519 Py_XDECREF(errorHandler); 4520 Py_XDECREF(exc); 4521 return (PyObject *)v; 4522 4523 onError: 4524 Py_XDECREF(errorHandler); 4525 Py_XDECREF(exc); 4526 Py_XDECREF(v); 4527 return NULL; 4528} 4529 4530/* Charmap encoding: the lookup table */ 4531 4532struct encoding_map{ 4533 PyObject_HEAD 4534 unsigned char level1[32]; 4535 int count2, count3; 4536 unsigned char level23[1]; 4537}; 4538 4539static PyObject* 4540encoding_map_size(PyObject *obj, PyObject* args) 4541{ 4542 struct encoding_map *map = (struct encoding_map*)obj; 4543 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4544 128*map->count3); 4545} 4546 4547static PyMethodDef encoding_map_methods[] = { 4548 {"size", encoding_map_size, METH_NOARGS, 4549 PyDoc_STR("Return the size (in bytes) of this object") }, 4550 { 0 } 4551}; 4552 4553static void 4554encoding_map_dealloc(PyObject* o) 4555{ 4556 PyObject_FREE(o); 4557} 4558 4559static PyTypeObject EncodingMapType = { 4560 PyVarObject_HEAD_INIT(NULL, 0) 4561 "EncodingMap", /*tp_name*/ 4562 sizeof(struct encoding_map), /*tp_basicsize*/ 4563 0, /*tp_itemsize*/ 4564 /* methods */ 4565 encoding_map_dealloc, /*tp_dealloc*/ 4566 0, /*tp_print*/ 4567 0, /*tp_getattr*/ 4568 0, /*tp_setattr*/ 4569 0, /*tp_reserved*/ 4570 0, /*tp_repr*/ 4571 0, /*tp_as_number*/ 4572 0, /*tp_as_sequence*/ 4573 0, /*tp_as_mapping*/ 4574 0, /*tp_hash*/ 4575 0, /*tp_call*/ 4576 0, /*tp_str*/ 4577 0, /*tp_getattro*/ 4578 0, /*tp_setattro*/ 4579 0, /*tp_as_buffer*/ 4580 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4581 0, /*tp_doc*/ 4582 0, /*tp_traverse*/ 4583 0, /*tp_clear*/ 4584 0, /*tp_richcompare*/ 4585 0, /*tp_weaklistoffset*/ 4586 0, /*tp_iter*/ 4587 0, /*tp_iternext*/ 4588 encoding_map_methods, /*tp_methods*/ 4589 0, /*tp_members*/ 4590 0, /*tp_getset*/ 4591 0, /*tp_base*/ 4592 0, /*tp_dict*/ 4593 0, /*tp_descr_get*/ 4594 0, /*tp_descr_set*/ 4595 0, /*tp_dictoffset*/ 4596 0, /*tp_init*/ 4597 0, /*tp_alloc*/ 4598 0, /*tp_new*/ 4599 0, /*tp_free*/ 4600 0, /*tp_is_gc*/ 4601}; 4602 4603PyObject* 4604PyUnicode_BuildEncodingMap(PyObject* string) 4605{ 4606 Py_UNICODE *decode; 4607 PyObject *result; 4608 struct encoding_map *mresult; 4609 int i; 4610 int need_dict = 0; 4611 unsigned char level1[32]; 4612 unsigned char level2[512]; 4613 unsigned char *mlevel1, *mlevel2, *mlevel3; 4614 int count2 = 0, count3 = 0; 4615 4616 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 4617 PyErr_BadArgument(); 4618 return NULL; 4619 } 4620 decode = PyUnicode_AS_UNICODE(string); 4621 memset(level1, 0xFF, sizeof level1); 4622 memset(level2, 0xFF, sizeof level2); 4623 4624 /* If there isn't a one-to-one mapping of NULL to \0, 4625 or if there are non-BMP characters, we need to use 4626 a mapping dictionary. */ 4627 if (decode[0] != 0) 4628 need_dict = 1; 4629 for (i = 1; i < 256; i++) { 4630 int l1, l2; 4631 if (decode[i] == 0 4632#ifdef Py_UNICODE_WIDE 4633 || decode[i] > 0xFFFF 4634#endif 4635 ) { 4636 need_dict = 1; 4637 break; 4638 } 4639 if (decode[i] == 0xFFFE) 4640 /* unmapped character */ 4641 continue; 4642 l1 = decode[i] >> 11; 4643 l2 = decode[i] >> 7; 4644 if (level1[l1] == 0xFF) 4645 level1[l1] = count2++; 4646 if (level2[l2] == 0xFF) 4647 level2[l2] = count3++; 4648 } 4649 4650 if (count2 >= 0xFF || count3 >= 0xFF) 4651 need_dict = 1; 4652 4653 if (need_dict) { 4654 PyObject *result = PyDict_New(); 4655 PyObject *key, *value; 4656 if (!result) 4657 return NULL; 4658 for (i = 0; i < 256; i++) { 4659 key = value = NULL; 4660 key = PyLong_FromLong(decode[i]); 4661 value = PyLong_FromLong(i); 4662 if (!key || !value) 4663 goto failed1; 4664 if (PyDict_SetItem(result, key, value) == -1) 4665 goto failed1; 4666 Py_DECREF(key); 4667 Py_DECREF(value); 4668 } 4669 return result; 4670 failed1: 4671 Py_XDECREF(key); 4672 Py_XDECREF(value); 4673 Py_DECREF(result); 4674 return NULL; 4675 } 4676 4677 /* Create a three-level trie */ 4678 result = PyObject_MALLOC(sizeof(struct encoding_map) + 4679 16*count2 + 128*count3 - 1); 4680 if (!result) 4681 return PyErr_NoMemory(); 4682 PyObject_Init(result, &EncodingMapType); 4683 mresult = (struct encoding_map*)result; 4684 mresult->count2 = count2; 4685 mresult->count3 = count3; 4686 mlevel1 = mresult->level1; 4687 mlevel2 = mresult->level23; 4688 mlevel3 = mresult->level23 + 16*count2; 4689 memcpy(mlevel1, level1, 32); 4690 memset(mlevel2, 0xFF, 16*count2); 4691 memset(mlevel3, 0, 128*count3); 4692 count3 = 0; 4693 for (i = 1; i < 256; i++) { 4694 int o1, o2, o3, i2, i3; 4695 if (decode[i] == 0xFFFE) 4696 /* unmapped character */ 4697 continue; 4698 o1 = decode[i]>>11; 4699 o2 = (decode[i]>>7) & 0xF; 4700 i2 = 16*mlevel1[o1] + o2; 4701 if (mlevel2[i2] == 0xFF) 4702 mlevel2[i2] = count3++; 4703 o3 = decode[i] & 0x7F; 4704 i3 = 128*mlevel2[i2] + o3; 4705 mlevel3[i3] = i; 4706 } 4707 return result; 4708} 4709 4710static int 4711encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4712{ 4713 struct encoding_map *map = (struct encoding_map*)mapping; 4714 int l1 = c>>11; 4715 int l2 = (c>>7) & 0xF; 4716 int l3 = c & 0x7F; 4717 int i; 4718 4719#ifdef Py_UNICODE_WIDE 4720 if (c > 0xFFFF) { 4721 return -1; 4722 } 4723#endif 4724 if (c == 0) 4725 return 0; 4726 /* level 1*/ 4727 i = map->level1[l1]; 4728 if (i == 0xFF) { 4729 return -1; 4730 } 4731 /* level 2*/ 4732 i = map->level23[16*i+l2]; 4733 if (i == 0xFF) { 4734 return -1; 4735 } 4736 /* level 3 */ 4737 i = map->level23[16*map->count2 + 128*i + l3]; 4738 if (i == 0) { 4739 return -1; 4740 } 4741 return i; 4742} 4743 4744/* Lookup the character ch in the mapping. If the character 4745 can't be found, Py_None is returned (or NULL, if another 4746 error occurred). */ 4747static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4748{ 4749 PyObject *w = PyLong_FromLong((long)c); 4750 PyObject *x; 4751 4752 if (w == NULL) 4753 return NULL; 4754 x = PyObject_GetItem(mapping, w); 4755 Py_DECREF(w); 4756 if (x == NULL) { 4757 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4758 /* No mapping found means: mapping is undefined. */ 4759 PyErr_Clear(); 4760 x = Py_None; 4761 Py_INCREF(x); 4762 return x; 4763 } else 4764 return NULL; 4765 } 4766 else if (x == Py_None) 4767 return x; 4768 else if (PyLong_Check(x)) { 4769 long value = PyLong_AS_LONG(x); 4770 if (value < 0 || value > 255) { 4771 PyErr_SetString(PyExc_TypeError, 4772 "character mapping must be in range(256)"); 4773 Py_DECREF(x); 4774 return NULL; 4775 } 4776 return x; 4777 } 4778 else if (PyBytes_Check(x)) 4779 return x; 4780 else { 4781 /* wrong return value */ 4782 PyErr_Format(PyExc_TypeError, 4783 "character mapping must return integer, bytes or None, not %.400s", 4784 x->ob_type->tp_name); 4785 Py_DECREF(x); 4786 return NULL; 4787 } 4788} 4789 4790static int 4791charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4792{ 4793 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4794 /* exponentially overallocate to minimize reallocations */ 4795 if (requiredsize < 2*outsize) 4796 requiredsize = 2*outsize; 4797 if (_PyBytes_Resize(outobj, requiredsize)) 4798 return -1; 4799 return 0; 4800} 4801 4802typedef enum charmapencode_result { 4803 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4804}charmapencode_result; 4805/* lookup the character, put the result in the output string and adjust 4806 various state variables. Resize the output bytes object if not enough 4807 space is available. Return a new reference to the object that 4808 was put in the output buffer, or Py_None, if the mapping was undefined 4809 (in which case no character was written) or NULL, if a 4810 reallocation error occurred. The caller must decref the result */ 4811static 4812charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4813 PyObject **outobj, Py_ssize_t *outpos) 4814{ 4815 PyObject *rep; 4816 char *outstart; 4817 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4818 4819 if (Py_TYPE(mapping) == &EncodingMapType) { 4820 int res = encoding_map_lookup(c, mapping); 4821 Py_ssize_t requiredsize = *outpos+1; 4822 if (res == -1) 4823 return enc_FAILED; 4824 if (outsize<requiredsize) 4825 if (charmapencode_resize(outobj, outpos, requiredsize)) 4826 return enc_EXCEPTION; 4827 outstart = PyBytes_AS_STRING(*outobj); 4828 outstart[(*outpos)++] = (char)res; 4829 return enc_SUCCESS; 4830 } 4831 4832 rep = charmapencode_lookup(c, mapping); 4833 if (rep==NULL) 4834 return enc_EXCEPTION; 4835 else if (rep==Py_None) { 4836 Py_DECREF(rep); 4837 return enc_FAILED; 4838 } else { 4839 if (PyLong_Check(rep)) { 4840 Py_ssize_t requiredsize = *outpos+1; 4841 if (outsize<requiredsize) 4842 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4843 Py_DECREF(rep); 4844 return enc_EXCEPTION; 4845 } 4846 outstart = PyBytes_AS_STRING(*outobj); 4847 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 4848 } 4849 else { 4850 const char *repchars = PyBytes_AS_STRING(rep); 4851 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 4852 Py_ssize_t requiredsize = *outpos+repsize; 4853 if (outsize<requiredsize) 4854 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4855 Py_DECREF(rep); 4856 return enc_EXCEPTION; 4857 } 4858 outstart = PyBytes_AS_STRING(*outobj); 4859 memcpy(outstart + *outpos, repchars, repsize); 4860 *outpos += repsize; 4861 } 4862 } 4863 Py_DECREF(rep); 4864 return enc_SUCCESS; 4865} 4866 4867/* handle an error in PyUnicode_EncodeCharmap 4868 Return 0 on success, -1 on error */ 4869static 4870int charmap_encoding_error( 4871 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4872 PyObject **exceptionObject, 4873 int *known_errorHandler, PyObject **errorHandler, const char *errors, 4874 PyObject **res, Py_ssize_t *respos) 4875{ 4876 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4877 Py_ssize_t repsize; 4878 Py_ssize_t newpos; 4879 Py_UNICODE *uni2; 4880 /* startpos for collecting unencodable chars */ 4881 Py_ssize_t collstartpos = *inpos; 4882 Py_ssize_t collendpos = *inpos+1; 4883 Py_ssize_t collpos; 4884 char *encoding = "charmap"; 4885 char *reason = "character maps to <undefined>"; 4886 charmapencode_result x; 4887 4888 /* find all unencodable characters */ 4889 while (collendpos < size) { 4890 PyObject *rep; 4891 if (Py_TYPE(mapping) == &EncodingMapType) { 4892 int res = encoding_map_lookup(p[collendpos], mapping); 4893 if (res != -1) 4894 break; 4895 ++collendpos; 4896 continue; 4897 } 4898 4899 rep = charmapencode_lookup(p[collendpos], mapping); 4900 if (rep==NULL) 4901 return -1; 4902 else if (rep!=Py_None) { 4903 Py_DECREF(rep); 4904 break; 4905 } 4906 Py_DECREF(rep); 4907 ++collendpos; 4908 } 4909 /* cache callback name lookup 4910 * (if not done yet, i.e. it's the first error) */ 4911 if (*known_errorHandler==-1) { 4912 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4913 *known_errorHandler = 1; 4914 else if (!strcmp(errors, "replace")) 4915 *known_errorHandler = 2; 4916 else if (!strcmp(errors, "ignore")) 4917 *known_errorHandler = 3; 4918 else if (!strcmp(errors, "xmlcharrefreplace")) 4919 *known_errorHandler = 4; 4920 else 4921 *known_errorHandler = 0; 4922 } 4923 switch (*known_errorHandler) { 4924 case 1: /* strict */ 4925 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4926 return -1; 4927 case 2: /* replace */ 4928 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4929 x = charmapencode_output('?', mapping, res, respos); 4930 if (x==enc_EXCEPTION) { 4931 return -1; 4932 } 4933 else if (x==enc_FAILED) { 4934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4935 return -1; 4936 } 4937 } 4938 /* fall through */ 4939 case 3: /* ignore */ 4940 *inpos = collendpos; 4941 break; 4942 case 4: /* xmlcharrefreplace */ 4943 /* generate replacement (temporarily (mis)uses p) */ 4944 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4945 char buffer[2+29+1+1]; 4946 char *cp; 4947 sprintf(buffer, "&#%d;", (int)p[collpos]); 4948 for (cp = buffer; *cp; ++cp) { 4949 x = charmapencode_output(*cp, mapping, res, respos); 4950 if (x==enc_EXCEPTION) 4951 return -1; 4952 else if (x==enc_FAILED) { 4953 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4954 return -1; 4955 } 4956 } 4957 } 4958 *inpos = collendpos; 4959 break; 4960 default: 4961 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4962 encoding, reason, p, size, exceptionObject, 4963 collstartpos, collendpos, &newpos); 4964 if (repunicode == NULL) 4965 return -1; 4966 /* generate replacement */ 4967 repsize = PyUnicode_GET_SIZE(repunicode); 4968 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4969 x = charmapencode_output(*uni2, mapping, res, respos); 4970 if (x==enc_EXCEPTION) { 4971 return -1; 4972 } 4973 else if (x==enc_FAILED) { 4974 Py_DECREF(repunicode); 4975 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4976 return -1; 4977 } 4978 } 4979 *inpos = newpos; 4980 Py_DECREF(repunicode); 4981 } 4982 return 0; 4983} 4984 4985PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4986 Py_ssize_t size, 4987 PyObject *mapping, 4988 const char *errors) 4989{ 4990 /* output object */ 4991 PyObject *res = NULL; 4992 /* current input position */ 4993 Py_ssize_t inpos = 0; 4994 /* current output position */ 4995 Py_ssize_t respos = 0; 4996 PyObject *errorHandler = NULL; 4997 PyObject *exc = NULL; 4998 /* the following variable is used for caching string comparisons 4999 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5000 * 3=ignore, 4=xmlcharrefreplace */ 5001 int known_errorHandler = -1; 5002 5003 /* Default to Latin-1 */ 5004 if (mapping == NULL) 5005 return PyUnicode_EncodeLatin1(p, size, errors); 5006 5007 /* allocate enough for a simple encoding without 5008 replacements, if we need more, we'll resize */ 5009 res = PyBytes_FromStringAndSize(NULL, size); 5010 if (res == NULL) 5011 goto onError; 5012 if (size == 0) 5013 return res; 5014 5015 while (inpos<size) { 5016 /* try to encode it */ 5017 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5018 if (x==enc_EXCEPTION) /* error */ 5019 goto onError; 5020 if (x==enc_FAILED) { /* unencodable character */ 5021 if (charmap_encoding_error(p, size, &inpos, mapping, 5022 &exc, 5023 &known_errorHandler, &errorHandler, errors, 5024 &res, &respos)) { 5025 goto onError; 5026 } 5027 } 5028 else 5029 /* done with this character => adjust input position */ 5030 ++inpos; 5031 } 5032 5033 /* Resize if we allocated to much */ 5034 if (respos<PyBytes_GET_SIZE(res)) 5035 if (_PyBytes_Resize(&res, respos) < 0) 5036 goto onError; 5037 5038 Py_XDECREF(exc); 5039 Py_XDECREF(errorHandler); 5040 return res; 5041 5042 onError: 5043 Py_XDECREF(res); 5044 Py_XDECREF(exc); 5045 Py_XDECREF(errorHandler); 5046 return NULL; 5047} 5048 5049PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 5050 PyObject *mapping) 5051{ 5052 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5053 PyErr_BadArgument(); 5054 return NULL; 5055 } 5056 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5057 PyUnicode_GET_SIZE(unicode), 5058 mapping, 5059 NULL); 5060} 5061 5062/* create or adjust a UnicodeTranslateError */ 5063static void make_translate_exception(PyObject **exceptionObject, 5064 const Py_UNICODE *unicode, Py_ssize_t size, 5065 Py_ssize_t startpos, Py_ssize_t endpos, 5066 const char *reason) 5067{ 5068 if (*exceptionObject == NULL) { 5069 *exceptionObject = PyUnicodeTranslateError_Create( 5070 unicode, size, startpos, endpos, reason); 5071 } 5072 else { 5073 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5074 goto onError; 5075 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5076 goto onError; 5077 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5078 goto onError; 5079 return; 5080 onError: 5081 Py_DECREF(*exceptionObject); 5082 *exceptionObject = NULL; 5083 } 5084} 5085 5086/* raises a UnicodeTranslateError */ 5087static void raise_translate_exception(PyObject **exceptionObject, 5088 const Py_UNICODE *unicode, Py_ssize_t size, 5089 Py_ssize_t startpos, Py_ssize_t endpos, 5090 const char *reason) 5091{ 5092 make_translate_exception(exceptionObject, 5093 unicode, size, startpos, endpos, reason); 5094 if (*exceptionObject != NULL) 5095 PyCodec_StrictErrors(*exceptionObject); 5096} 5097 5098/* error handling callback helper: 5099 build arguments, call the callback and check the arguments, 5100 put the result into newpos and return the replacement string, which 5101 has to be freed by the caller */ 5102static PyObject *unicode_translate_call_errorhandler(const char *errors, 5103 PyObject **errorHandler, 5104 const char *reason, 5105 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5106 Py_ssize_t startpos, Py_ssize_t endpos, 5107 Py_ssize_t *newpos) 5108{ 5109 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5110 5111 Py_ssize_t i_newpos; 5112 PyObject *restuple; 5113 PyObject *resunicode; 5114 5115 if (*errorHandler == NULL) { 5116 *errorHandler = PyCodec_LookupError(errors); 5117 if (*errorHandler == NULL) 5118 return NULL; 5119 } 5120 5121 make_translate_exception(exceptionObject, 5122 unicode, size, startpos, endpos, reason); 5123 if (*exceptionObject == NULL) 5124 return NULL; 5125 5126 restuple = PyObject_CallFunctionObjArgs( 5127 *errorHandler, *exceptionObject, NULL); 5128 if (restuple == NULL) 5129 return NULL; 5130 if (!PyTuple_Check(restuple)) { 5131 PyErr_Format(PyExc_TypeError, &argparse[4]); 5132 Py_DECREF(restuple); 5133 return NULL; 5134 } 5135 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 5136 &resunicode, &i_newpos)) { 5137 Py_DECREF(restuple); 5138 return NULL; 5139 } 5140 if (i_newpos<0) 5141 *newpos = size+i_newpos; 5142 else 5143 *newpos = i_newpos; 5144 if (*newpos<0 || *newpos>size) { 5145 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 5146 Py_DECREF(restuple); 5147 return NULL; 5148 } 5149 Py_INCREF(resunicode); 5150 Py_DECREF(restuple); 5151 return resunicode; 5152} 5153 5154/* Lookup the character ch in the mapping and put the result in result, 5155 which must be decrefed by the caller. 5156 Return 0 on success, -1 on error */ 5157static 5158int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5159{ 5160 PyObject *w = PyLong_FromLong((long)c); 5161 PyObject *x; 5162 5163 if (w == NULL) 5164 return -1; 5165 x = PyObject_GetItem(mapping, w); 5166 Py_DECREF(w); 5167 if (x == NULL) { 5168 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5169 /* No mapping found means: use 1:1 mapping. */ 5170 PyErr_Clear(); 5171 *result = NULL; 5172 return 0; 5173 } else 5174 return -1; 5175 } 5176 else if (x == Py_None) { 5177 *result = x; 5178 return 0; 5179 } 5180 else if (PyLong_Check(x)) { 5181 long value = PyLong_AS_LONG(x); 5182 long max = PyUnicode_GetMax(); 5183 if (value < 0 || value > max) { 5184 PyErr_Format(PyExc_TypeError, 5185 "character mapping must be in range(0x%x)", max+1); 5186 Py_DECREF(x); 5187 return -1; 5188 } 5189 *result = x; 5190 return 0; 5191 } 5192 else if (PyUnicode_Check(x)) { 5193 *result = x; 5194 return 0; 5195 } 5196 else { 5197 /* wrong return value */ 5198 PyErr_SetString(PyExc_TypeError, 5199 "character mapping must return integer, None or str"); 5200 Py_DECREF(x); 5201 return -1; 5202 } 5203} 5204/* ensure that *outobj is at least requiredsize characters long, 5205 if not reallocate and adjust various state variables. 5206 Return 0 on success, -1 on error */ 5207static 5208int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 5209 Py_ssize_t requiredsize) 5210{ 5211 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 5212 if (requiredsize > oldsize) { 5213 /* remember old output position */ 5214 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 5215 /* exponentially overallocate to minimize reallocations */ 5216 if (requiredsize < 2 * oldsize) 5217 requiredsize = 2 * oldsize; 5218 if (PyUnicode_Resize(outobj, requiredsize) < 0) 5219 return -1; 5220 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 5221 } 5222 return 0; 5223} 5224/* lookup the character, put the result in the output string and adjust 5225 various state variables. Return a new reference to the object that 5226 was put in the output buffer in *result, or Py_None, if the mapping was 5227 undefined (in which case no character was written). 5228 The called must decref result. 5229 Return 0 on success, -1 on error. */ 5230static 5231int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5232 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5233 PyObject **res) 5234{ 5235 if (charmaptranslate_lookup(*curinp, mapping, res)) 5236 return -1; 5237 if (*res==NULL) { 5238 /* not found => default to 1:1 mapping */ 5239 *(*outp)++ = *curinp; 5240 } 5241 else if (*res==Py_None) 5242 ; 5243 else if (PyLong_Check(*res)) { 5244 /* no overflow check, because we know that the space is enough */ 5245 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 5246 } 5247 else if (PyUnicode_Check(*res)) { 5248 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5249 if (repsize==1) { 5250 /* no overflow check, because we know that the space is enough */ 5251 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5252 } 5253 else if (repsize!=0) { 5254 /* more than one character */ 5255 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5256 (insize - (curinp-startinp)) + 5257 repsize - 1; 5258 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5259 return -1; 5260 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5261 *outp += repsize; 5262 } 5263 } 5264 else 5265 return -1; 5266 return 0; 5267} 5268 5269PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5270 Py_ssize_t size, 5271 PyObject *mapping, 5272 const char *errors) 5273{ 5274 /* output object */ 5275 PyObject *res = NULL; 5276 /* pointers to the beginning and end+1 of input */ 5277 const Py_UNICODE *startp = p; 5278 const Py_UNICODE *endp = p + size; 5279 /* pointer into the output */ 5280 Py_UNICODE *str; 5281 /* current output position */ 5282 Py_ssize_t respos = 0; 5283 char *reason = "character maps to <undefined>"; 5284 PyObject *errorHandler = NULL; 5285 PyObject *exc = NULL; 5286 /* the following variable is used for caching string comparisons 5287 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5288 * 3=ignore, 4=xmlcharrefreplace */ 5289 int known_errorHandler = -1; 5290 5291 if (mapping == NULL) { 5292 PyErr_BadArgument(); 5293 return NULL; 5294 } 5295 5296 /* allocate enough for a simple 1:1 translation without 5297 replacements, if we need more, we'll resize */ 5298 res = PyUnicode_FromUnicode(NULL, size); 5299 if (res == NULL) 5300 goto onError; 5301 if (size == 0) 5302 return res; 5303 str = PyUnicode_AS_UNICODE(res); 5304 5305 while (p<endp) { 5306 /* try to encode it */ 5307 PyObject *x = NULL; 5308 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5309 Py_XDECREF(x); 5310 goto onError; 5311 } 5312 Py_XDECREF(x); 5313 if (x!=Py_None) /* it worked => adjust input pointer */ 5314 ++p; 5315 else { /* untranslatable character */ 5316 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5317 Py_ssize_t repsize; 5318 Py_ssize_t newpos; 5319 Py_UNICODE *uni2; 5320 /* startpos for collecting untranslatable chars */ 5321 const Py_UNICODE *collstart = p; 5322 const Py_UNICODE *collend = p+1; 5323 const Py_UNICODE *coll; 5324 5325 /* find all untranslatable characters */ 5326 while (collend < endp) { 5327 if (charmaptranslate_lookup(*collend, mapping, &x)) 5328 goto onError; 5329 Py_XDECREF(x); 5330 if (x!=Py_None) 5331 break; 5332 ++collend; 5333 } 5334 /* cache callback name lookup 5335 * (if not done yet, i.e. it's the first error) */ 5336 if (known_errorHandler==-1) { 5337 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5338 known_errorHandler = 1; 5339 else if (!strcmp(errors, "replace")) 5340 known_errorHandler = 2; 5341 else if (!strcmp(errors, "ignore")) 5342 known_errorHandler = 3; 5343 else if (!strcmp(errors, "xmlcharrefreplace")) 5344 known_errorHandler = 4; 5345 else 5346 known_errorHandler = 0; 5347 } 5348 switch (known_errorHandler) { 5349 case 1: /* strict */ 5350 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5351 goto onError; 5352 case 2: /* replace */ 5353 /* No need to check for space, this is a 1:1 replacement */ 5354 for (coll = collstart; coll<collend; ++coll) 5355 *str++ = '?'; 5356 /* fall through */ 5357 case 3: /* ignore */ 5358 p = collend; 5359 break; 5360 case 4: /* xmlcharrefreplace */ 5361 /* generate replacement (temporarily (mis)uses p) */ 5362 for (p = collstart; p < collend; ++p) { 5363 char buffer[2+29+1+1]; 5364 char *cp; 5365 sprintf(buffer, "&#%d;", (int)*p); 5366 if (charmaptranslate_makespace(&res, &str, 5367 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5368 goto onError; 5369 for (cp = buffer; *cp; ++cp) 5370 *str++ = *cp; 5371 } 5372 p = collend; 5373 break; 5374 default: 5375 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5376 reason, startp, size, &exc, 5377 collstart-startp, collend-startp, &newpos); 5378 if (repunicode == NULL) 5379 goto onError; 5380 /* generate replacement */ 5381 repsize = PyUnicode_GET_SIZE(repunicode); 5382 if (charmaptranslate_makespace(&res, &str, 5383 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5384 Py_DECREF(repunicode); 5385 goto onError; 5386 } 5387 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5388 *str++ = *uni2; 5389 p = startp + newpos; 5390 Py_DECREF(repunicode); 5391 } 5392 } 5393 } 5394 /* Resize if we allocated to much */ 5395 respos = str-PyUnicode_AS_UNICODE(res); 5396 if (respos<PyUnicode_GET_SIZE(res)) { 5397 if (PyUnicode_Resize(&res, respos) < 0) 5398 goto onError; 5399 } 5400 Py_XDECREF(exc); 5401 Py_XDECREF(errorHandler); 5402 return res; 5403 5404 onError: 5405 Py_XDECREF(res); 5406 Py_XDECREF(exc); 5407 Py_XDECREF(errorHandler); 5408 return NULL; 5409} 5410 5411PyObject *PyUnicode_Translate(PyObject *str, 5412 PyObject *mapping, 5413 const char *errors) 5414{ 5415 PyObject *result; 5416 5417 str = PyUnicode_FromObject(str); 5418 if (str == NULL) 5419 goto onError; 5420 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5421 PyUnicode_GET_SIZE(str), 5422 mapping, 5423 errors); 5424 Py_DECREF(str); 5425 return result; 5426 5427 onError: 5428 Py_XDECREF(str); 5429 return NULL; 5430} 5431 5432/* --- Decimal Encoder ---------------------------------------------------- */ 5433 5434int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5435 Py_ssize_t length, 5436 char *output, 5437 const char *errors) 5438{ 5439 Py_UNICODE *p, *end; 5440 PyObject *errorHandler = NULL; 5441 PyObject *exc = NULL; 5442 const char *encoding = "decimal"; 5443 const char *reason = "invalid decimal Unicode string"; 5444 /* the following variable is used for caching string comparisons 5445 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5446 int known_errorHandler = -1; 5447 5448 if (output == NULL) { 5449 PyErr_BadArgument(); 5450 return -1; 5451 } 5452 5453 p = s; 5454 end = s + length; 5455 while (p < end) { 5456 register Py_UNICODE ch = *p; 5457 int decimal; 5458 PyObject *repunicode; 5459 Py_ssize_t repsize; 5460 Py_ssize_t newpos; 5461 Py_UNICODE *uni2; 5462 Py_UNICODE *collstart; 5463 Py_UNICODE *collend; 5464 5465 if (Py_UNICODE_ISSPACE(ch)) { 5466 *output++ = ' '; 5467 ++p; 5468 continue; 5469 } 5470 decimal = Py_UNICODE_TODECIMAL(ch); 5471 if (decimal >= 0) { 5472 *output++ = '0' + decimal; 5473 ++p; 5474 continue; 5475 } 5476 if (0 < ch && ch < 256) { 5477 *output++ = (char)ch; 5478 ++p; 5479 continue; 5480 } 5481 /* All other characters are considered unencodable */ 5482 collstart = p; 5483 collend = p+1; 5484 while (collend < end) { 5485 if ((0 < *collend && *collend < 256) || 5486 !Py_UNICODE_ISSPACE(*collend) || 5487 Py_UNICODE_TODECIMAL(*collend)) 5488 break; 5489 } 5490 /* cache callback name lookup 5491 * (if not done yet, i.e. it's the first error) */ 5492 if (known_errorHandler==-1) { 5493 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5494 known_errorHandler = 1; 5495 else if (!strcmp(errors, "replace")) 5496 known_errorHandler = 2; 5497 else if (!strcmp(errors, "ignore")) 5498 known_errorHandler = 3; 5499 else if (!strcmp(errors, "xmlcharrefreplace")) 5500 known_errorHandler = 4; 5501 else 5502 known_errorHandler = 0; 5503 } 5504 switch (known_errorHandler) { 5505 case 1: /* strict */ 5506 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5507 goto onError; 5508 case 2: /* replace */ 5509 for (p = collstart; p < collend; ++p) 5510 *output++ = '?'; 5511 /* fall through */ 5512 case 3: /* ignore */ 5513 p = collend; 5514 break; 5515 case 4: /* xmlcharrefreplace */ 5516 /* generate replacement (temporarily (mis)uses p) */ 5517 for (p = collstart; p < collend; ++p) 5518 output += sprintf(output, "&#%d;", (int)*p); 5519 p = collend; 5520 break; 5521 default: 5522 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5523 encoding, reason, s, length, &exc, 5524 collstart-s, collend-s, &newpos); 5525 if (repunicode == NULL) 5526 goto onError; 5527 /* generate replacement */ 5528 repsize = PyUnicode_GET_SIZE(repunicode); 5529 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5530 Py_UNICODE ch = *uni2; 5531 if (Py_UNICODE_ISSPACE(ch)) 5532 *output++ = ' '; 5533 else { 5534 decimal = Py_UNICODE_TODECIMAL(ch); 5535 if (decimal >= 0) 5536 *output++ = '0' + decimal; 5537 else if (0 < ch && ch < 256) 5538 *output++ = (char)ch; 5539 else { 5540 Py_DECREF(repunicode); 5541 raise_encode_exception(&exc, encoding, 5542 s, length, collstart-s, collend-s, reason); 5543 goto onError; 5544 } 5545 } 5546 } 5547 p = s + newpos; 5548 Py_DECREF(repunicode); 5549 } 5550 } 5551 /* 0-terminate the output string */ 5552 *output++ = '\0'; 5553 Py_XDECREF(exc); 5554 Py_XDECREF(errorHandler); 5555 return 0; 5556 5557 onError: 5558 Py_XDECREF(exc); 5559 Py_XDECREF(errorHandler); 5560 return -1; 5561} 5562 5563/* --- Helpers ------------------------------------------------------------ */ 5564 5565#include "stringlib/unicodedefs.h" 5566#include "stringlib/fastsearch.h" 5567#include "stringlib/count.h" 5568/* Include _ParseTupleFinds from find.h */ 5569#define FROM_UNICODE 5570#include "stringlib/find.h" 5571#include "stringlib/partition.h" 5572 5573#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 5574#include "stringlib/localeutil.h" 5575 5576/* helper macro to fixup start/end slice values */ 5577#define FIX_START_END(obj) \ 5578 if (start < 0) \ 5579 start += (obj)->length; \ 5580 if (start < 0) \ 5581 start = 0; \ 5582 if (end > (obj)->length) \ 5583 end = (obj)->length; \ 5584 if (end < 0) \ 5585 end += (obj)->length; \ 5586 if (end < 0) \ 5587 end = 0; 5588 5589Py_ssize_t PyUnicode_Count(PyObject *str, 5590 PyObject *substr, 5591 Py_ssize_t start, 5592 Py_ssize_t end) 5593{ 5594 Py_ssize_t result; 5595 PyUnicodeObject* str_obj; 5596 PyUnicodeObject* sub_obj; 5597 5598 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5599 if (!str_obj) 5600 return -1; 5601 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5602 if (!sub_obj) { 5603 Py_DECREF(str_obj); 5604 return -1; 5605 } 5606 5607 FIX_START_END(str_obj); 5608 5609 result = stringlib_count( 5610 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5611 ); 5612 5613 Py_DECREF(sub_obj); 5614 Py_DECREF(str_obj); 5615 5616 return result; 5617} 5618 5619Py_ssize_t PyUnicode_Find(PyObject *str, 5620 PyObject *sub, 5621 Py_ssize_t start, 5622 Py_ssize_t end, 5623 int direction) 5624{ 5625 Py_ssize_t result; 5626 5627 str = PyUnicode_FromObject(str); 5628 if (!str) 5629 return -2; 5630 sub = PyUnicode_FromObject(sub); 5631 if (!sub) { 5632 Py_DECREF(str); 5633 return -2; 5634 } 5635 5636 if (direction > 0) 5637 result = stringlib_find_slice( 5638 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5639 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5640 start, end 5641 ); 5642 else 5643 result = stringlib_rfind_slice( 5644 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5645 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5646 start, end 5647 ); 5648 5649 Py_DECREF(str); 5650 Py_DECREF(sub); 5651 5652 return result; 5653} 5654 5655static 5656int tailmatch(PyUnicodeObject *self, 5657 PyUnicodeObject *substring, 5658 Py_ssize_t start, 5659 Py_ssize_t end, 5660 int direction) 5661{ 5662 if (substring->length == 0) 5663 return 1; 5664 5665 FIX_START_END(self); 5666 5667 end -= substring->length; 5668 if (end < start) 5669 return 0; 5670 5671 if (direction > 0) { 5672 if (Py_UNICODE_MATCH(self, end, substring)) 5673 return 1; 5674 } else { 5675 if (Py_UNICODE_MATCH(self, start, substring)) 5676 return 1; 5677 } 5678 5679 return 0; 5680} 5681 5682Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5683 PyObject *substr, 5684 Py_ssize_t start, 5685 Py_ssize_t end, 5686 int direction) 5687{ 5688 Py_ssize_t result; 5689 5690 str = PyUnicode_FromObject(str); 5691 if (str == NULL) 5692 return -1; 5693 substr = PyUnicode_FromObject(substr); 5694 if (substr == NULL) { 5695 Py_DECREF(str); 5696 return -1; 5697 } 5698 5699 result = tailmatch((PyUnicodeObject *)str, 5700 (PyUnicodeObject *)substr, 5701 start, end, direction); 5702 Py_DECREF(str); 5703 Py_DECREF(substr); 5704 return result; 5705} 5706 5707/* Apply fixfct filter to the Unicode object self and return a 5708 reference to the modified object */ 5709 5710static 5711PyObject *fixup(PyUnicodeObject *self, 5712 int (*fixfct)(PyUnicodeObject *s)) 5713{ 5714 5715 PyUnicodeObject *u; 5716 5717 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5718 if (u == NULL) 5719 return NULL; 5720 5721 Py_UNICODE_COPY(u->str, self->str, self->length); 5722 5723 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5724 /* fixfct should return TRUE if it modified the buffer. If 5725 FALSE, return a reference to the original buffer instead 5726 (to save space, not time) */ 5727 Py_INCREF(self); 5728 Py_DECREF(u); 5729 return (PyObject*) self; 5730 } 5731 return (PyObject*) u; 5732} 5733 5734static 5735int fixupper(PyUnicodeObject *self) 5736{ 5737 Py_ssize_t len = self->length; 5738 Py_UNICODE *s = self->str; 5739 int status = 0; 5740 5741 while (len-- > 0) { 5742 register Py_UNICODE ch; 5743 5744 ch = Py_UNICODE_TOUPPER(*s); 5745 if (ch != *s) { 5746 status = 1; 5747 *s = ch; 5748 } 5749 s++; 5750 } 5751 5752 return status; 5753} 5754 5755static 5756int fixlower(PyUnicodeObject *self) 5757{ 5758 Py_ssize_t len = self->length; 5759 Py_UNICODE *s = self->str; 5760 int status = 0; 5761 5762 while (len-- > 0) { 5763 register Py_UNICODE ch; 5764 5765 ch = Py_UNICODE_TOLOWER(*s); 5766 if (ch != *s) { 5767 status = 1; 5768 *s = ch; 5769 } 5770 s++; 5771 } 5772 5773 return status; 5774} 5775 5776static 5777int fixswapcase(PyUnicodeObject *self) 5778{ 5779 Py_ssize_t len = self->length; 5780 Py_UNICODE *s = self->str; 5781 int status = 0; 5782 5783 while (len-- > 0) { 5784 if (Py_UNICODE_ISUPPER(*s)) { 5785 *s = Py_UNICODE_TOLOWER(*s); 5786 status = 1; 5787 } else if (Py_UNICODE_ISLOWER(*s)) { 5788 *s = Py_UNICODE_TOUPPER(*s); 5789 status = 1; 5790 } 5791 s++; 5792 } 5793 5794 return status; 5795} 5796 5797static 5798int fixcapitalize(PyUnicodeObject *self) 5799{ 5800 Py_ssize_t len = self->length; 5801 Py_UNICODE *s = self->str; 5802 int status = 0; 5803 5804 if (len == 0) 5805 return 0; 5806 if (Py_UNICODE_ISLOWER(*s)) { 5807 *s = Py_UNICODE_TOUPPER(*s); 5808 status = 1; 5809 } 5810 s++; 5811 while (--len > 0) { 5812 if (Py_UNICODE_ISUPPER(*s)) { 5813 *s = Py_UNICODE_TOLOWER(*s); 5814 status = 1; 5815 } 5816 s++; 5817 } 5818 return status; 5819} 5820 5821static 5822int fixtitle(PyUnicodeObject *self) 5823{ 5824 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5825 register Py_UNICODE *e; 5826 int previous_is_cased; 5827 5828 /* Shortcut for single character strings */ 5829 if (PyUnicode_GET_SIZE(self) == 1) { 5830 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5831 if (*p != ch) { 5832 *p = ch; 5833 return 1; 5834 } 5835 else 5836 return 0; 5837 } 5838 5839 e = p + PyUnicode_GET_SIZE(self); 5840 previous_is_cased = 0; 5841 for (; p < e; p++) { 5842 register const Py_UNICODE ch = *p; 5843 5844 if (previous_is_cased) 5845 *p = Py_UNICODE_TOLOWER(ch); 5846 else 5847 *p = Py_UNICODE_TOTITLE(ch); 5848 5849 if (Py_UNICODE_ISLOWER(ch) || 5850 Py_UNICODE_ISUPPER(ch) || 5851 Py_UNICODE_ISTITLE(ch)) 5852 previous_is_cased = 1; 5853 else 5854 previous_is_cased = 0; 5855 } 5856 return 1; 5857} 5858 5859PyObject * 5860PyUnicode_Join(PyObject *separator, PyObject *seq) 5861{ 5862 const Py_UNICODE blank = ' '; 5863 const Py_UNICODE *sep = ␣ 5864 Py_ssize_t seplen = 1; 5865 PyUnicodeObject *res = NULL; /* the result */ 5866 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5867 PyObject *fseq; /* PySequence_Fast(seq) */ 5868 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5869 PyObject **items; 5870 PyObject *item; 5871 Py_ssize_t sz, i; 5872 5873 fseq = PySequence_Fast(seq, ""); 5874 if (fseq == NULL) { 5875 return NULL; 5876 } 5877 5878 /* NOTE: the following code can't call back into Python code, 5879 * so we are sure that fseq won't be mutated. 5880 */ 5881 5882 seqlen = PySequence_Fast_GET_SIZE(fseq); 5883 /* If empty sequence, return u"". */ 5884 if (seqlen == 0) { 5885 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5886 goto Done; 5887 } 5888 items = PySequence_Fast_ITEMS(fseq); 5889 /* If singleton sequence with an exact Unicode, return that. */ 5890 if (seqlen == 1) { 5891 item = items[0]; 5892 if (PyUnicode_CheckExact(item)) { 5893 Py_INCREF(item); 5894 res = (PyUnicodeObject *)item; 5895 goto Done; 5896 } 5897 } 5898 else { 5899 /* Set up sep and seplen */ 5900 if (separator == NULL) { 5901 sep = ␣ 5902 seplen = 1; 5903 } 5904 else { 5905 if (!PyUnicode_Check(separator)) { 5906 PyErr_Format(PyExc_TypeError, 5907 "separator: expected str instance," 5908 " %.80s found", 5909 Py_TYPE(separator)->tp_name); 5910 goto onError; 5911 } 5912 sep = PyUnicode_AS_UNICODE(separator); 5913 seplen = PyUnicode_GET_SIZE(separator); 5914 } 5915 } 5916 5917 /* There are at least two things to join, or else we have a subclass 5918 * of str in the sequence. 5919 * Do a pre-pass to figure out the total amount of space we'll 5920 * need (sz), and see whether all argument are strings. 5921 */ 5922 sz = 0; 5923 for (i = 0; i < seqlen; i++) { 5924 const Py_ssize_t old_sz = sz; 5925 item = items[i]; 5926 if (!PyUnicode_Check(item)) { 5927 PyErr_Format(PyExc_TypeError, 5928 "sequence item %zd: expected str instance," 5929 " %.80s found", 5930 i, Py_TYPE(item)->tp_name); 5931 goto onError; 5932 } 5933 sz += PyUnicode_GET_SIZE(item); 5934 if (i != 0) 5935 sz += seplen; 5936 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 5937 PyErr_SetString(PyExc_OverflowError, 5938 "join() result is too long for a Python string"); 5939 goto onError; 5940 } 5941 } 5942 5943 res = _PyUnicode_New(sz); 5944 if (res == NULL) 5945 goto onError; 5946 5947 /* Catenate everything. */ 5948 res_p = PyUnicode_AS_UNICODE(res); 5949 for (i = 0; i < seqlen; ++i) { 5950 Py_ssize_t itemlen; 5951 item = items[i]; 5952 itemlen = PyUnicode_GET_SIZE(item); 5953 /* Copy item, and maybe the separator. */ 5954 if (i) { 5955 Py_UNICODE_COPY(res_p, sep, seplen); 5956 res_p += seplen; 5957 } 5958 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5959 res_p += itemlen; 5960 } 5961 5962 Done: 5963 Py_DECREF(fseq); 5964 return (PyObject *)res; 5965 5966 onError: 5967 Py_DECREF(fseq); 5968 Py_XDECREF(res); 5969 return NULL; 5970} 5971 5972static 5973PyUnicodeObject *pad(PyUnicodeObject *self, 5974 Py_ssize_t left, 5975 Py_ssize_t right, 5976 Py_UNICODE fill) 5977{ 5978 PyUnicodeObject *u; 5979 5980 if (left < 0) 5981 left = 0; 5982 if (right < 0) 5983 right = 0; 5984 5985 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5986 Py_INCREF(self); 5987 return self; 5988 } 5989 5990 if (left > PY_SSIZE_T_MAX - self->length || 5991 right > PY_SSIZE_T_MAX - (left + self->length)) { 5992 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 5993 return NULL; 5994 } 5995 u = _PyUnicode_New(left + self->length + right); 5996 if (u) { 5997 if (left) 5998 Py_UNICODE_FILL(u->str, fill, left); 5999 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6000 if (right) 6001 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6002 } 6003 6004 return u; 6005} 6006 6007#define SPLIT_APPEND(data, left, right) \ 6008 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 6009 if (!str) \ 6010 goto onError; \ 6011 if (PyList_Append(list, str)) { \ 6012 Py_DECREF(str); \ 6013 goto onError; \ 6014 } \ 6015 else \ 6016 Py_DECREF(str); 6017 6018static 6019PyObject *split_whitespace(PyUnicodeObject *self, 6020 PyObject *list, 6021 Py_ssize_t maxcount) 6022{ 6023 register Py_ssize_t i; 6024 register Py_ssize_t j; 6025 Py_ssize_t len = self->length; 6026 PyObject *str; 6027 register const Py_UNICODE *buf = self->str; 6028 6029 for (i = j = 0; i < len; ) { 6030 /* find a token */ 6031 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 6032 i++; 6033 j = i; 6034 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) 6035 i++; 6036 if (j < i) { 6037 if (maxcount-- <= 0) 6038 break; 6039 SPLIT_APPEND(buf, j, i); 6040 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 6041 i++; 6042 j = i; 6043 } 6044 } 6045 if (j < len) { 6046 SPLIT_APPEND(buf, j, len); 6047 } 6048 return list; 6049 6050 onError: 6051 Py_DECREF(list); 6052 return NULL; 6053} 6054 6055PyObject *PyUnicode_Splitlines(PyObject *string, 6056 int keepends) 6057{ 6058 register Py_ssize_t i; 6059 register Py_ssize_t j; 6060 Py_ssize_t len; 6061 PyObject *list; 6062 PyObject *str; 6063 Py_UNICODE *data; 6064 6065 string = PyUnicode_FromObject(string); 6066 if (string == NULL) 6067 return NULL; 6068 data = PyUnicode_AS_UNICODE(string); 6069 len = PyUnicode_GET_SIZE(string); 6070 6071 list = PyList_New(0); 6072 if (!list) 6073 goto onError; 6074 6075 for (i = j = 0; i < len; ) { 6076 Py_ssize_t eol; 6077 6078 /* Find a line and append it */ 6079 while (i < len && !BLOOM_LINEBREAK(data[i])) 6080 i++; 6081 6082 /* Skip the line break reading CRLF as one line break */ 6083 eol = i; 6084 if (i < len) { 6085 if (data[i] == '\r' && i + 1 < len && 6086 data[i+1] == '\n') 6087 i += 2; 6088 else 6089 i++; 6090 if (keepends) 6091 eol = i; 6092 } 6093 SPLIT_APPEND(data, j, eol); 6094 j = i; 6095 } 6096 if (j < len) { 6097 SPLIT_APPEND(data, j, len); 6098 } 6099 6100 Py_DECREF(string); 6101 return list; 6102 6103 onError: 6104 Py_XDECREF(list); 6105 Py_DECREF(string); 6106 return NULL; 6107} 6108 6109static 6110PyObject *split_char(PyUnicodeObject *self, 6111 PyObject *list, 6112 Py_UNICODE ch, 6113 Py_ssize_t maxcount) 6114{ 6115 register Py_ssize_t i; 6116 register Py_ssize_t j; 6117 Py_ssize_t len = self->length; 6118 PyObject *str; 6119 register const Py_UNICODE *buf = self->str; 6120 6121 for (i = j = 0; i < len; ) { 6122 if (buf[i] == ch) { 6123 if (maxcount-- <= 0) 6124 break; 6125 SPLIT_APPEND(buf, j, i); 6126 i = j = i + 1; 6127 } else 6128 i++; 6129 } 6130 if (j <= len) { 6131 SPLIT_APPEND(buf, j, len); 6132 } 6133 return list; 6134 6135 onError: 6136 Py_DECREF(list); 6137 return NULL; 6138} 6139 6140static 6141PyObject *split_substring(PyUnicodeObject *self, 6142 PyObject *list, 6143 PyUnicodeObject *substring, 6144 Py_ssize_t maxcount) 6145{ 6146 register Py_ssize_t i; 6147 register Py_ssize_t j; 6148 Py_ssize_t len = self->length; 6149 Py_ssize_t sublen = substring->length; 6150 PyObject *str; 6151 6152 for (i = j = 0; i <= len - sublen; ) { 6153 if (Py_UNICODE_MATCH(self, i, substring)) { 6154 if (maxcount-- <= 0) 6155 break; 6156 SPLIT_APPEND(self->str, j, i); 6157 i = j = i + sublen; 6158 } else 6159 i++; 6160 } 6161 if (j <= len) { 6162 SPLIT_APPEND(self->str, j, len); 6163 } 6164 return list; 6165 6166 onError: 6167 Py_DECREF(list); 6168 return NULL; 6169} 6170 6171static 6172PyObject *rsplit_whitespace(PyUnicodeObject *self, 6173 PyObject *list, 6174 Py_ssize_t maxcount) 6175{ 6176 register Py_ssize_t i; 6177 register Py_ssize_t j; 6178 Py_ssize_t len = self->length; 6179 PyObject *str; 6180 register const Py_UNICODE *buf = self->str; 6181 6182 for (i = j = len - 1; i >= 0; ) { 6183 /* find a token */ 6184 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 6185 i--; 6186 j = i; 6187 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) 6188 i--; 6189 if (j > i) { 6190 if (maxcount-- <= 0) 6191 break; 6192 SPLIT_APPEND(buf, i + 1, j + 1); 6193 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 6194 i--; 6195 j = i; 6196 } 6197 } 6198 if (j >= 0) { 6199 SPLIT_APPEND(buf, 0, j + 1); 6200 } 6201 if (PyList_Reverse(list) < 0) 6202 goto onError; 6203 return list; 6204 6205 onError: 6206 Py_DECREF(list); 6207 return NULL; 6208} 6209 6210static 6211PyObject *rsplit_char(PyUnicodeObject *self, 6212 PyObject *list, 6213 Py_UNICODE ch, 6214 Py_ssize_t maxcount) 6215{ 6216 register Py_ssize_t i; 6217 register Py_ssize_t j; 6218 Py_ssize_t len = self->length; 6219 PyObject *str; 6220 register const Py_UNICODE *buf = self->str; 6221 6222 for (i = j = len - 1; i >= 0; ) { 6223 if (buf[i] == ch) { 6224 if (maxcount-- <= 0) 6225 break; 6226 SPLIT_APPEND(buf, i + 1, j + 1); 6227 j = i = i - 1; 6228 } else 6229 i--; 6230 } 6231 if (j >= -1) { 6232 SPLIT_APPEND(buf, 0, j + 1); 6233 } 6234 if (PyList_Reverse(list) < 0) 6235 goto onError; 6236 return list; 6237 6238 onError: 6239 Py_DECREF(list); 6240 return NULL; 6241} 6242 6243static 6244PyObject *rsplit_substring(PyUnicodeObject *self, 6245 PyObject *list, 6246 PyUnicodeObject *substring, 6247 Py_ssize_t maxcount) 6248{ 6249 register Py_ssize_t i; 6250 register Py_ssize_t j; 6251 Py_ssize_t len = self->length; 6252 Py_ssize_t sublen = substring->length; 6253 PyObject *str; 6254 6255 for (i = len - sublen, j = len; i >= 0; ) { 6256 if (Py_UNICODE_MATCH(self, i, substring)) { 6257 if (maxcount-- <= 0) 6258 break; 6259 SPLIT_APPEND(self->str, i + sublen, j); 6260 j = i; 6261 i -= sublen; 6262 } else 6263 i--; 6264 } 6265 if (j >= 0) { 6266 SPLIT_APPEND(self->str, 0, j); 6267 } 6268 if (PyList_Reverse(list) < 0) 6269 goto onError; 6270 return list; 6271 6272 onError: 6273 Py_DECREF(list); 6274 return NULL; 6275} 6276 6277#undef SPLIT_APPEND 6278 6279static 6280PyObject *split(PyUnicodeObject *self, 6281 PyUnicodeObject *substring, 6282 Py_ssize_t maxcount) 6283{ 6284 PyObject *list; 6285 6286 if (maxcount < 0) 6287 maxcount = PY_SSIZE_T_MAX; 6288 6289 list = PyList_New(0); 6290 if (!list) 6291 return NULL; 6292 6293 if (substring == NULL) 6294 return split_whitespace(self,list,maxcount); 6295 6296 else if (substring->length == 1) 6297 return split_char(self,list,substring->str[0],maxcount); 6298 6299 else if (substring->length == 0) { 6300 Py_DECREF(list); 6301 PyErr_SetString(PyExc_ValueError, "empty separator"); 6302 return NULL; 6303 } 6304 else 6305 return split_substring(self,list,substring,maxcount); 6306} 6307 6308static 6309PyObject *rsplit(PyUnicodeObject *self, 6310 PyUnicodeObject *substring, 6311 Py_ssize_t maxcount) 6312{ 6313 PyObject *list; 6314 6315 if (maxcount < 0) 6316 maxcount = PY_SSIZE_T_MAX; 6317 6318 list = PyList_New(0); 6319 if (!list) 6320 return NULL; 6321 6322 if (substring == NULL) 6323 return rsplit_whitespace(self,list,maxcount); 6324 6325 else if (substring->length == 1) 6326 return rsplit_char(self,list,substring->str[0],maxcount); 6327 6328 else if (substring->length == 0) { 6329 Py_DECREF(list); 6330 PyErr_SetString(PyExc_ValueError, "empty separator"); 6331 return NULL; 6332 } 6333 else 6334 return rsplit_substring(self,list,substring,maxcount); 6335} 6336 6337static 6338PyObject *replace(PyUnicodeObject *self, 6339 PyUnicodeObject *str1, 6340 PyUnicodeObject *str2, 6341 Py_ssize_t maxcount) 6342{ 6343 PyUnicodeObject *u; 6344 6345 if (maxcount < 0) 6346 maxcount = PY_SSIZE_T_MAX; 6347 6348 if (str1->length == str2->length) { 6349 /* same length */ 6350 Py_ssize_t i; 6351 if (str1->length == 1) { 6352 /* replace characters */ 6353 Py_UNICODE u1, u2; 6354 if (!findchar(self->str, self->length, str1->str[0])) 6355 goto nothing; 6356 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6357 if (!u) 6358 return NULL; 6359 Py_UNICODE_COPY(u->str, self->str, self->length); 6360 u1 = str1->str[0]; 6361 u2 = str2->str[0]; 6362 for (i = 0; i < u->length; i++) 6363 if (u->str[i] == u1) { 6364 if (--maxcount < 0) 6365 break; 6366 u->str[i] = u2; 6367 } 6368 } else { 6369 i = fastsearch( 6370 self->str, self->length, str1->str, str1->length, FAST_SEARCH 6371 ); 6372 if (i < 0) 6373 goto nothing; 6374 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6375 if (!u) 6376 return NULL; 6377 Py_UNICODE_COPY(u->str, self->str, self->length); 6378 while (i <= self->length - str1->length) 6379 if (Py_UNICODE_MATCH(self, i, str1)) { 6380 if (--maxcount < 0) 6381 break; 6382 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6383 i += str1->length; 6384 } else 6385 i++; 6386 } 6387 } else { 6388 6389 Py_ssize_t n, i, j, e; 6390 Py_ssize_t product, new_size, delta; 6391 Py_UNICODE *p; 6392 6393 /* replace strings */ 6394 n = stringlib_count(self->str, self->length, str1->str, str1->length); 6395 if (n > maxcount) 6396 n = maxcount; 6397 if (n == 0) 6398 goto nothing; 6399 /* new_size = self->length + n * (str2->length - str1->length)); */ 6400 delta = (str2->length - str1->length); 6401 if (delta == 0) { 6402 new_size = self->length; 6403 } else { 6404 product = n * (str2->length - str1->length); 6405 if ((product / (str2->length - str1->length)) != n) { 6406 PyErr_SetString(PyExc_OverflowError, 6407 "replace string is too long"); 6408 return NULL; 6409 } 6410 new_size = self->length + product; 6411 if (new_size < 0) { 6412 PyErr_SetString(PyExc_OverflowError, 6413 "replace string is too long"); 6414 return NULL; 6415 } 6416 } 6417 u = _PyUnicode_New(new_size); 6418 if (!u) 6419 return NULL; 6420 i = 0; 6421 p = u->str; 6422 e = self->length - str1->length; 6423 if (str1->length > 0) { 6424 while (n-- > 0) { 6425 /* look for next match */ 6426 j = i; 6427 while (j <= e) { 6428 if (Py_UNICODE_MATCH(self, j, str1)) 6429 break; 6430 j++; 6431 } 6432 if (j > i) { 6433 if (j > e) 6434 break; 6435 /* copy unchanged part [i:j] */ 6436 Py_UNICODE_COPY(p, self->str+i, j-i); 6437 p += j - i; 6438 } 6439 /* copy substitution string */ 6440 if (str2->length > 0) { 6441 Py_UNICODE_COPY(p, str2->str, str2->length); 6442 p += str2->length; 6443 } 6444 i = j + str1->length; 6445 } 6446 if (i < self->length) 6447 /* copy tail [i:] */ 6448 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6449 } else { 6450 /* interleave */ 6451 while (n > 0) { 6452 Py_UNICODE_COPY(p, str2->str, str2->length); 6453 p += str2->length; 6454 if (--n <= 0) 6455 break; 6456 *p++ = self->str[i++]; 6457 } 6458 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6459 } 6460 } 6461 return (PyObject *) u; 6462 6463 nothing: 6464 /* nothing to replace; return original string (when possible) */ 6465 if (PyUnicode_CheckExact(self)) { 6466 Py_INCREF(self); 6467 return (PyObject *) self; 6468 } 6469 return PyUnicode_FromUnicode(self->str, self->length); 6470} 6471 6472/* --- Unicode Object Methods --------------------------------------------- */ 6473 6474PyDoc_STRVAR(title__doc__, 6475 "S.title() -> str\n\ 6476\n\ 6477Return a titlecased version of S, i.e. words start with title case\n\ 6478characters, all remaining cased characters have lower case."); 6479 6480static PyObject* 6481unicode_title(PyUnicodeObject *self) 6482{ 6483 return fixup(self, fixtitle); 6484} 6485 6486PyDoc_STRVAR(capitalize__doc__, 6487 "S.capitalize() -> str\n\ 6488\n\ 6489Return a capitalized version of S, i.e. make the first character\n\ 6490have upper case."); 6491 6492static PyObject* 6493unicode_capitalize(PyUnicodeObject *self) 6494{ 6495 return fixup(self, fixcapitalize); 6496} 6497 6498#if 0 6499PyDoc_STRVAR(capwords__doc__, 6500 "S.capwords() -> str\n\ 6501\n\ 6502Apply .capitalize() to all words in S and return the result with\n\ 6503normalized whitespace (all whitespace strings are replaced by ' ')."); 6504 6505static PyObject* 6506unicode_capwords(PyUnicodeObject *self) 6507{ 6508 PyObject *list; 6509 PyObject *item; 6510 Py_ssize_t i; 6511 6512 /* Split into words */ 6513 list = split(self, NULL, -1); 6514 if (!list) 6515 return NULL; 6516 6517 /* Capitalize each word */ 6518 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6519 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6520 fixcapitalize); 6521 if (item == NULL) 6522 goto onError; 6523 Py_DECREF(PyList_GET_ITEM(list, i)); 6524 PyList_SET_ITEM(list, i, item); 6525 } 6526 6527 /* Join the words to form a new string */ 6528 item = PyUnicode_Join(NULL, list); 6529 6530 onError: 6531 Py_DECREF(list); 6532 return (PyObject *)item; 6533} 6534#endif 6535 6536/* Argument converter. Coerces to a single unicode character */ 6537 6538static int 6539convert_uc(PyObject *obj, void *addr) 6540{ 6541 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6542 PyObject *uniobj; 6543 Py_UNICODE *unistr; 6544 6545 uniobj = PyUnicode_FromObject(obj); 6546 if (uniobj == NULL) { 6547 PyErr_SetString(PyExc_TypeError, 6548 "The fill character cannot be converted to Unicode"); 6549 return 0; 6550 } 6551 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6552 PyErr_SetString(PyExc_TypeError, 6553 "The fill character must be exactly one character long"); 6554 Py_DECREF(uniobj); 6555 return 0; 6556 } 6557 unistr = PyUnicode_AS_UNICODE(uniobj); 6558 *fillcharloc = unistr[0]; 6559 Py_DECREF(uniobj); 6560 return 1; 6561} 6562 6563PyDoc_STRVAR(center__doc__, 6564 "S.center(width[, fillchar]) -> str\n\ 6565\n\ 6566Return S centered in a string of length width. Padding is\n\ 6567done using the specified fill character (default is a space)"); 6568 6569static PyObject * 6570unicode_center(PyUnicodeObject *self, PyObject *args) 6571{ 6572 Py_ssize_t marg, left; 6573 Py_ssize_t width; 6574 Py_UNICODE fillchar = ' '; 6575 6576 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6577 return NULL; 6578 6579 if (self->length >= width && PyUnicode_CheckExact(self)) { 6580 Py_INCREF(self); 6581 return (PyObject*) self; 6582 } 6583 6584 marg = width - self->length; 6585 left = marg / 2 + (marg & width & 1); 6586 6587 return (PyObject*) pad(self, left, marg - left, fillchar); 6588} 6589 6590#if 0 6591 6592/* This code should go into some future Unicode collation support 6593 module. The basic comparison should compare ordinals on a naive 6594 basis (this is what Java does and thus JPython too). */ 6595 6596/* speedy UTF-16 code point order comparison */ 6597/* gleaned from: */ 6598/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6599 6600static short utf16Fixup[32] = 6601{ 6602 0, 0, 0, 0, 0, 0, 0, 0, 6603 0, 0, 0, 0, 0, 0, 0, 0, 6604 0, 0, 0, 0, 0, 0, 0, 0, 6605 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6606}; 6607 6608static int 6609unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6610{ 6611 Py_ssize_t len1, len2; 6612 6613 Py_UNICODE *s1 = str1->str; 6614 Py_UNICODE *s2 = str2->str; 6615 6616 len1 = str1->length; 6617 len2 = str2->length; 6618 6619 while (len1 > 0 && len2 > 0) { 6620 Py_UNICODE c1, c2; 6621 6622 c1 = *s1++; 6623 c2 = *s2++; 6624 6625 if (c1 > (1<<11) * 26) 6626 c1 += utf16Fixup[c1>>11]; 6627 if (c2 > (1<<11) * 26) 6628 c2 += utf16Fixup[c2>>11]; 6629 /* now c1 and c2 are in UTF-32-compatible order */ 6630 6631 if (c1 != c2) 6632 return (c1 < c2) ? -1 : 1; 6633 6634 len1--; len2--; 6635 } 6636 6637 return (len1 < len2) ? -1 : (len1 != len2); 6638} 6639 6640#else 6641 6642static int 6643unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6644{ 6645 register Py_ssize_t len1, len2; 6646 6647 Py_UNICODE *s1 = str1->str; 6648 Py_UNICODE *s2 = str2->str; 6649 6650 len1 = str1->length; 6651 len2 = str2->length; 6652 6653 while (len1 > 0 && len2 > 0) { 6654 Py_UNICODE c1, c2; 6655 6656 c1 = *s1++; 6657 c2 = *s2++; 6658 6659 if (c1 != c2) 6660 return (c1 < c2) ? -1 : 1; 6661 6662 len1--; len2--; 6663 } 6664 6665 return (len1 < len2) ? -1 : (len1 != len2); 6666} 6667 6668#endif 6669 6670int PyUnicode_Compare(PyObject *left, 6671 PyObject *right) 6672{ 6673 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6674 return unicode_compare((PyUnicodeObject *)left, 6675 (PyUnicodeObject *)right); 6676 PyErr_Format(PyExc_TypeError, 6677 "Can't compare %.100s and %.100s", 6678 left->ob_type->tp_name, 6679 right->ob_type->tp_name); 6680 return -1; 6681} 6682 6683int 6684PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6685{ 6686 int i; 6687 Py_UNICODE *id; 6688 assert(PyUnicode_Check(uni)); 6689 id = PyUnicode_AS_UNICODE(uni); 6690 /* Compare Unicode string and source character set string */ 6691 for (i = 0; id[i] && str[i]; i++) 6692 if (id[i] != str[i]) 6693 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6694 if (id[i]) 6695 return 1; /* uni is longer */ 6696 if (str[i]) 6697 return -1; /* str is longer */ 6698 return 0; 6699} 6700 6701 6702#define TEST_COND(cond) \ 6703 ((cond) ? Py_True : Py_False) 6704 6705PyObject *PyUnicode_RichCompare(PyObject *left, 6706 PyObject *right, 6707 int op) 6708{ 6709 int result; 6710 6711 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 6712 PyObject *v; 6713 if (((PyUnicodeObject *) left)->length != 6714 ((PyUnicodeObject *) right)->length) { 6715 if (op == Py_EQ) { 6716 Py_INCREF(Py_False); 6717 return Py_False; 6718 } 6719 if (op == Py_NE) { 6720 Py_INCREF(Py_True); 6721 return Py_True; 6722 } 6723 } 6724 if (left == right) 6725 result = 0; 6726 else 6727 result = unicode_compare((PyUnicodeObject *)left, 6728 (PyUnicodeObject *)right); 6729 6730 /* Convert the return value to a Boolean */ 6731 switch (op) { 6732 case Py_EQ: 6733 v = TEST_COND(result == 0); 6734 break; 6735 case Py_NE: 6736 v = TEST_COND(result != 0); 6737 break; 6738 case Py_LE: 6739 v = TEST_COND(result <= 0); 6740 break; 6741 case Py_GE: 6742 v = TEST_COND(result >= 0); 6743 break; 6744 case Py_LT: 6745 v = TEST_COND(result == -1); 6746 break; 6747 case Py_GT: 6748 v = TEST_COND(result == 1); 6749 break; 6750 default: 6751 PyErr_BadArgument(); 6752 return NULL; 6753 } 6754 Py_INCREF(v); 6755 return v; 6756 } 6757 6758 Py_INCREF(Py_NotImplemented); 6759 return Py_NotImplemented; 6760} 6761 6762int PyUnicode_Contains(PyObject *container, 6763 PyObject *element) 6764{ 6765 PyObject *str, *sub; 6766 int result; 6767 6768 /* Coerce the two arguments */ 6769 sub = PyUnicode_FromObject(element); 6770 if (!sub) { 6771 PyErr_Format(PyExc_TypeError, 6772 "'in <string>' requires string as left operand, not %s", 6773 element->ob_type->tp_name); 6774 return -1; 6775 } 6776 6777 str = PyUnicode_FromObject(container); 6778 if (!str) { 6779 Py_DECREF(sub); 6780 return -1; 6781 } 6782 6783 result = stringlib_contains_obj(str, sub); 6784 6785 Py_DECREF(str); 6786 Py_DECREF(sub); 6787 6788 return result; 6789} 6790 6791/* Concat to string or Unicode object giving a new Unicode object. */ 6792 6793PyObject *PyUnicode_Concat(PyObject *left, 6794 PyObject *right) 6795{ 6796 PyUnicodeObject *u = NULL, *v = NULL, *w; 6797 6798 /* Coerce the two arguments */ 6799 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6800 if (u == NULL) 6801 goto onError; 6802 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6803 if (v == NULL) 6804 goto onError; 6805 6806 /* Shortcuts */ 6807 if (v == unicode_empty) { 6808 Py_DECREF(v); 6809 return (PyObject *)u; 6810 } 6811 if (u == unicode_empty) { 6812 Py_DECREF(u); 6813 return (PyObject *)v; 6814 } 6815 6816 /* Concat the two Unicode strings */ 6817 w = _PyUnicode_New(u->length + v->length); 6818 if (w == NULL) 6819 goto onError; 6820 Py_UNICODE_COPY(w->str, u->str, u->length); 6821 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6822 6823 Py_DECREF(u); 6824 Py_DECREF(v); 6825 return (PyObject *)w; 6826 6827 onError: 6828 Py_XDECREF(u); 6829 Py_XDECREF(v); 6830 return NULL; 6831} 6832 6833void 6834PyUnicode_Append(PyObject **pleft, PyObject *right) 6835{ 6836 PyObject *new; 6837 if (*pleft == NULL) 6838 return; 6839 if (right == NULL || !PyUnicode_Check(*pleft)) { 6840 Py_DECREF(*pleft); 6841 *pleft = NULL; 6842 return; 6843 } 6844 new = PyUnicode_Concat(*pleft, right); 6845 Py_DECREF(*pleft); 6846 *pleft = new; 6847} 6848 6849void 6850PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6851{ 6852 PyUnicode_Append(pleft, right); 6853 Py_XDECREF(right); 6854} 6855 6856PyDoc_STRVAR(count__doc__, 6857 "S.count(sub[, start[, end]]) -> int\n\ 6858\n\ 6859Return the number of non-overlapping occurrences of substring sub in\n\ 6860string S[start:end]. Optional arguments start and end are\n\ 6861interpreted as in slice notation."); 6862 6863static PyObject * 6864unicode_count(PyUnicodeObject *self, PyObject *args) 6865{ 6866 PyUnicodeObject *substring; 6867 Py_ssize_t start = 0; 6868 Py_ssize_t end = PY_SSIZE_T_MAX; 6869 PyObject *result; 6870 6871 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6872 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6873 return NULL; 6874 6875 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6876 (PyObject *)substring); 6877 if (substring == NULL) 6878 return NULL; 6879 6880 FIX_START_END(self); 6881 6882 result = PyLong_FromSsize_t( 6883 stringlib_count(self->str + start, end - start, 6884 substring->str, substring->length) 6885 ); 6886 6887 Py_DECREF(substring); 6888 6889 return result; 6890} 6891 6892PyDoc_STRVAR(encode__doc__, 6893 "S.encode([encoding[, errors]]) -> bytes\n\ 6894\n\ 6895Encode S using the codec registered for encoding. encoding defaults\n\ 6896to the default encoding. errors may be given to set a different error\n\ 6897handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6898a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 6899'xmlcharrefreplace' as well as any other name registered with\n\ 6900codecs.register_error that can handle UnicodeEncodeErrors."); 6901 6902static PyObject * 6903unicode_encode(PyUnicodeObject *self, PyObject *args) 6904{ 6905 char *encoding = NULL; 6906 char *errors = NULL; 6907 PyObject *v; 6908 6909 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6910 return NULL; 6911 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 6912 if (v == NULL) 6913 goto onError; 6914 if (!PyBytes_Check(v)) { 6915 PyErr_Format(PyExc_TypeError, 6916 "encoder did not return a bytes object " 6917 "(type=%.400s)", 6918 Py_TYPE(v)->tp_name); 6919 Py_DECREF(v); 6920 return NULL; 6921 } 6922 return v; 6923 6924 onError: 6925 return NULL; 6926} 6927 6928PyDoc_STRVAR(expandtabs__doc__, 6929 "S.expandtabs([tabsize]) -> str\n\ 6930\n\ 6931Return a copy of S where all tab characters are expanded using spaces.\n\ 6932If tabsize is not given, a tab size of 8 characters is assumed."); 6933 6934static PyObject* 6935unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6936{ 6937 Py_UNICODE *e; 6938 Py_UNICODE *p; 6939 Py_UNICODE *q; 6940 Py_UNICODE *qe; 6941 Py_ssize_t i, j, incr; 6942 PyUnicodeObject *u; 6943 int tabsize = 8; 6944 6945 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6946 return NULL; 6947 6948 /* First pass: determine size of output string */ 6949 i = 0; /* chars up to and including most recent \n or \r */ 6950 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6951 e = self->str + self->length; /* end of input */ 6952 for (p = self->str; p < e; p++) 6953 if (*p == '\t') { 6954 if (tabsize > 0) { 6955 incr = tabsize - (j % tabsize); /* cannot overflow */ 6956 if (j > PY_SSIZE_T_MAX - incr) 6957 goto overflow1; 6958 j += incr; 6959 } 6960 } 6961 else { 6962 if (j > PY_SSIZE_T_MAX - 1) 6963 goto overflow1; 6964 j++; 6965 if (*p == '\n' || *p == '\r') { 6966 if (i > PY_SSIZE_T_MAX - j) 6967 goto overflow1; 6968 i += j; 6969 j = 0; 6970 } 6971 } 6972 6973 if (i > PY_SSIZE_T_MAX - j) 6974 goto overflow1; 6975 6976 /* Second pass: create output string and fill it */ 6977 u = _PyUnicode_New(i + j); 6978 if (!u) 6979 return NULL; 6980 6981 j = 0; /* same as in first pass */ 6982 q = u->str; /* next output char */ 6983 qe = u->str + u->length; /* end of output */ 6984 6985 for (p = self->str; p < e; p++) 6986 if (*p == '\t') { 6987 if (tabsize > 0) { 6988 i = tabsize - (j % tabsize); 6989 j += i; 6990 while (i--) { 6991 if (q >= qe) 6992 goto overflow2; 6993 *q++ = ' '; 6994 } 6995 } 6996 } 6997 else { 6998 if (q >= qe) 6999 goto overflow2; 7000 *q++ = *p; 7001 j++; 7002 if (*p == '\n' || *p == '\r') 7003 j = 0; 7004 } 7005 7006 return (PyObject*) u; 7007 7008 overflow2: 7009 Py_DECREF(u); 7010 overflow1: 7011 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7012 return NULL; 7013} 7014 7015PyDoc_STRVAR(find__doc__, 7016 "S.find(sub[, start[, end]]) -> int\n\ 7017\n\ 7018Return the lowest index in S where substring sub is found,\n\ 7019such that sub is contained within s[start:end]. Optional\n\ 7020arguments start and end are interpreted as in slice notation.\n\ 7021\n\ 7022Return -1 on failure."); 7023 7024static PyObject * 7025unicode_find(PyUnicodeObject *self, PyObject *args) 7026{ 7027 PyObject *substring; 7028 Py_ssize_t start; 7029 Py_ssize_t end; 7030 Py_ssize_t result; 7031 7032 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7033 return NULL; 7034 7035 result = stringlib_find_slice( 7036 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7037 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7038 start, end 7039 ); 7040 7041 Py_DECREF(substring); 7042 7043 return PyLong_FromSsize_t(result); 7044} 7045 7046static PyObject * 7047unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7048{ 7049 if (index < 0 || index >= self->length) { 7050 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7051 return NULL; 7052 } 7053 7054 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7055} 7056 7057/* Believe it or not, this produces the same value for ASCII strings 7058 as string_hash(). */ 7059static long 7060unicode_hash(PyUnicodeObject *self) 7061{ 7062 Py_ssize_t len; 7063 Py_UNICODE *p; 7064 long x; 7065 7066 if (self->hash != -1) 7067 return self->hash; 7068 len = Py_SIZE(self); 7069 p = self->str; 7070 x = *p << 7; 7071 while (--len >= 0) 7072 x = (1000003*x) ^ *p++; 7073 x ^= Py_SIZE(self); 7074 if (x == -1) 7075 x = -2; 7076 self->hash = x; 7077 return x; 7078} 7079 7080PyDoc_STRVAR(index__doc__, 7081 "S.index(sub[, start[, end]]) -> int\n\ 7082\n\ 7083Like S.find() but raise ValueError when the substring is not found."); 7084 7085static PyObject * 7086unicode_index(PyUnicodeObject *self, PyObject *args) 7087{ 7088 Py_ssize_t result; 7089 PyObject *substring; 7090 Py_ssize_t start; 7091 Py_ssize_t end; 7092 7093 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7094 return NULL; 7095 7096 result = stringlib_find_slice( 7097 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7098 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7099 start, end 7100 ); 7101 7102 Py_DECREF(substring); 7103 7104 if (result < 0) { 7105 PyErr_SetString(PyExc_ValueError, "substring not found"); 7106 return NULL; 7107 } 7108 7109 return PyLong_FromSsize_t(result); 7110} 7111 7112PyDoc_STRVAR(islower__doc__, 7113 "S.islower() -> bool\n\ 7114\n\ 7115Return True if all cased characters in S are lowercase and there is\n\ 7116at least one cased character in S, False otherwise."); 7117 7118static PyObject* 7119unicode_islower(PyUnicodeObject *self) 7120{ 7121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7122 register const Py_UNICODE *e; 7123 int cased; 7124 7125 /* Shortcut for single character strings */ 7126 if (PyUnicode_GET_SIZE(self) == 1) 7127 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7128 7129 /* Special case for empty strings */ 7130 if (PyUnicode_GET_SIZE(self) == 0) 7131 return PyBool_FromLong(0); 7132 7133 e = p + PyUnicode_GET_SIZE(self); 7134 cased = 0; 7135 for (; p < e; p++) { 7136 register const Py_UNICODE ch = *p; 7137 7138 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7139 return PyBool_FromLong(0); 7140 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7141 cased = 1; 7142 } 7143 return PyBool_FromLong(cased); 7144} 7145 7146PyDoc_STRVAR(isupper__doc__, 7147 "S.isupper() -> bool\n\ 7148\n\ 7149Return True if all cased characters in S are uppercase and there is\n\ 7150at least one cased character in S, False otherwise."); 7151 7152static PyObject* 7153unicode_isupper(PyUnicodeObject *self) 7154{ 7155 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7156 register const Py_UNICODE *e; 7157 int cased; 7158 7159 /* Shortcut for single character strings */ 7160 if (PyUnicode_GET_SIZE(self) == 1) 7161 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7162 7163 /* Special case for empty strings */ 7164 if (PyUnicode_GET_SIZE(self) == 0) 7165 return PyBool_FromLong(0); 7166 7167 e = p + PyUnicode_GET_SIZE(self); 7168 cased = 0; 7169 for (; p < e; p++) { 7170 register const Py_UNICODE ch = *p; 7171 7172 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7173 return PyBool_FromLong(0); 7174 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7175 cased = 1; 7176 } 7177 return PyBool_FromLong(cased); 7178} 7179 7180PyDoc_STRVAR(istitle__doc__, 7181 "S.istitle() -> bool\n\ 7182\n\ 7183Return True if S is a titlecased string and there is at least one\n\ 7184character in S, i.e. upper- and titlecase characters may only\n\ 7185follow uncased characters and lowercase characters only cased ones.\n\ 7186Return False otherwise."); 7187 7188static PyObject* 7189unicode_istitle(PyUnicodeObject *self) 7190{ 7191 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7192 register const Py_UNICODE *e; 7193 int cased, previous_is_cased; 7194 7195 /* Shortcut for single character strings */ 7196 if (PyUnicode_GET_SIZE(self) == 1) 7197 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7198 (Py_UNICODE_ISUPPER(*p) != 0)); 7199 7200 /* Special case for empty strings */ 7201 if (PyUnicode_GET_SIZE(self) == 0) 7202 return PyBool_FromLong(0); 7203 7204 e = p + PyUnicode_GET_SIZE(self); 7205 cased = 0; 7206 previous_is_cased = 0; 7207 for (; p < e; p++) { 7208 register const Py_UNICODE ch = *p; 7209 7210 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7211 if (previous_is_cased) 7212 return PyBool_FromLong(0); 7213 previous_is_cased = 1; 7214 cased = 1; 7215 } 7216 else if (Py_UNICODE_ISLOWER(ch)) { 7217 if (!previous_is_cased) 7218 return PyBool_FromLong(0); 7219 previous_is_cased = 1; 7220 cased = 1; 7221 } 7222 else 7223 previous_is_cased = 0; 7224 } 7225 return PyBool_FromLong(cased); 7226} 7227 7228PyDoc_STRVAR(isspace__doc__, 7229 "S.isspace() -> bool\n\ 7230\n\ 7231Return True if all characters in S are whitespace\n\ 7232and there is at least one character in S, False otherwise."); 7233 7234static PyObject* 7235unicode_isspace(PyUnicodeObject *self) 7236{ 7237 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7238 register const Py_UNICODE *e; 7239 7240 /* Shortcut for single character strings */ 7241 if (PyUnicode_GET_SIZE(self) == 1 && 7242 Py_UNICODE_ISSPACE(*p)) 7243 return PyBool_FromLong(1); 7244 7245 /* Special case for empty strings */ 7246 if (PyUnicode_GET_SIZE(self) == 0) 7247 return PyBool_FromLong(0); 7248 7249 e = p + PyUnicode_GET_SIZE(self); 7250 for (; p < e; p++) { 7251 if (!Py_UNICODE_ISSPACE(*p)) 7252 return PyBool_FromLong(0); 7253 } 7254 return PyBool_FromLong(1); 7255} 7256 7257PyDoc_STRVAR(isalpha__doc__, 7258 "S.isalpha() -> bool\n\ 7259\n\ 7260Return True if all characters in S are alphabetic\n\ 7261and there is at least one character in S, False otherwise."); 7262 7263static PyObject* 7264unicode_isalpha(PyUnicodeObject *self) 7265{ 7266 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7267 register const Py_UNICODE *e; 7268 7269 /* Shortcut for single character strings */ 7270 if (PyUnicode_GET_SIZE(self) == 1 && 7271 Py_UNICODE_ISALPHA(*p)) 7272 return PyBool_FromLong(1); 7273 7274 /* Special case for empty strings */ 7275 if (PyUnicode_GET_SIZE(self) == 0) 7276 return PyBool_FromLong(0); 7277 7278 e = p + PyUnicode_GET_SIZE(self); 7279 for (; p < e; p++) { 7280 if (!Py_UNICODE_ISALPHA(*p)) 7281 return PyBool_FromLong(0); 7282 } 7283 return PyBool_FromLong(1); 7284} 7285 7286PyDoc_STRVAR(isalnum__doc__, 7287 "S.isalnum() -> bool\n\ 7288\n\ 7289Return True if all characters in S are alphanumeric\n\ 7290and there is at least one character in S, False otherwise."); 7291 7292static PyObject* 7293unicode_isalnum(PyUnicodeObject *self) 7294{ 7295 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7296 register const Py_UNICODE *e; 7297 7298 /* Shortcut for single character strings */ 7299 if (PyUnicode_GET_SIZE(self) == 1 && 7300 Py_UNICODE_ISALNUM(*p)) 7301 return PyBool_FromLong(1); 7302 7303 /* Special case for empty strings */ 7304 if (PyUnicode_GET_SIZE(self) == 0) 7305 return PyBool_FromLong(0); 7306 7307 e = p + PyUnicode_GET_SIZE(self); 7308 for (; p < e; p++) { 7309 if (!Py_UNICODE_ISALNUM(*p)) 7310 return PyBool_FromLong(0); 7311 } 7312 return PyBool_FromLong(1); 7313} 7314 7315PyDoc_STRVAR(isdecimal__doc__, 7316 "S.isdecimal() -> bool\n\ 7317\n\ 7318Return True if there are only decimal characters in S,\n\ 7319False otherwise."); 7320 7321static PyObject* 7322unicode_isdecimal(PyUnicodeObject *self) 7323{ 7324 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7325 register const Py_UNICODE *e; 7326 7327 /* Shortcut for single character strings */ 7328 if (PyUnicode_GET_SIZE(self) == 1 && 7329 Py_UNICODE_ISDECIMAL(*p)) 7330 return PyBool_FromLong(1); 7331 7332 /* Special case for empty strings */ 7333 if (PyUnicode_GET_SIZE(self) == 0) 7334 return PyBool_FromLong(0); 7335 7336 e = p + PyUnicode_GET_SIZE(self); 7337 for (; p < e; p++) { 7338 if (!Py_UNICODE_ISDECIMAL(*p)) 7339 return PyBool_FromLong(0); 7340 } 7341 return PyBool_FromLong(1); 7342} 7343 7344PyDoc_STRVAR(isdigit__doc__, 7345 "S.isdigit() -> bool\n\ 7346\n\ 7347Return True if all characters in S are digits\n\ 7348and there is at least one character in S, False otherwise."); 7349 7350static PyObject* 7351unicode_isdigit(PyUnicodeObject *self) 7352{ 7353 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7354 register const Py_UNICODE *e; 7355 7356 /* Shortcut for single character strings */ 7357 if (PyUnicode_GET_SIZE(self) == 1 && 7358 Py_UNICODE_ISDIGIT(*p)) 7359 return PyBool_FromLong(1); 7360 7361 /* Special case for empty strings */ 7362 if (PyUnicode_GET_SIZE(self) == 0) 7363 return PyBool_FromLong(0); 7364 7365 e = p + PyUnicode_GET_SIZE(self); 7366 for (; p < e; p++) { 7367 if (!Py_UNICODE_ISDIGIT(*p)) 7368 return PyBool_FromLong(0); 7369 } 7370 return PyBool_FromLong(1); 7371} 7372 7373PyDoc_STRVAR(isnumeric__doc__, 7374 "S.isnumeric() -> bool\n\ 7375\n\ 7376Return True if there are only numeric characters in S,\n\ 7377False otherwise."); 7378 7379static PyObject* 7380unicode_isnumeric(PyUnicodeObject *self) 7381{ 7382 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7383 register const Py_UNICODE *e; 7384 7385 /* Shortcut for single character strings */ 7386 if (PyUnicode_GET_SIZE(self) == 1 && 7387 Py_UNICODE_ISNUMERIC(*p)) 7388 return PyBool_FromLong(1); 7389 7390 /* Special case for empty strings */ 7391 if (PyUnicode_GET_SIZE(self) == 0) 7392 return PyBool_FromLong(0); 7393 7394 e = p + PyUnicode_GET_SIZE(self); 7395 for (; p < e; p++) { 7396 if (!Py_UNICODE_ISNUMERIC(*p)) 7397 return PyBool_FromLong(0); 7398 } 7399 return PyBool_FromLong(1); 7400} 7401 7402int 7403PyUnicode_IsIdentifier(PyObject *self) 7404{ 7405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7406 register const Py_UNICODE *e; 7407 7408 /* Special case for empty strings */ 7409 if (PyUnicode_GET_SIZE(self) == 0) 7410 return 0; 7411 7412 /* PEP 3131 says that the first character must be in 7413 XID_Start and subsequent characters in XID_Continue, 7414 and for the ASCII range, the 2.x rules apply (i.e 7415 start with letters and underscore, continue with 7416 letters, digits, underscore). However, given the current 7417 definition of XID_Start and XID_Continue, it is sufficient 7418 to check just for these, except that _ must be allowed 7419 as starting an identifier. */ 7420 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7421 return 0; 7422 7423 e = p + PyUnicode_GET_SIZE(self); 7424 for (p++; p < e; p++) { 7425 if (!_PyUnicode_IsXidContinue(*p)) 7426 return 0; 7427 } 7428 return 1; 7429} 7430 7431PyDoc_STRVAR(isidentifier__doc__, 7432 "S.isidentifier() -> bool\n\ 7433\n\ 7434Return True if S is a valid identifier according\n\ 7435to the language definition."); 7436 7437static PyObject* 7438unicode_isidentifier(PyObject *self) 7439{ 7440 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7441} 7442 7443PyDoc_STRVAR(isprintable__doc__, 7444 "S.isprintable() -> bool\n\ 7445\n\ 7446Return True if all characters in S are considered\n\ 7447printable in repr() or S is empty, False otherwise."); 7448 7449static PyObject* 7450unicode_isprintable(PyObject *self) 7451{ 7452 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7453 register const Py_UNICODE *e; 7454 7455 /* Shortcut for single character strings */ 7456 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 7457 Py_RETURN_TRUE; 7458 } 7459 7460 e = p + PyUnicode_GET_SIZE(self); 7461 for (; p < e; p++) { 7462 if (!Py_UNICODE_ISPRINTABLE(*p)) { 7463 Py_RETURN_FALSE; 7464 } 7465 } 7466 Py_RETURN_TRUE; 7467} 7468 7469PyDoc_STRVAR(join__doc__, 7470 "S.join(sequence) -> str\n\ 7471\n\ 7472Return a string which is the concatenation of the strings in the\n\ 7473sequence. The separator between elements is S."); 7474 7475static PyObject* 7476unicode_join(PyObject *self, PyObject *data) 7477{ 7478 return PyUnicode_Join(self, data); 7479} 7480 7481static Py_ssize_t 7482unicode_length(PyUnicodeObject *self) 7483{ 7484 return self->length; 7485} 7486 7487PyDoc_STRVAR(ljust__doc__, 7488 "S.ljust(width[, fillchar]) -> str\n\ 7489\n\ 7490Return S left-justified in a Unicode string of length width. Padding is\n\ 7491done using the specified fill character (default is a space)."); 7492 7493static PyObject * 7494unicode_ljust(PyUnicodeObject *self, PyObject *args) 7495{ 7496 Py_ssize_t width; 7497 Py_UNICODE fillchar = ' '; 7498 7499 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7500 return NULL; 7501 7502 if (self->length >= width && PyUnicode_CheckExact(self)) { 7503 Py_INCREF(self); 7504 return (PyObject*) self; 7505 } 7506 7507 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7508} 7509 7510PyDoc_STRVAR(lower__doc__, 7511 "S.lower() -> str\n\ 7512\n\ 7513Return a copy of the string S converted to lowercase."); 7514 7515static PyObject* 7516unicode_lower(PyUnicodeObject *self) 7517{ 7518 return fixup(self, fixlower); 7519} 7520 7521#define LEFTSTRIP 0 7522#define RIGHTSTRIP 1 7523#define BOTHSTRIP 2 7524 7525/* Arrays indexed by above */ 7526static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7527 7528#define STRIPNAME(i) (stripformat[i]+3) 7529 7530/* externally visible for str.strip(unicode) */ 7531PyObject * 7532_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7533{ 7534 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7535 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7536 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7537 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7538 Py_ssize_t i, j; 7539 7540 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7541 7542 i = 0; 7543 if (striptype != RIGHTSTRIP) { 7544 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7545 i++; 7546 } 7547 } 7548 7549 j = len; 7550 if (striptype != LEFTSTRIP) { 7551 do { 7552 j--; 7553 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7554 j++; 7555 } 7556 7557 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7558 Py_INCREF(self); 7559 return (PyObject*)self; 7560 } 7561 else 7562 return PyUnicode_FromUnicode(s+i, j-i); 7563} 7564 7565 7566static PyObject * 7567do_strip(PyUnicodeObject *self, int striptype) 7568{ 7569 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7570 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7571 7572 i = 0; 7573 if (striptype != RIGHTSTRIP) { 7574 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7575 i++; 7576 } 7577 } 7578 7579 j = len; 7580 if (striptype != LEFTSTRIP) { 7581 do { 7582 j--; 7583 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7584 j++; 7585 } 7586 7587 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7588 Py_INCREF(self); 7589 return (PyObject*)self; 7590 } 7591 else 7592 return PyUnicode_FromUnicode(s+i, j-i); 7593} 7594 7595 7596static PyObject * 7597do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7598{ 7599 PyObject *sep = NULL; 7600 7601 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7602 return NULL; 7603 7604 if (sep != NULL && sep != Py_None) { 7605 if (PyUnicode_Check(sep)) 7606 return _PyUnicode_XStrip(self, striptype, sep); 7607 else { 7608 PyErr_Format(PyExc_TypeError, 7609 "%s arg must be None or str", 7610 STRIPNAME(striptype)); 7611 return NULL; 7612 } 7613 } 7614 7615 return do_strip(self, striptype); 7616} 7617 7618 7619PyDoc_STRVAR(strip__doc__, 7620 "S.strip([chars]) -> str\n\ 7621\n\ 7622Return a copy of the string S with leading and trailing\n\ 7623whitespace removed.\n\ 7624If chars is given and not None, remove characters in chars instead."); 7625 7626static PyObject * 7627unicode_strip(PyUnicodeObject *self, PyObject *args) 7628{ 7629 if (PyTuple_GET_SIZE(args) == 0) 7630 return do_strip(self, BOTHSTRIP); /* Common case */ 7631 else 7632 return do_argstrip(self, BOTHSTRIP, args); 7633} 7634 7635 7636PyDoc_STRVAR(lstrip__doc__, 7637 "S.lstrip([chars]) -> str\n\ 7638\n\ 7639Return a copy of the string S with leading whitespace removed.\n\ 7640If chars is given and not None, remove characters in chars instead."); 7641 7642static PyObject * 7643unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7644{ 7645 if (PyTuple_GET_SIZE(args) == 0) 7646 return do_strip(self, LEFTSTRIP); /* Common case */ 7647 else 7648 return do_argstrip(self, LEFTSTRIP, args); 7649} 7650 7651 7652PyDoc_STRVAR(rstrip__doc__, 7653 "S.rstrip([chars]) -> str\n\ 7654\n\ 7655Return a copy of the string S with trailing whitespace removed.\n\ 7656If chars is given and not None, remove characters in chars instead."); 7657 7658static PyObject * 7659unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7660{ 7661 if (PyTuple_GET_SIZE(args) == 0) 7662 return do_strip(self, RIGHTSTRIP); /* Common case */ 7663 else 7664 return do_argstrip(self, RIGHTSTRIP, args); 7665} 7666 7667 7668static PyObject* 7669unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7670{ 7671 PyUnicodeObject *u; 7672 Py_UNICODE *p; 7673 Py_ssize_t nchars; 7674 size_t nbytes; 7675 7676 if (len < 0) 7677 len = 0; 7678 7679 if (len == 1 && PyUnicode_CheckExact(str)) { 7680 /* no repeat, return original string */ 7681 Py_INCREF(str); 7682 return (PyObject*) str; 7683 } 7684 7685 /* ensure # of chars needed doesn't overflow int and # of bytes 7686 * needed doesn't overflow size_t 7687 */ 7688 nchars = len * str->length; 7689 if (len && nchars / len != str->length) { 7690 PyErr_SetString(PyExc_OverflowError, 7691 "repeated string is too long"); 7692 return NULL; 7693 } 7694 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7695 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7696 PyErr_SetString(PyExc_OverflowError, 7697 "repeated string is too long"); 7698 return NULL; 7699 } 7700 u = _PyUnicode_New(nchars); 7701 if (!u) 7702 return NULL; 7703 7704 p = u->str; 7705 7706 if (str->length == 1 && len > 0) { 7707 Py_UNICODE_FILL(p, str->str[0], len); 7708 } else { 7709 Py_ssize_t done = 0; /* number of characters copied this far */ 7710 if (done < nchars) { 7711 Py_UNICODE_COPY(p, str->str, str->length); 7712 done = str->length; 7713 } 7714 while (done < nchars) { 7715 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 7716 Py_UNICODE_COPY(p+done, p, n); 7717 done += n; 7718 } 7719 } 7720 7721 return (PyObject*) u; 7722} 7723 7724PyObject *PyUnicode_Replace(PyObject *obj, 7725 PyObject *subobj, 7726 PyObject *replobj, 7727 Py_ssize_t maxcount) 7728{ 7729 PyObject *self; 7730 PyObject *str1; 7731 PyObject *str2; 7732 PyObject *result; 7733 7734 self = PyUnicode_FromObject(obj); 7735 if (self == NULL) 7736 return NULL; 7737 str1 = PyUnicode_FromObject(subobj); 7738 if (str1 == NULL) { 7739 Py_DECREF(self); 7740 return NULL; 7741 } 7742 str2 = PyUnicode_FromObject(replobj); 7743 if (str2 == NULL) { 7744 Py_DECREF(self); 7745 Py_DECREF(str1); 7746 return NULL; 7747 } 7748 result = replace((PyUnicodeObject *)self, 7749 (PyUnicodeObject *)str1, 7750 (PyUnicodeObject *)str2, 7751 maxcount); 7752 Py_DECREF(self); 7753 Py_DECREF(str1); 7754 Py_DECREF(str2); 7755 return result; 7756} 7757 7758PyDoc_STRVAR(replace__doc__, 7759 "S.replace (old, new[, count]) -> str\n\ 7760\n\ 7761Return a copy of S with all occurrences of substring\n\ 7762old replaced by new. If the optional argument count is\n\ 7763given, only the first count occurrences are replaced."); 7764 7765static PyObject* 7766unicode_replace(PyUnicodeObject *self, PyObject *args) 7767{ 7768 PyUnicodeObject *str1; 7769 PyUnicodeObject *str2; 7770 Py_ssize_t maxcount = -1; 7771 PyObject *result; 7772 7773 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7774 return NULL; 7775 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7776 if (str1 == NULL) 7777 return NULL; 7778 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7779 if (str2 == NULL) { 7780 Py_DECREF(str1); 7781 return NULL; 7782 } 7783 7784 result = replace(self, str1, str2, maxcount); 7785 7786 Py_DECREF(str1); 7787 Py_DECREF(str2); 7788 return result; 7789} 7790 7791static 7792PyObject *unicode_repr(PyObject *unicode) 7793{ 7794 PyObject *repr; 7795 Py_UNICODE *p; 7796 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7797 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7798 7799 /* XXX(nnorwitz): rather than over-allocating, it would be 7800 better to choose a different scheme. Perhaps scan the 7801 first N-chars of the string and allocate based on that size. 7802 */ 7803 /* Initial allocation is based on the longest-possible unichr 7804 escape. 7805 7806 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7807 unichr, so in this case it's the longest unichr escape. In 7808 narrow (UTF-16) builds this is five chars per source unichr 7809 since there are two unichrs in the surrogate pair, so in narrow 7810 (UTF-16) builds it's not the longest unichr escape. 7811 7812 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7813 so in the narrow (UTF-16) build case it's the longest unichr 7814 escape. 7815 */ 7816 7817 repr = PyUnicode_FromUnicode(NULL, 7818 2 /* quotes */ 7819#ifdef Py_UNICODE_WIDE 7820 + 10*size 7821#else 7822 + 6*size 7823#endif 7824 + 1); 7825 if (repr == NULL) 7826 return NULL; 7827 7828 p = PyUnicode_AS_UNICODE(repr); 7829 7830 /* Add quote */ 7831 *p++ = (findchar(s, size, '\'') && 7832 !findchar(s, size, '"')) ? '"' : '\''; 7833 while (size-- > 0) { 7834 Py_UNICODE ch = *s++; 7835 7836 /* Escape quotes and backslashes */ 7837 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7838 *p++ = '\\'; 7839 *p++ = ch; 7840 continue; 7841 } 7842 7843 /* Map special whitespace to '\t', \n', '\r' */ 7844 if (ch == '\t') { 7845 *p++ = '\\'; 7846 *p++ = 't'; 7847 } 7848 else if (ch == '\n') { 7849 *p++ = '\\'; 7850 *p++ = 'n'; 7851 } 7852 else if (ch == '\r') { 7853 *p++ = '\\'; 7854 *p++ = 'r'; 7855 } 7856 7857 /* Map non-printable US ASCII to '\xhh' */ 7858 else if (ch < ' ' || ch == 0x7F) { 7859 *p++ = '\\'; 7860 *p++ = 'x'; 7861 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7862 *p++ = hexdigits[ch & 0x000F]; 7863 } 7864 7865 /* Copy ASCII characters as-is */ 7866 else if (ch < 0x7F) { 7867 *p++ = ch; 7868 } 7869 7870 /* Non-ASCII characters */ 7871 else { 7872 Py_UCS4 ucs = ch; 7873 7874#ifndef Py_UNICODE_WIDE 7875 Py_UNICODE ch2 = 0; 7876 /* Get code point from surrogate pair */ 7877 if (size > 0) { 7878 ch2 = *s; 7879 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 7880 && ch2 <= 0xDFFF) { 7881 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 7882 + 0x00010000; 7883 s++; 7884 size--; 7885 } 7886 } 7887#endif 7888 /* Map Unicode whitespace and control characters 7889 (categories Z* and C* except ASCII space) 7890 */ 7891 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 7892 /* Map 8-bit characters to '\xhh' */ 7893 if (ucs <= 0xff) { 7894 *p++ = '\\'; 7895 *p++ = 'x'; 7896 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7897 *p++ = hexdigits[ch & 0x000F]; 7898 } 7899 /* Map 21-bit characters to '\U00xxxxxx' */ 7900 else if (ucs >= 0x10000) { 7901 *p++ = '\\'; 7902 *p++ = 'U'; 7903 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7904 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7905 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7906 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7907 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7908 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7909 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7910 *p++ = hexdigits[ucs & 0x0000000F]; 7911 } 7912 /* Map 16-bit characters to '\uxxxx' */ 7913 else { 7914 *p++ = '\\'; 7915 *p++ = 'u'; 7916 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 7917 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 7918 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 7919 *p++ = hexdigits[ucs & 0x000F]; 7920 } 7921 } 7922 /* Copy characters as-is */ 7923 else { 7924 *p++ = ch; 7925#ifndef Py_UNICODE_WIDE 7926 if (ucs >= 0x10000) 7927 *p++ = ch2; 7928#endif 7929 } 7930 } 7931 } 7932 /* Add quote */ 7933 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7934 7935 *p = '\0'; 7936 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7937 return repr; 7938} 7939 7940PyDoc_STRVAR(rfind__doc__, 7941 "S.rfind(sub[, start[, end]]) -> int\n\ 7942\n\ 7943Return the highest index in S where substring sub is found,\n\ 7944such that sub is contained within s[start:end]. Optional\n\ 7945arguments start and end are interpreted as in slice notation.\n\ 7946\n\ 7947Return -1 on failure."); 7948 7949static PyObject * 7950unicode_rfind(PyUnicodeObject *self, PyObject *args) 7951{ 7952 PyObject *substring; 7953 Py_ssize_t start; 7954 Py_ssize_t end; 7955 Py_ssize_t result; 7956 7957 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7958 return NULL; 7959 7960 result = stringlib_rfind_slice( 7961 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7962 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7963 start, end 7964 ); 7965 7966 Py_DECREF(substring); 7967 7968 return PyLong_FromSsize_t(result); 7969} 7970 7971PyDoc_STRVAR(rindex__doc__, 7972 "S.rindex(sub[, start[, end]]) -> int\n\ 7973\n\ 7974Like S.rfind() but raise ValueError when the substring is not found."); 7975 7976static PyObject * 7977unicode_rindex(PyUnicodeObject *self, PyObject *args) 7978{ 7979 PyObject *substring; 7980 Py_ssize_t start; 7981 Py_ssize_t end; 7982 Py_ssize_t result; 7983 7984 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7985 return NULL; 7986 7987 result = stringlib_rfind_slice( 7988 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7989 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7990 start, end 7991 ); 7992 7993 Py_DECREF(substring); 7994 7995 if (result < 0) { 7996 PyErr_SetString(PyExc_ValueError, "substring not found"); 7997 return NULL; 7998 } 7999 return PyLong_FromSsize_t(result); 8000} 8001 8002PyDoc_STRVAR(rjust__doc__, 8003 "S.rjust(width[, fillchar]) -> str\n\ 8004\n\ 8005Return S right-justified in a string of length width. Padding is\n\ 8006done using the specified fill character (default is a space)."); 8007 8008static PyObject * 8009unicode_rjust(PyUnicodeObject *self, PyObject *args) 8010{ 8011 Py_ssize_t width; 8012 Py_UNICODE fillchar = ' '; 8013 8014 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8015 return NULL; 8016 8017 if (self->length >= width && PyUnicode_CheckExact(self)) { 8018 Py_INCREF(self); 8019 return (PyObject*) self; 8020 } 8021 8022 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8023} 8024 8025PyObject *PyUnicode_Split(PyObject *s, 8026 PyObject *sep, 8027 Py_ssize_t maxsplit) 8028{ 8029 PyObject *result; 8030 8031 s = PyUnicode_FromObject(s); 8032 if (s == NULL) 8033 return NULL; 8034 if (sep != NULL) { 8035 sep = PyUnicode_FromObject(sep); 8036 if (sep == NULL) { 8037 Py_DECREF(s); 8038 return NULL; 8039 } 8040 } 8041 8042 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8043 8044 Py_DECREF(s); 8045 Py_XDECREF(sep); 8046 return result; 8047} 8048 8049PyDoc_STRVAR(split__doc__, 8050 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8051\n\ 8052Return a list of the words in S, using sep as the\n\ 8053delimiter string. If maxsplit is given, at most maxsplit\n\ 8054splits are done. If sep is not specified or is None, any\n\ 8055whitespace string is a separator and empty strings are\n\ 8056removed from the result."); 8057 8058static PyObject* 8059unicode_split(PyUnicodeObject *self, PyObject *args) 8060{ 8061 PyObject *substring = Py_None; 8062 Py_ssize_t maxcount = -1; 8063 8064 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8065 return NULL; 8066 8067 if (substring == Py_None) 8068 return split(self, NULL, maxcount); 8069 else if (PyUnicode_Check(substring)) 8070 return split(self, (PyUnicodeObject *)substring, maxcount); 8071 else 8072 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8073} 8074 8075PyObject * 8076PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8077{ 8078 PyObject* str_obj; 8079 PyObject* sep_obj; 8080 PyObject* out; 8081 8082 str_obj = PyUnicode_FromObject(str_in); 8083 if (!str_obj) 8084 return NULL; 8085 sep_obj = PyUnicode_FromObject(sep_in); 8086 if (!sep_obj) { 8087 Py_DECREF(str_obj); 8088 return NULL; 8089 } 8090 8091 out = stringlib_partition( 8092 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8093 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8094 ); 8095 8096 Py_DECREF(sep_obj); 8097 Py_DECREF(str_obj); 8098 8099 return out; 8100} 8101 8102 8103PyObject * 8104PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8105{ 8106 PyObject* str_obj; 8107 PyObject* sep_obj; 8108 PyObject* out; 8109 8110 str_obj = PyUnicode_FromObject(str_in); 8111 if (!str_obj) 8112 return NULL; 8113 sep_obj = PyUnicode_FromObject(sep_in); 8114 if (!sep_obj) { 8115 Py_DECREF(str_obj); 8116 return NULL; 8117 } 8118 8119 out = stringlib_rpartition( 8120 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8121 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8122 ); 8123 8124 Py_DECREF(sep_obj); 8125 Py_DECREF(str_obj); 8126 8127 return out; 8128} 8129 8130PyDoc_STRVAR(partition__doc__, 8131 "S.partition(sep) -> (head, sep, tail)\n\ 8132\n\ 8133Search for the separator sep in S, and return the part before it,\n\ 8134the separator itself, and the part after it. If the separator is not\n\ 8135found, return S and two empty strings."); 8136 8137static PyObject* 8138unicode_partition(PyUnicodeObject *self, PyObject *separator) 8139{ 8140 return PyUnicode_Partition((PyObject *)self, separator); 8141} 8142 8143PyDoc_STRVAR(rpartition__doc__, 8144 "S.rpartition(sep) -> (tail, sep, head)\n\ 8145\n\ 8146Search for the separator sep in S, starting at the end of S, and return\n\ 8147the part before it, the separator itself, and the part after it. If the\n\ 8148separator is not found, return two empty strings and S."); 8149 8150static PyObject* 8151unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8152{ 8153 return PyUnicode_RPartition((PyObject *)self, separator); 8154} 8155 8156PyObject *PyUnicode_RSplit(PyObject *s, 8157 PyObject *sep, 8158 Py_ssize_t maxsplit) 8159{ 8160 PyObject *result; 8161 8162 s = PyUnicode_FromObject(s); 8163 if (s == NULL) 8164 return NULL; 8165 if (sep != NULL) { 8166 sep = PyUnicode_FromObject(sep); 8167 if (sep == NULL) { 8168 Py_DECREF(s); 8169 return NULL; 8170 } 8171 } 8172 8173 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8174 8175 Py_DECREF(s); 8176 Py_XDECREF(sep); 8177 return result; 8178} 8179 8180PyDoc_STRVAR(rsplit__doc__, 8181 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8182\n\ 8183Return a list of the words in S, using sep as the\n\ 8184delimiter string, starting at the end of the string and\n\ 8185working to the front. If maxsplit is given, at most maxsplit\n\ 8186splits are done. If sep is not specified, any whitespace string\n\ 8187is a separator."); 8188 8189static PyObject* 8190unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8191{ 8192 PyObject *substring = Py_None; 8193 Py_ssize_t maxcount = -1; 8194 8195 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8196 return NULL; 8197 8198 if (substring == Py_None) 8199 return rsplit(self, NULL, maxcount); 8200 else if (PyUnicode_Check(substring)) 8201 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8202 else 8203 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8204} 8205 8206PyDoc_STRVAR(splitlines__doc__, 8207 "S.splitlines([keepends]) -> list of strings\n\ 8208\n\ 8209Return a list of the lines in S, breaking at line boundaries.\n\ 8210Line breaks are not included in the resulting list unless keepends\n\ 8211is given and true."); 8212 8213static PyObject* 8214unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8215{ 8216 int keepends = 0; 8217 8218 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8219 return NULL; 8220 8221 return PyUnicode_Splitlines((PyObject *)self, keepends); 8222} 8223 8224static 8225PyObject *unicode_str(PyObject *self) 8226{ 8227 if (PyUnicode_CheckExact(self)) { 8228 Py_INCREF(self); 8229 return self; 8230 } else 8231 /* Subtype -- return genuine unicode string with the same value. */ 8232 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8233 PyUnicode_GET_SIZE(self)); 8234} 8235 8236PyDoc_STRVAR(swapcase__doc__, 8237 "S.swapcase() -> str\n\ 8238\n\ 8239Return a copy of S with uppercase characters converted to lowercase\n\ 8240and vice versa."); 8241 8242static PyObject* 8243unicode_swapcase(PyUnicodeObject *self) 8244{ 8245 return fixup(self, fixswapcase); 8246} 8247 8248PyDoc_STRVAR(maketrans__doc__, 8249 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8250\n\ 8251Return a translation table usable for str.translate().\n\ 8252If there is only one argument, it must be a dictionary mapping Unicode\n\ 8253ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8254Character keys will be then converted to ordinals.\n\ 8255If there are two arguments, they must be strings of equal length, and\n\ 8256in the resulting dictionary, each character in x will be mapped to the\n\ 8257character at the same position in y. If there is a third argument, it\n\ 8258must be a string, whose characters will be mapped to None in the result."); 8259 8260static PyObject* 8261unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8262{ 8263 PyObject *x, *y = NULL, *z = NULL; 8264 PyObject *new = NULL, *key, *value; 8265 Py_ssize_t i = 0; 8266 int res; 8267 8268 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8269 return NULL; 8270 new = PyDict_New(); 8271 if (!new) 8272 return NULL; 8273 if (y != NULL) { 8274 /* x must be a string too, of equal length */ 8275 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8276 if (!PyUnicode_Check(x)) { 8277 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8278 "be a string if there is a second argument"); 8279 goto err; 8280 } 8281 if (PyUnicode_GET_SIZE(x) != ylen) { 8282 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8283 "arguments must have equal length"); 8284 goto err; 8285 } 8286 /* create entries for translating chars in x to those in y */ 8287 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) { 8288 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8289 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8290 if (!key || !value) 8291 goto err; 8292 res = PyDict_SetItem(new, key, value); 8293 Py_DECREF(key); 8294 Py_DECREF(value); 8295 if (res < 0) 8296 goto err; 8297 } 8298 /* create entries for deleting chars in z */ 8299 if (z != NULL) { 8300 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8301 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8302 if (!key) 8303 goto err; 8304 res = PyDict_SetItem(new, key, Py_None); 8305 Py_DECREF(key); 8306 if (res < 0) 8307 goto err; 8308 } 8309 } 8310 } else { 8311 /* x must be a dict */ 8312 if (!PyDict_Check(x)) { 8313 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8314 "to maketrans it must be a dict"); 8315 goto err; 8316 } 8317 /* copy entries into the new dict, converting string keys to int keys */ 8318 while (PyDict_Next(x, &i, &key, &value)) { 8319 if (PyUnicode_Check(key)) { 8320 /* convert string keys to integer keys */ 8321 PyObject *newkey; 8322 if (PyUnicode_GET_SIZE(key) != 1) { 8323 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8324 "table must be of length 1"); 8325 goto err; 8326 } 8327 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8328 if (!newkey) 8329 goto err; 8330 res = PyDict_SetItem(new, newkey, value); 8331 Py_DECREF(newkey); 8332 if (res < 0) 8333 goto err; 8334 } else if (PyLong_Check(key)) { 8335 /* just keep integer keys */ 8336 if (PyDict_SetItem(new, key, value) < 0) 8337 goto err; 8338 } else { 8339 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8340 "be strings or integers"); 8341 goto err; 8342 } 8343 } 8344 } 8345 return new; 8346 err: 8347 Py_DECREF(new); 8348 return NULL; 8349} 8350 8351PyDoc_STRVAR(translate__doc__, 8352 "S.translate(table) -> str\n\ 8353\n\ 8354Return a copy of the string S, where all characters have been mapped\n\ 8355through the given translation table, which must be a mapping of\n\ 8356Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8357Unmapped characters are left untouched. Characters mapped to None\n\ 8358are deleted."); 8359 8360static PyObject* 8361unicode_translate(PyUnicodeObject *self, PyObject *table) 8362{ 8363 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8364} 8365 8366PyDoc_STRVAR(upper__doc__, 8367 "S.upper() -> str\n\ 8368\n\ 8369Return a copy of S converted to uppercase."); 8370 8371static PyObject* 8372unicode_upper(PyUnicodeObject *self) 8373{ 8374 return fixup(self, fixupper); 8375} 8376 8377PyDoc_STRVAR(zfill__doc__, 8378 "S.zfill(width) -> str\n\ 8379\n\ 8380Pad a numeric string S with zeros on the left, to fill a field\n\ 8381of the specified width. The string S is never truncated."); 8382 8383static PyObject * 8384unicode_zfill(PyUnicodeObject *self, PyObject *args) 8385{ 8386 Py_ssize_t fill; 8387 PyUnicodeObject *u; 8388 8389 Py_ssize_t width; 8390 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8391 return NULL; 8392 8393 if (self->length >= width) { 8394 if (PyUnicode_CheckExact(self)) { 8395 Py_INCREF(self); 8396 return (PyObject*) self; 8397 } 8398 else 8399 return PyUnicode_FromUnicode( 8400 PyUnicode_AS_UNICODE(self), 8401 PyUnicode_GET_SIZE(self) 8402 ); 8403 } 8404 8405 fill = width - self->length; 8406 8407 u = pad(self, fill, 0, '0'); 8408 8409 if (u == NULL) 8410 return NULL; 8411 8412 if (u->str[fill] == '+' || u->str[fill] == '-') { 8413 /* move sign to beginning of string */ 8414 u->str[0] = u->str[fill]; 8415 u->str[fill] = '0'; 8416 } 8417 8418 return (PyObject*) u; 8419} 8420 8421#if 0 8422static PyObject* 8423unicode_freelistsize(PyUnicodeObject *self) 8424{ 8425 return PyLong_FromLong(numfree); 8426} 8427#endif 8428 8429PyDoc_STRVAR(startswith__doc__, 8430 "S.startswith(prefix[, start[, end]]) -> bool\n\ 8431\n\ 8432Return True if S starts with the specified prefix, False otherwise.\n\ 8433With optional start, test S beginning at that position.\n\ 8434With optional end, stop comparing S at that position.\n\ 8435prefix can also be a tuple of strings to try."); 8436 8437static PyObject * 8438unicode_startswith(PyUnicodeObject *self, 8439 PyObject *args) 8440{ 8441 PyObject *subobj; 8442 PyUnicodeObject *substring; 8443 Py_ssize_t start = 0; 8444 Py_ssize_t end = PY_SSIZE_T_MAX; 8445 int result; 8446 8447 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8448 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8449 return NULL; 8450 if (PyTuple_Check(subobj)) { 8451 Py_ssize_t i; 8452 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8453 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8454 PyTuple_GET_ITEM(subobj, i)); 8455 if (substring == NULL) 8456 return NULL; 8457 result = tailmatch(self, substring, start, end, -1); 8458 Py_DECREF(substring); 8459 if (result) { 8460 Py_RETURN_TRUE; 8461 } 8462 } 8463 /* nothing matched */ 8464 Py_RETURN_FALSE; 8465 } 8466 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8467 if (substring == NULL) 8468 return NULL; 8469 result = tailmatch(self, substring, start, end, -1); 8470 Py_DECREF(substring); 8471 return PyBool_FromLong(result); 8472} 8473 8474 8475PyDoc_STRVAR(endswith__doc__, 8476 "S.endswith(suffix[, start[, end]]) -> bool\n\ 8477\n\ 8478Return True if S ends with the specified suffix, False otherwise.\n\ 8479With optional start, test S beginning at that position.\n\ 8480With optional end, stop comparing S at that position.\n\ 8481suffix can also be a tuple of strings to try."); 8482 8483static PyObject * 8484unicode_endswith(PyUnicodeObject *self, 8485 PyObject *args) 8486{ 8487 PyObject *subobj; 8488 PyUnicodeObject *substring; 8489 Py_ssize_t start = 0; 8490 Py_ssize_t end = PY_SSIZE_T_MAX; 8491 int result; 8492 8493 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8494 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8495 return NULL; 8496 if (PyTuple_Check(subobj)) { 8497 Py_ssize_t i; 8498 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8499 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8500 PyTuple_GET_ITEM(subobj, i)); 8501 if (substring == NULL) 8502 return NULL; 8503 result = tailmatch(self, substring, start, end, +1); 8504 Py_DECREF(substring); 8505 if (result) { 8506 Py_RETURN_TRUE; 8507 } 8508 } 8509 Py_RETURN_FALSE; 8510 } 8511 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8512 if (substring == NULL) 8513 return NULL; 8514 8515 result = tailmatch(self, substring, start, end, +1); 8516 Py_DECREF(substring); 8517 return PyBool_FromLong(result); 8518} 8519 8520#include "stringlib/string_format.h" 8521 8522PyDoc_STRVAR(format__doc__, 8523 "S.format(*args, **kwargs) -> str\n\ 8524\n\ 8525"); 8526 8527static PyObject * 8528unicode__format__(PyObject* self, PyObject* args) 8529{ 8530 PyObject *format_spec; 8531 8532 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 8533 return NULL; 8534 8535 return _PyUnicode_FormatAdvanced(self, 8536 PyUnicode_AS_UNICODE(format_spec), 8537 PyUnicode_GET_SIZE(format_spec)); 8538} 8539 8540PyDoc_STRVAR(p_format__doc__, 8541 "S.__format__(format_spec) -> str\n\ 8542\n\ 8543"); 8544 8545static PyObject * 8546unicode__sizeof__(PyUnicodeObject *v) 8547{ 8548 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 8549 sizeof(Py_UNICODE) * (v->length + 1)); 8550} 8551 8552PyDoc_STRVAR(sizeof__doc__, 8553 "S.__sizeof__() -> size of S in memory, in bytes"); 8554 8555static PyObject * 8556unicode_getnewargs(PyUnicodeObject *v) 8557{ 8558 return Py_BuildValue("(u#)", v->str, v->length); 8559} 8560 8561 8562static PyMethodDef unicode_methods[] = { 8563 8564 /* Order is according to common usage: often used methods should 8565 appear first, since lookup is done sequentially. */ 8566 8567 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 8568 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8569 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8570 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8571 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8572 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8573 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8574 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8575 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8576 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8577 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8578 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8579 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8580 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8581 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8582 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8583 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8584 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8585 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8586 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8587 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8588 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8589 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8590 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8591 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8592 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8593 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8594 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8595 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8596 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8597 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8598 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8599 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8600 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8601 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8602 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8603 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8604 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8605 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 8606 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8607 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8608 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 8609 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8610 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8611 {"maketrans", (PyCFunction) unicode_maketrans, 8612 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8613 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 8614#if 0 8615 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8616#endif 8617 8618#if 0 8619 /* This one is just used for debugging the implementation. */ 8620 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8621#endif 8622 8623 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8624 {NULL, NULL} 8625}; 8626 8627static PyObject * 8628unicode_mod(PyObject *v, PyObject *w) 8629{ 8630 if (!PyUnicode_Check(v)) { 8631 Py_INCREF(Py_NotImplemented); 8632 return Py_NotImplemented; 8633 } 8634 return PyUnicode_Format(v, w); 8635} 8636 8637static PyNumberMethods unicode_as_number = { 8638 0, /*nb_add*/ 8639 0, /*nb_subtract*/ 8640 0, /*nb_multiply*/ 8641 unicode_mod, /*nb_remainder*/ 8642}; 8643 8644static PySequenceMethods unicode_as_sequence = { 8645 (lenfunc) unicode_length, /* sq_length */ 8646 PyUnicode_Concat, /* sq_concat */ 8647 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8648 (ssizeargfunc) unicode_getitem, /* sq_item */ 8649 0, /* sq_slice */ 8650 0, /* sq_ass_item */ 8651 0, /* sq_ass_slice */ 8652 PyUnicode_Contains, /* sq_contains */ 8653}; 8654 8655static PyObject* 8656unicode_subscript(PyUnicodeObject* self, PyObject* item) 8657{ 8658 if (PyIndex_Check(item)) { 8659 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8660 if (i == -1 && PyErr_Occurred()) 8661 return NULL; 8662 if (i < 0) 8663 i += PyUnicode_GET_SIZE(self); 8664 return unicode_getitem(self, i); 8665 } else if (PySlice_Check(item)) { 8666 Py_ssize_t start, stop, step, slicelength, cur, i; 8667 Py_UNICODE* source_buf; 8668 Py_UNICODE* result_buf; 8669 PyObject* result; 8670 8671 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8672 &start, &stop, &step, &slicelength) < 0) { 8673 return NULL; 8674 } 8675 8676 if (slicelength <= 0) { 8677 return PyUnicode_FromUnicode(NULL, 0); 8678 } else if (start == 0 && step == 1 && slicelength == self->length && 8679 PyUnicode_CheckExact(self)) { 8680 Py_INCREF(self); 8681 return (PyObject *)self; 8682 } else if (step == 1) { 8683 return PyUnicode_FromUnicode(self->str + start, slicelength); 8684 } else { 8685 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8686 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8687 sizeof(Py_UNICODE)); 8688 8689 if (result_buf == NULL) 8690 return PyErr_NoMemory(); 8691 8692 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8693 result_buf[i] = source_buf[cur]; 8694 } 8695 8696 result = PyUnicode_FromUnicode(result_buf, slicelength); 8697 PyObject_FREE(result_buf); 8698 return result; 8699 } 8700 } else { 8701 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8702 return NULL; 8703 } 8704} 8705 8706static PyMappingMethods unicode_as_mapping = { 8707 (lenfunc)unicode_length, /* mp_length */ 8708 (binaryfunc)unicode_subscript, /* mp_subscript */ 8709 (objobjargproc)0, /* mp_ass_subscript */ 8710}; 8711 8712 8713/* Helpers for PyUnicode_Format() */ 8714 8715static PyObject * 8716getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8717{ 8718 Py_ssize_t argidx = *p_argidx; 8719 if (argidx < arglen) { 8720 (*p_argidx)++; 8721 if (arglen < 0) 8722 return args; 8723 else 8724 return PyTuple_GetItem(args, argidx); 8725 } 8726 PyErr_SetString(PyExc_TypeError, 8727 "not enough arguments for format string"); 8728 return NULL; 8729} 8730 8731static Py_ssize_t 8732strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8733{ 8734 register Py_ssize_t i; 8735 Py_ssize_t len = strlen(charbuffer); 8736 for (i = len - 1; i >= 0; i--) 8737 buffer[i] = (Py_UNICODE) charbuffer[i]; 8738 8739 return len; 8740} 8741 8742static int 8743doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8744{ 8745 Py_ssize_t result; 8746 8747 PyOS_ascii_formatd((char *)buffer, len, format, x); 8748 result = strtounicode(buffer, (char *)buffer); 8749 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8750} 8751 8752#if 0 8753static int 8754longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8755{ 8756 Py_ssize_t result; 8757 8758 PyOS_snprintf((char *)buffer, len, format, x); 8759 result = strtounicode(buffer, (char *)buffer); 8760 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8761} 8762#endif 8763 8764/* XXX To save some code duplication, formatfloat/long/int could have been 8765 shared with stringobject.c, converting from 8-bit to Unicode after the 8766 formatting is done. */ 8767 8768static int 8769formatfloat(Py_UNICODE *buf, 8770 size_t buflen, 8771 int flags, 8772 int prec, 8773 int type, 8774 PyObject *v) 8775{ 8776 /* fmt = '%#.' + `prec` + `type` 8777 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8778 char fmt[20]; 8779 double x; 8780 8781 x = PyFloat_AsDouble(v); 8782 if (x == -1.0 && PyErr_Occurred()) 8783 return -1; 8784 if (prec < 0) 8785 prec = 6; 8786 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8787 type = 'g'; 8788 /* Worst case length calc to ensure no buffer overrun: 8789 8790 'g' formats: 8791 fmt = %#.<prec>g 8792 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8793 for any double rep.) 8794 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8795 8796 'f' formats: 8797 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8798 len = 1 + 50 + 1 + prec = 52 + prec 8799 8800 If prec=0 the effective precision is 1 (the leading digit is 8801 always given), therefore increase the length by one. 8802 8803 */ 8804 if (((type == 'g' || type == 'G') && 8805 buflen <= (size_t)10 + (size_t)prec) || 8806 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8807 PyErr_SetString(PyExc_OverflowError, 8808 "formatted float is too long (precision too large?)"); 8809 return -1; 8810 } 8811 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8812 (flags&F_ALT) ? "#" : "", 8813 prec, type); 8814 return doubletounicode(buf, buflen, fmt, x); 8815} 8816 8817static PyObject* 8818formatlong(PyObject *val, int flags, int prec, int type) 8819{ 8820 char *buf; 8821 int len; 8822 PyObject *str; /* temporary string object. */ 8823 PyObject *result; 8824 8825 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 8826 if (!str) 8827 return NULL; 8828 result = PyUnicode_FromStringAndSize(buf, len); 8829 Py_DECREF(str); 8830 return result; 8831} 8832 8833#if 0 8834static int 8835formatint(Py_UNICODE *buf, 8836 size_t buflen, 8837 int flags, 8838 int prec, 8839 int type, 8840 PyObject *v) 8841{ 8842 /* fmt = '%#.' + `prec` + 'l' + `type` 8843 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8844 * + 1 + 1 8845 * = 24 8846 */ 8847 char fmt[64]; /* plenty big enough! */ 8848 char *sign; 8849 long x; 8850 8851 x = PyLong_AsLong(v); 8852 if (x == -1 && PyErr_Occurred()) 8853 return -1; 8854 if (x < 0 && type == 'u') { 8855 type = 'd'; 8856 } 8857 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8858 sign = "-"; 8859 else 8860 sign = ""; 8861 if (prec < 0) 8862 prec = 1; 8863 8864 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8865 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8866 */ 8867 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8868 PyErr_SetString(PyExc_OverflowError, 8869 "formatted integer is too long (precision too large?)"); 8870 return -1; 8871 } 8872 8873 if ((flags & F_ALT) && 8874 (type == 'x' || type == 'X' || type == 'o')) { 8875 /* When converting under %#o, %#x or %#X, there are a number 8876 * of issues that cause pain: 8877 * - for %#o, we want a different base marker than C 8878 * - when 0 is being converted, the C standard leaves off 8879 * the '0x' or '0X', which is inconsistent with other 8880 * %#x/%#X conversions and inconsistent with Python's 8881 * hex() function 8882 * - there are platforms that violate the standard and 8883 * convert 0 with the '0x' or '0X' 8884 * (Metrowerks, Compaq Tru64) 8885 * - there are platforms that give '0x' when converting 8886 * under %#X, but convert 0 in accordance with the 8887 * standard (OS/2 EMX) 8888 * 8889 * We can achieve the desired consistency by inserting our 8890 * own '0x' or '0X' prefix, and substituting %x/%X in place 8891 * of %#x/%#X. 8892 * 8893 * Note that this is the same approach as used in 8894 * formatint() in stringobject.c 8895 */ 8896 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8897 sign, type, prec, type); 8898 } 8899 else { 8900 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8901 sign, (flags&F_ALT) ? "#" : "", 8902 prec, type); 8903 } 8904 if (sign[0]) 8905 return longtounicode(buf, buflen, fmt, -x); 8906 else 8907 return longtounicode(buf, buflen, fmt, x); 8908} 8909#endif 8910 8911static int 8912formatchar(Py_UNICODE *buf, 8913 size_t buflen, 8914 PyObject *v) 8915{ 8916 /* presume that the buffer is at least 3 characters long */ 8917 if (PyUnicode_Check(v)) { 8918 if (PyUnicode_GET_SIZE(v) == 1) { 8919 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8920 buf[1] = '\0'; 8921 return 1; 8922 } 8923#ifndef Py_UNICODE_WIDE 8924 if (PyUnicode_GET_SIZE(v) == 2) { 8925 /* Decode a valid surrogate pair */ 8926 int c0 = PyUnicode_AS_UNICODE(v)[0]; 8927 int c1 = PyUnicode_AS_UNICODE(v)[1]; 8928 if (0xD800 <= c0 && c0 <= 0xDBFF && 8929 0xDC00 <= c1 && c1 <= 0xDFFF) { 8930 buf[0] = c0; 8931 buf[1] = c1; 8932 buf[2] = '\0'; 8933 return 2; 8934 } 8935 } 8936#endif 8937 goto onError; 8938 } 8939 else { 8940 /* Integer input truncated to a character */ 8941 long x; 8942 x = PyLong_AsLong(v); 8943 if (x == -1 && PyErr_Occurred()) 8944 goto onError; 8945 8946 if (x < 0 || x > 0x10ffff) { 8947 PyErr_SetString(PyExc_OverflowError, 8948 "%c arg not in range(0x110000)"); 8949 return -1; 8950 } 8951 8952#ifndef Py_UNICODE_WIDE 8953 if (x > 0xffff) { 8954 x -= 0x10000; 8955 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 8956 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 8957 return 2; 8958 } 8959#endif 8960 buf[0] = (Py_UNICODE) x; 8961 buf[1] = '\0'; 8962 return 1; 8963 } 8964 8965 onError: 8966 PyErr_SetString(PyExc_TypeError, 8967 "%c requires int or char"); 8968 return -1; 8969} 8970 8971/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8972 8973 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8974 chars are formatted. XXX This is a magic number. Each formatting 8975 routine does bounds checking to ensure no overflow, but a better 8976 solution may be to malloc a buffer of appropriate size for each 8977 format. For now, the current solution is sufficient. 8978*/ 8979#define FORMATBUFLEN (size_t)120 8980 8981PyObject *PyUnicode_Format(PyObject *format, 8982 PyObject *args) 8983{ 8984 Py_UNICODE *fmt, *res; 8985 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8986 int args_owned = 0; 8987 PyUnicodeObject *result = NULL; 8988 PyObject *dict = NULL; 8989 PyObject *uformat; 8990 8991 if (format == NULL || args == NULL) { 8992 PyErr_BadInternalCall(); 8993 return NULL; 8994 } 8995 uformat = PyUnicode_FromObject(format); 8996 if (uformat == NULL) 8997 return NULL; 8998 fmt = PyUnicode_AS_UNICODE(uformat); 8999 fmtcnt = PyUnicode_GET_SIZE(uformat); 9000 9001 reslen = rescnt = fmtcnt + 100; 9002 result = _PyUnicode_New(reslen); 9003 if (result == NULL) 9004 goto onError; 9005 res = PyUnicode_AS_UNICODE(result); 9006 9007 if (PyTuple_Check(args)) { 9008 arglen = PyTuple_Size(args); 9009 argidx = 0; 9010 } 9011 else { 9012 arglen = -1; 9013 argidx = -2; 9014 } 9015 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9016 !PyUnicode_Check(args)) 9017 dict = args; 9018 9019 while (--fmtcnt >= 0) { 9020 if (*fmt != '%') { 9021 if (--rescnt < 0) { 9022 rescnt = fmtcnt + 100; 9023 reslen += rescnt; 9024 if (_PyUnicode_Resize(&result, reslen) < 0) 9025 goto onError; 9026 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9027 --rescnt; 9028 } 9029 *res++ = *fmt++; 9030 } 9031 else { 9032 /* Got a format specifier */ 9033 int flags = 0; 9034 Py_ssize_t width = -1; 9035 int prec = -1; 9036 Py_UNICODE c = '\0'; 9037 Py_UNICODE fill; 9038 int isnumok; 9039 PyObject *v = NULL; 9040 PyObject *temp = NULL; 9041 Py_UNICODE *pbuf; 9042 Py_UNICODE sign; 9043 Py_ssize_t len; 9044 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 9045 9046 fmt++; 9047 if (*fmt == '(') { 9048 Py_UNICODE *keystart; 9049 Py_ssize_t keylen; 9050 PyObject *key; 9051 int pcount = 1; 9052 9053 if (dict == NULL) { 9054 PyErr_SetString(PyExc_TypeError, 9055 "format requires a mapping"); 9056 goto onError; 9057 } 9058 ++fmt; 9059 --fmtcnt; 9060 keystart = fmt; 9061 /* Skip over balanced parentheses */ 9062 while (pcount > 0 && --fmtcnt >= 0) { 9063 if (*fmt == ')') 9064 --pcount; 9065 else if (*fmt == '(') 9066 ++pcount; 9067 fmt++; 9068 } 9069 keylen = fmt - keystart - 1; 9070 if (fmtcnt < 0 || pcount > 0) { 9071 PyErr_SetString(PyExc_ValueError, 9072 "incomplete format key"); 9073 goto onError; 9074 } 9075#if 0 9076 /* keys are converted to strings using UTF-8 and 9077 then looked up since Python uses strings to hold 9078 variables names etc. in its namespaces and we 9079 wouldn't want to break common idioms. */ 9080 key = PyUnicode_EncodeUTF8(keystart, 9081 keylen, 9082 NULL); 9083#else 9084 key = PyUnicode_FromUnicode(keystart, keylen); 9085#endif 9086 if (key == NULL) 9087 goto onError; 9088 if (args_owned) { 9089 Py_DECREF(args); 9090 args_owned = 0; 9091 } 9092 args = PyObject_GetItem(dict, key); 9093 Py_DECREF(key); 9094 if (args == NULL) { 9095 goto onError; 9096 } 9097 args_owned = 1; 9098 arglen = -1; 9099 argidx = -2; 9100 } 9101 while (--fmtcnt >= 0) { 9102 switch (c = *fmt++) { 9103 case '-': flags |= F_LJUST; continue; 9104 case '+': flags |= F_SIGN; continue; 9105 case ' ': flags |= F_BLANK; continue; 9106 case '#': flags |= F_ALT; continue; 9107 case '0': flags |= F_ZERO; continue; 9108 } 9109 break; 9110 } 9111 if (c == '*') { 9112 v = getnextarg(args, arglen, &argidx); 9113 if (v == NULL) 9114 goto onError; 9115 if (!PyLong_Check(v)) { 9116 PyErr_SetString(PyExc_TypeError, 9117 "* wants int"); 9118 goto onError; 9119 } 9120 width = PyLong_AsLong(v); 9121 if (width == -1 && PyErr_Occurred()) 9122 goto onError; 9123 if (width < 0) { 9124 flags |= F_LJUST; 9125 width = -width; 9126 } 9127 if (--fmtcnt >= 0) 9128 c = *fmt++; 9129 } 9130 else if (c >= '0' && c <= '9') { 9131 width = c - '0'; 9132 while (--fmtcnt >= 0) { 9133 c = *fmt++; 9134 if (c < '0' || c > '9') 9135 break; 9136 if ((width*10) / 10 != width) { 9137 PyErr_SetString(PyExc_ValueError, 9138 "width too big"); 9139 goto onError; 9140 } 9141 width = width*10 + (c - '0'); 9142 } 9143 } 9144 if (c == '.') { 9145 prec = 0; 9146 if (--fmtcnt >= 0) 9147 c = *fmt++; 9148 if (c == '*') { 9149 v = getnextarg(args, arglen, &argidx); 9150 if (v == NULL) 9151 goto onError; 9152 if (!PyLong_Check(v)) { 9153 PyErr_SetString(PyExc_TypeError, 9154 "* wants int"); 9155 goto onError; 9156 } 9157 prec = PyLong_AsLong(v); 9158 if (prec == -1 && PyErr_Occurred()) 9159 goto onError; 9160 if (prec < 0) 9161 prec = 0; 9162 if (--fmtcnt >= 0) 9163 c = *fmt++; 9164 } 9165 else if (c >= '0' && c <= '9') { 9166 prec = c - '0'; 9167 while (--fmtcnt >= 0) { 9168 c = Py_CHARMASK(*fmt++); 9169 if (c < '0' || c > '9') 9170 break; 9171 if ((prec*10) / 10 != prec) { 9172 PyErr_SetString(PyExc_ValueError, 9173 "prec too big"); 9174 goto onError; 9175 } 9176 prec = prec*10 + (c - '0'); 9177 } 9178 } 9179 } /* prec */ 9180 if (fmtcnt >= 0) { 9181 if (c == 'h' || c == 'l' || c == 'L') { 9182 if (--fmtcnt >= 0) 9183 c = *fmt++; 9184 } 9185 } 9186 if (fmtcnt < 0) { 9187 PyErr_SetString(PyExc_ValueError, 9188 "incomplete format"); 9189 goto onError; 9190 } 9191 if (c != '%') { 9192 v = getnextarg(args, arglen, &argidx); 9193 if (v == NULL) 9194 goto onError; 9195 } 9196 sign = 0; 9197 fill = ' '; 9198 switch (c) { 9199 9200 case '%': 9201 pbuf = formatbuf; 9202 /* presume that buffer length is at least 1 */ 9203 pbuf[0] = '%'; 9204 len = 1; 9205 break; 9206 9207 case 's': 9208 case 'r': 9209 case 'a': 9210 if (PyUnicode_Check(v) && c == 's') { 9211 temp = v; 9212 Py_INCREF(temp); 9213 } 9214 else { 9215 if (c == 's') 9216 temp = PyObject_Str(v); 9217 else if (c == 'r') 9218 temp = PyObject_Repr(v); 9219 else 9220 temp = PyObject_ASCII(v); 9221 if (temp == NULL) 9222 goto onError; 9223 if (PyUnicode_Check(temp)) 9224 /* nothing to do */; 9225 else { 9226 Py_DECREF(temp); 9227 PyErr_SetString(PyExc_TypeError, 9228 "%s argument has non-string str()"); 9229 goto onError; 9230 } 9231 } 9232 pbuf = PyUnicode_AS_UNICODE(temp); 9233 len = PyUnicode_GET_SIZE(temp); 9234 if (prec >= 0 && len > prec) 9235 len = prec; 9236 break; 9237 9238 case 'i': 9239 case 'd': 9240 case 'u': 9241 case 'o': 9242 case 'x': 9243 case 'X': 9244 if (c == 'i') 9245 c = 'd'; 9246 isnumok = 0; 9247 if (PyNumber_Check(v)) { 9248 PyObject *iobj=NULL; 9249 9250 if (PyLong_Check(v)) { 9251 iobj = v; 9252 Py_INCREF(iobj); 9253 } 9254 else { 9255 iobj = PyNumber_Long(v); 9256 } 9257 if (iobj!=NULL) { 9258 if (PyLong_Check(iobj)) { 9259 isnumok = 1; 9260 temp = formatlong(iobj, flags, prec, c); 9261 Py_DECREF(iobj); 9262 if (!temp) 9263 goto onError; 9264 pbuf = PyUnicode_AS_UNICODE(temp); 9265 len = PyUnicode_GET_SIZE(temp); 9266 sign = 1; 9267 } 9268 else { 9269 Py_DECREF(iobj); 9270 } 9271 } 9272 } 9273 if (!isnumok) { 9274 PyErr_Format(PyExc_TypeError, 9275 "%%%c format: a number is required, " 9276 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9277 goto onError; 9278 } 9279 if (flags & F_ZERO) 9280 fill = '0'; 9281 break; 9282 9283 case 'e': 9284 case 'E': 9285 case 'f': 9286 case 'F': 9287 case 'g': 9288 case 'G': 9289 if (c == 'F') 9290 c = 'f'; 9291 pbuf = formatbuf; 9292 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 9293 flags, prec, c, v); 9294 if (len < 0) 9295 goto onError; 9296 sign = 1; 9297 if (flags & F_ZERO) 9298 fill = '0'; 9299 break; 9300 9301 case 'c': 9302 pbuf = formatbuf; 9303 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9304 if (len < 0) 9305 goto onError; 9306 break; 9307 9308 default: 9309 PyErr_Format(PyExc_ValueError, 9310 "unsupported format character '%c' (0x%x) " 9311 "at index %zd", 9312 (31<=c && c<=126) ? (char)c : '?', 9313 (int)c, 9314 (Py_ssize_t)(fmt - 1 - 9315 PyUnicode_AS_UNICODE(uformat))); 9316 goto onError; 9317 } 9318 if (sign) { 9319 if (*pbuf == '-' || *pbuf == '+') { 9320 sign = *pbuf++; 9321 len--; 9322 } 9323 else if (flags & F_SIGN) 9324 sign = '+'; 9325 else if (flags & F_BLANK) 9326 sign = ' '; 9327 else 9328 sign = 0; 9329 } 9330 if (width < len) 9331 width = len; 9332 if (rescnt - (sign != 0) < width) { 9333 reslen -= rescnt; 9334 rescnt = width + fmtcnt + 100; 9335 reslen += rescnt; 9336 if (reslen < 0) { 9337 Py_XDECREF(temp); 9338 PyErr_NoMemory(); 9339 goto onError; 9340 } 9341 if (_PyUnicode_Resize(&result, reslen) < 0) { 9342 Py_XDECREF(temp); 9343 goto onError; 9344 } 9345 res = PyUnicode_AS_UNICODE(result) 9346 + reslen - rescnt; 9347 } 9348 if (sign) { 9349 if (fill != ' ') 9350 *res++ = sign; 9351 rescnt--; 9352 if (width > len) 9353 width--; 9354 } 9355 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9356 assert(pbuf[0] == '0'); 9357 assert(pbuf[1] == c); 9358 if (fill != ' ') { 9359 *res++ = *pbuf++; 9360 *res++ = *pbuf++; 9361 } 9362 rescnt -= 2; 9363 width -= 2; 9364 if (width < 0) 9365 width = 0; 9366 len -= 2; 9367 } 9368 if (width > len && !(flags & F_LJUST)) { 9369 do { 9370 --rescnt; 9371 *res++ = fill; 9372 } while (--width > len); 9373 } 9374 if (fill == ' ') { 9375 if (sign) 9376 *res++ = sign; 9377 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9378 assert(pbuf[0] == '0'); 9379 assert(pbuf[1] == c); 9380 *res++ = *pbuf++; 9381 *res++ = *pbuf++; 9382 } 9383 } 9384 Py_UNICODE_COPY(res, pbuf, len); 9385 res += len; 9386 rescnt -= len; 9387 while (--width >= len) { 9388 --rescnt; 9389 *res++ = ' '; 9390 } 9391 if (dict && (argidx < arglen) && c != '%') { 9392 PyErr_SetString(PyExc_TypeError, 9393 "not all arguments converted during string formatting"); 9394 Py_XDECREF(temp); 9395 goto onError; 9396 } 9397 Py_XDECREF(temp); 9398 } /* '%' */ 9399 } /* until end */ 9400 if (argidx < arglen && !dict) { 9401 PyErr_SetString(PyExc_TypeError, 9402 "not all arguments converted during string formatting"); 9403 goto onError; 9404 } 9405 9406 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9407 goto onError; 9408 if (args_owned) { 9409 Py_DECREF(args); 9410 } 9411 Py_DECREF(uformat); 9412 return (PyObject *)result; 9413 9414 onError: 9415 Py_XDECREF(result); 9416 Py_DECREF(uformat); 9417 if (args_owned) { 9418 Py_DECREF(args); 9419 } 9420 return NULL; 9421} 9422 9423static PyObject * 9424unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9425 9426static PyObject * 9427unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9428{ 9429 PyObject *x = NULL; 9430 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9431 char *encoding = NULL; 9432 char *errors = NULL; 9433 9434 if (type != &PyUnicode_Type) 9435 return unicode_subtype_new(type, args, kwds); 9436 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9437 kwlist, &x, &encoding, &errors)) 9438 return NULL; 9439 if (x == NULL) 9440 return (PyObject *)_PyUnicode_New(0); 9441 if (encoding == NULL && errors == NULL) 9442 return PyObject_Str(x); 9443 else 9444 return PyUnicode_FromEncodedObject(x, encoding, errors); 9445} 9446 9447static PyObject * 9448unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9449{ 9450 PyUnicodeObject *tmp, *pnew; 9451 Py_ssize_t n; 9452 9453 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9454 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9455 if (tmp == NULL) 9456 return NULL; 9457 assert(PyUnicode_Check(tmp)); 9458 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9459 if (pnew == NULL) { 9460 Py_DECREF(tmp); 9461 return NULL; 9462 } 9463 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9464 if (pnew->str == NULL) { 9465 _Py_ForgetReference((PyObject *)pnew); 9466 PyObject_Del(pnew); 9467 Py_DECREF(tmp); 9468 return PyErr_NoMemory(); 9469 } 9470 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9471 pnew->length = n; 9472 pnew->hash = tmp->hash; 9473 Py_DECREF(tmp); 9474 return (PyObject *)pnew; 9475} 9476 9477PyDoc_STRVAR(unicode_doc, 9478 "str(string[, encoding[, errors]]) -> str\n\ 9479\n\ 9480Create a new string object from the given encoded string.\n\ 9481encoding defaults to the current default string encoding.\n\ 9482errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9483 9484static PyObject *unicode_iter(PyObject *seq); 9485 9486PyTypeObject PyUnicode_Type = { 9487 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9488 "str", /* tp_name */ 9489 sizeof(PyUnicodeObject), /* tp_size */ 9490 0, /* tp_itemsize */ 9491 /* Slots */ 9492 (destructor)unicode_dealloc, /* tp_dealloc */ 9493 0, /* tp_print */ 9494 0, /* tp_getattr */ 9495 0, /* tp_setattr */ 9496 0, /* tp_reserved */ 9497 unicode_repr, /* tp_repr */ 9498 &unicode_as_number, /* tp_as_number */ 9499 &unicode_as_sequence, /* tp_as_sequence */ 9500 &unicode_as_mapping, /* tp_as_mapping */ 9501 (hashfunc) unicode_hash, /* tp_hash*/ 9502 0, /* tp_call*/ 9503 (reprfunc) unicode_str, /* tp_str */ 9504 PyObject_GenericGetAttr, /* tp_getattro */ 9505 0, /* tp_setattro */ 9506 0, /* tp_as_buffer */ 9507 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9508 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9509 unicode_doc, /* tp_doc */ 9510 0, /* tp_traverse */ 9511 0, /* tp_clear */ 9512 PyUnicode_RichCompare, /* tp_richcompare */ 9513 0, /* tp_weaklistoffset */ 9514 unicode_iter, /* tp_iter */ 9515 0, /* tp_iternext */ 9516 unicode_methods, /* tp_methods */ 9517 0, /* tp_members */ 9518 0, /* tp_getset */ 9519 &PyBaseObject_Type, /* tp_base */ 9520 0, /* tp_dict */ 9521 0, /* tp_descr_get */ 9522 0, /* tp_descr_set */ 9523 0, /* tp_dictoffset */ 9524 0, /* tp_init */ 9525 0, /* tp_alloc */ 9526 unicode_new, /* tp_new */ 9527 PyObject_Del, /* tp_free */ 9528}; 9529 9530/* Initialize the Unicode implementation */ 9531 9532void _PyUnicode_Init(void) 9533{ 9534 int i; 9535 9536 /* XXX - move this array to unicodectype.c ? */ 9537 Py_UNICODE linebreak[] = { 9538 0x000A, /* LINE FEED */ 9539 0x000D, /* CARRIAGE RETURN */ 9540 0x001C, /* FILE SEPARATOR */ 9541 0x001D, /* GROUP SEPARATOR */ 9542 0x001E, /* RECORD SEPARATOR */ 9543 0x0085, /* NEXT LINE */ 9544 0x2028, /* LINE SEPARATOR */ 9545 0x2029, /* PARAGRAPH SEPARATOR */ 9546 }; 9547 9548 /* Init the implementation */ 9549 free_list = NULL; 9550 numfree = 0; 9551 unicode_empty = _PyUnicode_New(0); 9552 if (!unicode_empty) 9553 return; 9554 9555 for (i = 0; i < 256; i++) 9556 unicode_latin1[i] = NULL; 9557 if (PyType_Ready(&PyUnicode_Type) < 0) 9558 Py_FatalError("Can't initialize 'unicode'"); 9559 9560 /* initialize the linebreak bloom filter */ 9561 bloom_linebreak = make_bloom_mask( 9562 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9563 ); 9564 9565 PyType_Ready(&EncodingMapType); 9566} 9567 9568/* Finalize the Unicode implementation */ 9569 9570int 9571PyUnicode_ClearFreeList(void) 9572{ 9573 int freelist_size = numfree; 9574 PyUnicodeObject *u; 9575 9576 for (u = free_list; u != NULL;) { 9577 PyUnicodeObject *v = u; 9578 u = *(PyUnicodeObject **)u; 9579 if (v->str) 9580 PyObject_DEL(v->str); 9581 Py_XDECREF(v->defenc); 9582 PyObject_Del(v); 9583 numfree--; 9584 } 9585 free_list = NULL; 9586 assert(numfree == 0); 9587 return freelist_size; 9588} 9589 9590void 9591_PyUnicode_Fini(void) 9592{ 9593 int i; 9594 9595 Py_XDECREF(unicode_empty); 9596 unicode_empty = NULL; 9597 9598 for (i = 0; i < 256; i++) { 9599 if (unicode_latin1[i]) { 9600 Py_DECREF(unicode_latin1[i]); 9601 unicode_latin1[i] = NULL; 9602 } 9603 } 9604 (void)PyUnicode_ClearFreeList(); 9605} 9606 9607void 9608PyUnicode_InternInPlace(PyObject **p) 9609{ 9610 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9611 PyObject *t; 9612 if (s == NULL || !PyUnicode_Check(s)) 9613 Py_FatalError( 9614 "PyUnicode_InternInPlace: unicode strings only please!"); 9615 /* If it's a subclass, we don't really know what putting 9616 it in the interned dict might do. */ 9617 if (!PyUnicode_CheckExact(s)) 9618 return; 9619 if (PyUnicode_CHECK_INTERNED(s)) 9620 return; 9621 if (interned == NULL) { 9622 interned = PyDict_New(); 9623 if (interned == NULL) { 9624 PyErr_Clear(); /* Don't leave an exception */ 9625 return; 9626 } 9627 } 9628 /* It might be that the GetItem call fails even 9629 though the key is present in the dictionary, 9630 namely when this happens during a stack overflow. */ 9631 Py_ALLOW_RECURSION 9632 t = PyDict_GetItem(interned, (PyObject *)s); 9633 Py_END_ALLOW_RECURSION 9634 9635 if (t) { 9636 Py_INCREF(t); 9637 Py_DECREF(*p); 9638 *p = t; 9639 return; 9640 } 9641 9642 PyThreadState_GET()->recursion_critical = 1; 9643 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9644 PyErr_Clear(); 9645 PyThreadState_GET()->recursion_critical = 0; 9646 return; 9647 } 9648 PyThreadState_GET()->recursion_critical = 0; 9649 /* The two references in interned are not counted by refcnt. 9650 The deallocator will take care of this */ 9651 Py_REFCNT(s) -= 2; 9652 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9653} 9654 9655void 9656PyUnicode_InternImmortal(PyObject **p) 9657{ 9658 PyUnicode_InternInPlace(p); 9659 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9660 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9661 Py_INCREF(*p); 9662 } 9663} 9664 9665PyObject * 9666PyUnicode_InternFromString(const char *cp) 9667{ 9668 PyObject *s = PyUnicode_FromString(cp); 9669 if (s == NULL) 9670 return NULL; 9671 PyUnicode_InternInPlace(&s); 9672 return s; 9673} 9674 9675void _Py_ReleaseInternedUnicodeStrings(void) 9676{ 9677 PyObject *keys; 9678 PyUnicodeObject *s; 9679 Py_ssize_t i, n; 9680 Py_ssize_t immortal_size = 0, mortal_size = 0; 9681 9682 if (interned == NULL || !PyDict_Check(interned)) 9683 return; 9684 keys = PyDict_Keys(interned); 9685 if (keys == NULL || !PyList_Check(keys)) { 9686 PyErr_Clear(); 9687 return; 9688 } 9689 9690 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9691 detector, interned unicode strings are not forcibly deallocated; 9692 rather, we give them their stolen references back, and then clear 9693 and DECREF the interned dict. */ 9694 9695 n = PyList_GET_SIZE(keys); 9696 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9697 n); 9698 for (i = 0; i < n; i++) { 9699 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9700 switch (s->state) { 9701 case SSTATE_NOT_INTERNED: 9702 /* XXX Shouldn't happen */ 9703 break; 9704 case SSTATE_INTERNED_IMMORTAL: 9705 Py_REFCNT(s) += 1; 9706 immortal_size += s->length; 9707 break; 9708 case SSTATE_INTERNED_MORTAL: 9709 Py_REFCNT(s) += 2; 9710 mortal_size += s->length; 9711 break; 9712 default: 9713 Py_FatalError("Inconsistent interned string state."); 9714 } 9715 s->state = SSTATE_NOT_INTERNED; 9716 } 9717 fprintf(stderr, "total size of all interned strings: " 9718 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9719 "mortal/immortal\n", mortal_size, immortal_size); 9720 Py_DECREF(keys); 9721 PyDict_Clear(interned); 9722 Py_DECREF(interned); 9723 interned = NULL; 9724} 9725 9726 9727/********************* Unicode Iterator **************************/ 9728 9729typedef struct { 9730 PyObject_HEAD 9731 Py_ssize_t it_index; 9732 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9733} unicodeiterobject; 9734 9735static void 9736unicodeiter_dealloc(unicodeiterobject *it) 9737{ 9738 _PyObject_GC_UNTRACK(it); 9739 Py_XDECREF(it->it_seq); 9740 PyObject_GC_Del(it); 9741} 9742 9743static int 9744unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9745{ 9746 Py_VISIT(it->it_seq); 9747 return 0; 9748} 9749 9750static PyObject * 9751unicodeiter_next(unicodeiterobject *it) 9752{ 9753 PyUnicodeObject *seq; 9754 PyObject *item; 9755 9756 assert(it != NULL); 9757 seq = it->it_seq; 9758 if (seq == NULL) 9759 return NULL; 9760 assert(PyUnicode_Check(seq)); 9761 9762 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9763 item = PyUnicode_FromUnicode( 9764 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9765 if (item != NULL) 9766 ++it->it_index; 9767 return item; 9768 } 9769 9770 Py_DECREF(seq); 9771 it->it_seq = NULL; 9772 return NULL; 9773} 9774 9775static PyObject * 9776unicodeiter_len(unicodeiterobject *it) 9777{ 9778 Py_ssize_t len = 0; 9779 if (it->it_seq) 9780 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9781 return PyLong_FromSsize_t(len); 9782} 9783 9784PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9785 9786static PyMethodDef unicodeiter_methods[] = { 9787 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9788 length_hint_doc}, 9789 {NULL, NULL} /* sentinel */ 9790}; 9791 9792PyTypeObject PyUnicodeIter_Type = { 9793 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9794 "str_iterator", /* tp_name */ 9795 sizeof(unicodeiterobject), /* tp_basicsize */ 9796 0, /* tp_itemsize */ 9797 /* methods */ 9798 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9799 0, /* tp_print */ 9800 0, /* tp_getattr */ 9801 0, /* tp_setattr */ 9802 0, /* tp_reserved */ 9803 0, /* tp_repr */ 9804 0, /* tp_as_number */ 9805 0, /* tp_as_sequence */ 9806 0, /* tp_as_mapping */ 9807 0, /* tp_hash */ 9808 0, /* tp_call */ 9809 0, /* tp_str */ 9810 PyObject_GenericGetAttr, /* tp_getattro */ 9811 0, /* tp_setattro */ 9812 0, /* tp_as_buffer */ 9813 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9814 0, /* tp_doc */ 9815 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9816 0, /* tp_clear */ 9817 0, /* tp_richcompare */ 9818 0, /* tp_weaklistoffset */ 9819 PyObject_SelfIter, /* tp_iter */ 9820 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9821 unicodeiter_methods, /* tp_methods */ 9822 0, 9823}; 9824 9825static PyObject * 9826unicode_iter(PyObject *seq) 9827{ 9828 unicodeiterobject *it; 9829 9830 if (!PyUnicode_Check(seq)) { 9831 PyErr_BadInternalCall(); 9832 return NULL; 9833 } 9834 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9835 if (it == NULL) 9836 return NULL; 9837 it->it_index = 0; 9838 Py_INCREF(seq); 9839 it->it_seq = (PyUnicodeObject *)seq; 9840 _PyObject_GC_TRACK(it); 9841 return (PyObject *)it; 9842} 9843 9844size_t 9845Py_UNICODE_strlen(const Py_UNICODE *u) 9846{ 9847 int res = 0; 9848 while(*u++) 9849 res++; 9850 return res; 9851} 9852 9853Py_UNICODE* 9854Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9855{ 9856 Py_UNICODE *u = s1; 9857 while ((*u++ = *s2++)); 9858 return s1; 9859} 9860 9861Py_UNICODE* 9862Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9863{ 9864 Py_UNICODE *u = s1; 9865 while ((*u++ = *s2++)) 9866 if (n-- == 0) 9867 break; 9868 return s1; 9869} 9870 9871int 9872Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9873{ 9874 while (*s1 && *s2 && *s1 == *s2) 9875 s1++, s2++; 9876 if (*s1 && *s2) 9877 return (*s1 < *s2) ? -1 : +1; 9878 if (*s1) 9879 return 1; 9880 if (*s2) 9881 return -1; 9882 return 0; 9883} 9884 9885Py_UNICODE* 9886Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9887{ 9888 const Py_UNICODE *p; 9889 for (p = s; *p; p++) 9890 if (*p == c) 9891 return (Py_UNICODE*)p; 9892 return NULL; 9893} 9894 9895 9896#ifdef __cplusplus 9897} 9898#endif 9899 9900 9901/* 9902 Local variables: 9903 c-basic-offset: 4 9904 indent-tabs-mode: nil 9905 End: 9906*/ 9907