unicodeobject.c revision 6a27efa2d321c2b262c0cab3c2d4af3e2e8a9ead
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Default encoding to use and assume when NULL is passed as encoding 118 parameter; it is fixed to "utf-8". Always use the 119 PyUnicode_GetDefaultEncoding() API to access this global. 120 121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the 122 hard coded default! 123*/ 124static const char unicode_default_encoding[] = "utf-8"; 125 126/* Fast detection of the most frequent whitespace characters */ 127const unsigned char _Py_ascii_whitespace[] = { 128 0, 0, 0, 0, 0, 0, 0, 0, 129/* case 0x0009: * HORIZONTAL TABULATION */ 130/* case 0x000A: * LINE FEED */ 131/* case 0x000B: * VERTICAL TABULATION */ 132/* case 0x000C: * FORM FEED */ 133/* case 0x000D: * CARRIAGE RETURN */ 134 0, 1, 1, 1, 1, 1, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136/* case 0x001C: * FILE SEPARATOR */ 137/* case 0x001D: * GROUP SEPARATOR */ 138/* case 0x001E: * RECORD SEPARATOR */ 139/* case 0x001F: * UNIT SEPARATOR */ 140 0, 0, 0, 0, 1, 1, 1, 1, 141/* case 0x0020: * SPACE */ 142 1, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 147 0, 0, 0, 0, 0, 0, 0, 0, 148 0, 0, 0, 0, 0, 0, 0, 0, 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0 155}; 156 157/* Same for linebreaks */ 158static unsigned char ascii_linebreak[] = { 159 0, 0, 0, 0, 0, 0, 0, 0, 160/* 0x000A, * LINE FEED */ 161/* 0x000D, * CARRIAGE RETURN */ 162 0, 0, 1, 0, 0, 1, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164/* 0x001C, * FILE SEPARATOR */ 165/* 
0x001D, * GROUP SEPARATOR */ 166/* 0x001E, * RECORD SEPARATOR */ 167 0, 0, 0, 0, 1, 1, 1, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0 181}; 182 183 184Py_UNICODE 185PyUnicode_GetMax(void) 186{ 187#ifdef Py_UNICODE_WIDE 188 return 0x10FFFF; 189#else 190 /* This is actually an illegal character, so it should 191 not be passed to unichr. */ 192 return 0xFFFF; 193#endif 194} 195 196/* --- Bloom Filters ----------------------------------------------------- */ 197 198/* stuff to implement simple "bloom filters" for Unicode characters. 199 to keep things simple, we use a single bitmask, using the least 5 200 bits from each unicode characters as the bit index. */ 201 202/* the linebreak mask is set up by Unicode_Init below */ 203 204#define BLOOM_MASK unsigned long 205 206static BLOOM_MASK bloom_linebreak; 207 208#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 209 210#define BLOOM_LINEBREAK(ch) \ 211 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 213 214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 215{ 216 /* calculate simple bloom-style bitmask for a given unicode string */ 217 218 long mask; 219 Py_ssize_t i; 220 221 mask = 0; 222 for (i = 0; i < len; i++) 223 mask |= (1 << (ptr[i] & 0x1F)); 224 225 return mask; 226} 227 228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 229{ 230 Py_ssize_t i; 231 232 for (i = 0; i < setlen; i++) 233 if (set[i] == chr) 234 return 1; 235 236 return 0; 237} 238 239#define BLOOM_MEMBER(mask, chr, set, setlen)\ 240 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 241 242/* --- Unicode Object ----------------------------------------------------- */ 243 244static 245int unicode_resize(register PyUnicodeObject *unicode, 246 Py_ssize_t length) 247{ 248 void *oldstr; 249 250 /* Shortcut if there's nothing much to do. */ 251 if (unicode->length == length) 252 goto reset; 253 254 /* Resizing shared object (unicode_empty or single character 255 objects) in-place is not allowed. Use PyUnicode_Resize() 256 instead ! */ 257 258 if (unicode == unicode_empty || 259 (unicode->length == 1 && 260 unicode->str[0] < 256U && 261 unicode_latin1[unicode->str[0]] == unicode)) { 262 PyErr_SetString(PyExc_SystemError, 263 "can't resize shared str objects"); 264 return -1; 265 } 266 267 /* We allocate one more byte to make sure the string is Ux0000 terminated. 268 The overallocation is also used by fastsearch, which assumes that it's 269 safe to look at str[length] (without making any assumptions about what 270 it contains). 
*/ 271 272 oldstr = unicode->str; 273 unicode->str = PyObject_REALLOC(unicode->str, 274 sizeof(Py_UNICODE) * (length + 1)); 275 if (!unicode->str) { 276 unicode->str = (Py_UNICODE *)oldstr; 277 PyErr_NoMemory(); 278 return -1; 279 } 280 unicode->str[length] = 0; 281 unicode->length = length; 282 283 reset: 284 /* Reset the object caches */ 285 if (unicode->defenc) { 286 Py_DECREF(unicode->defenc); 287 unicode->defenc = NULL; 288 } 289 unicode->hash = -1; 290 291 return 0; 292} 293 294/* We allocate one more byte to make sure the string is 295 Ux0000 terminated; some code (e.g. new_identifier) 296 relies on that. 297 298 XXX This allocator could further be enhanced by assuring that the 299 free list never reduces its size below 1. 300 301*/ 302 303static 304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 305{ 306 register PyUnicodeObject *unicode; 307 308 /* Optimization for empty strings */ 309 if (length == 0 && unicode_empty != NULL) { 310 Py_INCREF(unicode_empty); 311 return unicode_empty; 312 } 313 314 /* Ensure we won't overflow the size. */ 315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 316 return (PyUnicodeObject *)PyErr_NoMemory(); 317 } 318 319 /* Unicode freelist & memory allocation */ 320 if (free_list) { 321 unicode = free_list; 322 free_list = *(PyUnicodeObject **)unicode; 323 numfree--; 324 if (unicode->str) { 325 /* Keep-Alive optimization: we only upsize the buffer, 326 never downsize it. 
*/ 327 if ((unicode->length < length) && 328 unicode_resize(unicode, length) < 0) { 329 PyObject_DEL(unicode->str); 330 unicode->str = NULL; 331 } 332 } 333 else { 334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 336 } 337 PyObject_INIT(unicode, &PyUnicode_Type); 338 } 339 else { 340 size_t new_size; 341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 342 if (unicode == NULL) 343 return NULL; 344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 346 } 347 348 if (!unicode->str) { 349 PyErr_NoMemory(); 350 goto onError; 351 } 352 /* Initialize the first element to guard against cases where 353 * the caller fails before initializing str -- unicode_resize() 354 * reads str[0], and the Keep-Alive optimization can keep memory 355 * allocated for str alive across a call to unicode_dealloc(unicode). 356 * We don't want unicode_resize to read uninitialized memory in 357 * that case. 
358 */ 359 unicode->str[0] = 0; 360 unicode->str[length] = 0; 361 unicode->length = length; 362 unicode->hash = -1; 363 unicode->state = 0; 364 unicode->defenc = NULL; 365 return unicode; 366 367 onError: 368 /* XXX UNREF/NEWREF interface should be more symmetrical */ 369 _Py_DEC_REFTOTAL; 370 _Py_ForgetReference((PyObject *)unicode); 371 PyObject_Del(unicode); 372 return NULL; 373} 374 375static 376void unicode_dealloc(register PyUnicodeObject *unicode) 377{ 378 switch (PyUnicode_CHECK_INTERNED(unicode)) { 379 case SSTATE_NOT_INTERNED: 380 break; 381 382 case SSTATE_INTERNED_MORTAL: 383 /* revive dead object temporarily for DelItem */ 384 Py_REFCNT(unicode) = 3; 385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 386 Py_FatalError( 387 "deletion of interned string failed"); 388 break; 389 390 case SSTATE_INTERNED_IMMORTAL: 391 Py_FatalError("Immortal interned string died."); 392 393 default: 394 Py_FatalError("Inconsistent interned string state."); 395 } 396 397 if (PyUnicode_CheckExact(unicode) && 398 numfree < PyUnicode_MAXFREELIST) { 399 /* Keep-Alive optimization */ 400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 401 PyObject_DEL(unicode->str); 402 unicode->str = NULL; 403 unicode->length = 0; 404 } 405 if (unicode->defenc) { 406 Py_DECREF(unicode->defenc); 407 unicode->defenc = NULL; 408 } 409 /* Add to free list */ 410 *(PyUnicodeObject **)unicode = free_list; 411 free_list = unicode; 412 numfree++; 413 } 414 else { 415 PyObject_DEL(unicode->str); 416 Py_XDECREF(unicode->defenc); 417 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 418 } 419} 420 421int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 422{ 423 register PyUnicodeObject *v; 424 425 /* Argument checks */ 426 if (unicode == NULL) { 427 PyErr_BadInternalCall(); 428 return -1; 429 } 430 v = (PyUnicodeObject *)*unicode; 431 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 432 PyErr_BadInternalCall(); 433 return -1; 434 } 435 436 /* Resizing unicode_empty 
and single character objects is not 437 possible since these are being shared. We simply return a fresh 438 copy with the same Unicode content. */ 439 if (v->length != length && 440 (v == unicode_empty || v->length == 1)) { 441 PyUnicodeObject *w = _PyUnicode_New(length); 442 if (w == NULL) 443 return -1; 444 Py_UNICODE_COPY(w->str, v->str, 445 length < v->length ? length : v->length); 446 Py_DECREF(*unicode); 447 *unicode = (PyObject *)w; 448 return 0; 449 } 450 451 /* Note that we don't have to modify *unicode for unshared Unicode 452 objects, since we can modify them in-place. */ 453 return unicode_resize(v, length); 454} 455 456/* Internal API for use in unicodeobject.c only ! */ 457#define _PyUnicode_Resize(unicodevar, length) \ 458 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 459 460PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 461 Py_ssize_t size) 462{ 463 PyUnicodeObject *unicode; 464 465 /* If the Unicode data is known at construction time, we can apply 466 some optimizations which share commonly used objects. 
*/ 467 if (u != NULL) { 468 469 /* Optimization for empty strings */ 470 if (size == 0 && unicode_empty != NULL) { 471 Py_INCREF(unicode_empty); 472 return (PyObject *)unicode_empty; 473 } 474 475 /* Single character Unicode objects in the Latin-1 range are 476 shared when using this constructor */ 477 if (size == 1 && *u < 256) { 478 unicode = unicode_latin1[*u]; 479 if (!unicode) { 480 unicode = _PyUnicode_New(1); 481 if (!unicode) 482 return NULL; 483 unicode->str[0] = *u; 484 unicode_latin1[*u] = unicode; 485 } 486 Py_INCREF(unicode); 487 return (PyObject *)unicode; 488 } 489 } 490 491 unicode = _PyUnicode_New(size); 492 if (!unicode) 493 return NULL; 494 495 /* Copy the Unicode data into the new object */ 496 if (u != NULL) 497 Py_UNICODE_COPY(unicode->str, u, size); 498 499 return (PyObject *)unicode; 500} 501 502PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 503{ 504 PyUnicodeObject *unicode; 505 506 if (size < 0) { 507 PyErr_SetString(PyExc_SystemError, 508 "Negative size passed to PyUnicode_FromStringAndSize"); 509 return NULL; 510 } 511 512 /* If the Unicode data is known at construction time, we can apply 513 some optimizations which share commonly used objects. 514 Also, this means the input must be UTF-8, so fall back to the 515 UTF-8 decoder at the end. */ 516 if (u != NULL) { 517 518 /* Optimization for empty strings */ 519 if (size == 0 && unicode_empty != NULL) { 520 Py_INCREF(unicode_empty); 521 return (PyObject *)unicode_empty; 522 } 523 524 /* Single characters are shared when using this constructor. 525 Restrict to ASCII, since the input must be UTF-8. 
*/ 526 if (size == 1 && Py_CHARMASK(*u) < 128) { 527 unicode = unicode_latin1[Py_CHARMASK(*u)]; 528 if (!unicode) { 529 unicode = _PyUnicode_New(1); 530 if (!unicode) 531 return NULL; 532 unicode->str[0] = Py_CHARMASK(*u); 533 unicode_latin1[Py_CHARMASK(*u)] = unicode; 534 } 535 Py_INCREF(unicode); 536 return (PyObject *)unicode; 537 } 538 539 return PyUnicode_DecodeUTF8(u, size, NULL); 540 } 541 542 unicode = _PyUnicode_New(size); 543 if (!unicode) 544 return NULL; 545 546 return (PyObject *)unicode; 547} 548 549PyObject *PyUnicode_FromString(const char *u) 550{ 551 size_t size = strlen(u); 552 if (size > PY_SSIZE_T_MAX) { 553 PyErr_SetString(PyExc_OverflowError, "input too long"); 554 return NULL; 555 } 556 557 return PyUnicode_FromStringAndSize(u, size); 558} 559 560#ifdef HAVE_WCHAR_H 561 562PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 563 Py_ssize_t size) 564{ 565 PyUnicodeObject *unicode; 566 567 if (w == NULL) { 568 if (size == 0) 569 return PyUnicode_FromStringAndSize(NULL, 0); 570 PyErr_BadInternalCall(); 571 return NULL; 572 } 573 574 if (size == -1) { 575 size = wcslen(w); 576 } 577 578 unicode = _PyUnicode_New(size); 579 if (!unicode) 580 return NULL; 581 582 /* Copy the wchar_t data into the new object */ 583#ifdef HAVE_USABLE_WCHAR_T 584 memcpy(unicode->str, w, size * sizeof(wchar_t)); 585#else 586 { 587 register Py_UNICODE *u; 588 register Py_ssize_t i; 589 u = PyUnicode_AS_UNICODE(unicode); 590 for (i = size; i > 0; i--) 591 *u++ = *w++; 592 } 593#endif 594 595 return (PyObject *)unicode; 596} 597 598static void 599makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 600{ 601 *fmt++ = '%'; 602 if (width) { 603 if (zeropad) 604 *fmt++ = '0'; 605 fmt += sprintf(fmt, "%d", width); 606 } 607 if (precision) 608 fmt += sprintf(fmt, ".%d", precision); 609 if (longflag) 610 *fmt++ = 'l'; 611 else if (size_tflag) { 612 char *f = PY_FORMAT_SIZE_T; 613 while (*f) 614 *fmt++ = *f++; 615 } 616 *fmt++ = c; 
617 *fmt = '\0'; 618} 619 620#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 621 622PyObject * 623PyUnicode_FromFormatV(const char *format, va_list vargs) 624{ 625 va_list count; 626 Py_ssize_t callcount = 0; 627 PyObject **callresults = NULL; 628 PyObject **callresult = NULL; 629 Py_ssize_t n = 0; 630 int width = 0; 631 int precision = 0; 632 int zeropad; 633 const char* f; 634 Py_UNICODE *s; 635 PyObject *string; 636 /* used by sprintf */ 637 char buffer[21]; 638 /* use abuffer instead of buffer, if we need more space 639 * (which can happen if there's a format specifier with width). */ 640 char *abuffer = NULL; 641 char *realbuffer; 642 Py_ssize_t abuffersize = 0; 643 char fmt[60]; /* should be enough for %0width.precisionld */ 644 const char *copy; 645 646#ifdef VA_LIST_IS_ARRAY 647 Py_MEMCPY(count, vargs, sizeof(va_list)); 648#else 649#ifdef __va_copy 650 __va_copy(count, vargs); 651#else 652 count = vargs; 653#endif 654#endif 655 /* step 1: count the number of %S/%R/%A format specifications 656 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for 657 * these objects once during step 3 and put the result in 658 an array) */ 659 for (f = format; *f; f++) { 660 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')) 661 ++callcount; 662 } 663 /* step 2: allocate memory for the results of 664 * PyObject_Str()/PyObject_Repr() calls */ 665 if (callcount) { 666 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 667 if (!callresults) { 668 PyErr_NoMemory(); 669 return NULL; 670 } 671 callresult = callresults; 672 } 673 /* step 3: figure out how large a buffer we need */ 674 for (f = format; *f; f++) { 675 if (*f == '%') { 676 const char* p = f; 677 width = 0; 678 while (ISDIGIT((unsigned)*f)) 679 width = (width*10) + *f++ - '0'; 680 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 681 ; 682 683 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 684 * they don't affect the amount of space we reserve. 
685 */ 686 if ((*f == 'l' || *f == 'z') && 687 (f[1] == 'd' || f[1] == 'u')) 688 ++f; 689 690 switch (*f) { 691 case 'c': 692 (void)va_arg(count, int); 693 /* fall through... */ 694 case '%': 695 n++; 696 break; 697 case 'd': case 'u': case 'i': case 'x': 698 (void) va_arg(count, int); 699 /* 20 bytes is enough to hold a 64-bit 700 integer. Decimal takes the most space. 701 This isn't enough for octal. 702 If a width is specified we need more 703 (which we allocate later). */ 704 if (width < 20) 705 width = 20; 706 n += width; 707 if (abuffersize < width) 708 abuffersize = width; 709 break; 710 case 's': 711 { 712 /* UTF-8 */ 713 unsigned char*s; 714 s = va_arg(count, unsigned char*); 715 while (*s) { 716 if (*s < 128) { 717 n++; s++; 718 } else if (*s < 0xc0) { 719 /* invalid UTF-8 */ 720 n++; s++; 721 } else if (*s < 0xc0) { 722 n++; 723 s++; if(!*s)break; 724 s++; 725 } else if (*s < 0xe0) { 726 n++; 727 s++; if(!*s)break; 728 s++; if(!*s)break; 729 s++; 730 } else { 731 #ifdef Py_UNICODE_WIDE 732 n++; 733 #else 734 n+=2; 735 #endif 736 s++; if(!*s)break; 737 s++; if(!*s)break; 738 s++; if(!*s)break; 739 s++; 740 } 741 } 742 break; 743 } 744 case 'U': 745 { 746 PyObject *obj = va_arg(count, PyObject *); 747 assert(obj && PyUnicode_Check(obj)); 748 n += PyUnicode_GET_SIZE(obj); 749 break; 750 } 751 case 'V': 752 { 753 PyObject *obj = va_arg(count, PyObject *); 754 const char *str = va_arg(count, const char *); 755 assert(obj || str); 756 assert(!obj || PyUnicode_Check(obj)); 757 if (obj) 758 n += PyUnicode_GET_SIZE(obj); 759 else 760 n += strlen(str); 761 break; 762 } 763 case 'S': 764 { 765 PyObject *obj = va_arg(count, PyObject *); 766 PyObject *str; 767 assert(obj); 768 str = PyObject_Str(obj); 769 if (!str) 770 goto fail; 771 n += PyUnicode_GET_SIZE(str); 772 /* Remember the str and switch to the next slot */ 773 *callresult++ = str; 774 break; 775 } 776 case 'R': 777 { 778 PyObject *obj = va_arg(count, PyObject *); 779 PyObject *repr; 780 assert(obj); 781 
repr = PyObject_Repr(obj); 782 if (!repr) 783 goto fail; 784 n += PyUnicode_GET_SIZE(repr); 785 /* Remember the repr and switch to the next slot */ 786 *callresult++ = repr; 787 break; 788 } 789 case 'A': 790 { 791 PyObject *obj = va_arg(count, PyObject *); 792 PyObject *ascii; 793 assert(obj); 794 ascii = PyObject_ASCII(obj); 795 if (!ascii) 796 goto fail; 797 n += PyUnicode_GET_SIZE(ascii); 798 /* Remember the repr and switch to the next slot */ 799 *callresult++ = ascii; 800 break; 801 } 802 case 'p': 803 (void) va_arg(count, int); 804 /* maximum 64-bit pointer representation: 805 * 0xffffffffffffffff 806 * so 19 characters is enough. 807 * XXX I count 18 -- what's the extra for? 808 */ 809 n += 19; 810 break; 811 default: 812 /* if we stumble upon an unknown 813 formatting code, copy the rest of 814 the format string to the output 815 string. (we cannot just skip the 816 code, since there's no way to know 817 what's in the argument list) */ 818 n += strlen(p); 819 goto expand; 820 } 821 } else 822 n++; 823 } 824 expand: 825 if (abuffersize > 20) { 826 abuffer = PyObject_Malloc(abuffersize); 827 if (!abuffer) { 828 PyErr_NoMemory(); 829 goto fail; 830 } 831 realbuffer = abuffer; 832 } 833 else 834 realbuffer = buffer; 835 /* step 4: fill the buffer */ 836 /* Since we've analyzed how much space we need for the worst case, 837 we don't have to resize the string. 838 There can be no errors beyond this point. 
*/ 839 string = PyUnicode_FromUnicode(NULL, n); 840 if (!string) 841 goto fail; 842 843 s = PyUnicode_AS_UNICODE(string); 844 callresult = callresults; 845 846 for (f = format; *f; f++) { 847 if (*f == '%') { 848 const char* p = f++; 849 int longflag = 0; 850 int size_tflag = 0; 851 zeropad = (*f == '0'); 852 /* parse the width.precision part */ 853 width = 0; 854 while (ISDIGIT((unsigned)*f)) 855 width = (width*10) + *f++ - '0'; 856 precision = 0; 857 if (*f == '.') { 858 f++; 859 while (ISDIGIT((unsigned)*f)) 860 precision = (precision*10) + *f++ - '0'; 861 } 862 /* handle the long flag, but only for %ld and %lu. 863 others can be added when necessary. */ 864 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 865 longflag = 1; 866 ++f; 867 } 868 /* handle the size_t flag. */ 869 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 870 size_tflag = 1; 871 ++f; 872 } 873 874 switch (*f) { 875 case 'c': 876 *s++ = va_arg(vargs, int); 877 break; 878 case 'd': 879 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 880 if (longflag) 881 sprintf(realbuffer, fmt, va_arg(vargs, long)); 882 else if (size_tflag) 883 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 884 else 885 sprintf(realbuffer, fmt, va_arg(vargs, int)); 886 appendstring(realbuffer); 887 break; 888 case 'u': 889 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 890 if (longflag) 891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 892 else if (size_tflag) 893 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 894 else 895 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 896 appendstring(realbuffer); 897 break; 898 case 'i': 899 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 900 sprintf(realbuffer, fmt, va_arg(vargs, int)); 901 appendstring(realbuffer); 902 break; 903 case 'x': 904 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 905 sprintf(realbuffer, fmt, va_arg(vargs, int)); 906 appendstring(realbuffer); 907 break; 908 case 's': 909 { 910 /* Parameter 
must be UTF-8 encoded. 911 In case of encoding errors, use 912 the replacement character. */ 913 PyObject *u; 914 p = va_arg(vargs, char*); 915 u = PyUnicode_DecodeUTF8(p, strlen(p), 916 "replace"); 917 if (!u) 918 goto fail; 919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 920 PyUnicode_GET_SIZE(u)); 921 s += PyUnicode_GET_SIZE(u); 922 Py_DECREF(u); 923 break; 924 } 925 case 'U': 926 { 927 PyObject *obj = va_arg(vargs, PyObject *); 928 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 929 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 930 s += size; 931 break; 932 } 933 case 'V': 934 { 935 PyObject *obj = va_arg(vargs, PyObject *); 936 const char *str = va_arg(vargs, const char *); 937 if (obj) { 938 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 939 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 940 s += size; 941 } else { 942 appendstring(str); 943 } 944 break; 945 } 946 case 'S': 947 case 'R': 948 { 949 Py_UNICODE *ucopy; 950 Py_ssize_t usize; 951 Py_ssize_t upos; 952 /* unused, since we already have the result */ 953 (void) va_arg(vargs, PyObject *); 954 ucopy = PyUnicode_AS_UNICODE(*callresult); 955 usize = PyUnicode_GET_SIZE(*callresult); 956 for (upos = 0; upos<usize;) 957 *s++ = ucopy[upos++]; 958 /* We're done with the unicode()/repr() => forget it */ 959 Py_DECREF(*callresult); 960 /* switch to next unicode()/repr() result */ 961 ++callresult; 962 break; 963 } 964 case 'p': 965 sprintf(buffer, "%p", va_arg(vargs, void*)); 966 /* %p is ill-defined: ensure leading 0x. 
*/ 967 if (buffer[1] == 'X') 968 buffer[1] = 'x'; 969 else if (buffer[1] != 'x') { 970 memmove(buffer+2, buffer, strlen(buffer)+1); 971 buffer[0] = '0'; 972 buffer[1] = 'x'; 973 } 974 appendstring(buffer); 975 break; 976 case '%': 977 *s++ = '%'; 978 break; 979 default: 980 appendstring(p); 981 goto end; 982 } 983 } else 984 *s++ = *f; 985 } 986 987 end: 988 if (callresults) 989 PyObject_Free(callresults); 990 if (abuffer) 991 PyObject_Free(abuffer); 992 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 993 return string; 994 fail: 995 if (callresults) { 996 PyObject **callresult2 = callresults; 997 while (callresult2 < callresult) { 998 Py_DECREF(*callresult2); 999 ++callresult2; 1000 } 1001 PyObject_Free(callresults); 1002 } 1003 if (abuffer) 1004 PyObject_Free(abuffer); 1005 return NULL; 1006} 1007 1008#undef appendstring 1009 1010PyObject * 1011PyUnicode_FromFormat(const char *format, ...) 1012{ 1013 PyObject* ret; 1014 va_list vargs; 1015 1016#ifdef HAVE_STDARG_PROTOTYPES 1017 va_start(vargs, format); 1018#else 1019 va_start(vargs); 1020#endif 1021 ret = PyUnicode_FromFormatV(format, vargs); 1022 va_end(vargs); 1023 return ret; 1024} 1025 1026Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1027 wchar_t *w, 1028 Py_ssize_t size) 1029{ 1030 if (unicode == NULL) { 1031 PyErr_BadInternalCall(); 1032 return -1; 1033 } 1034 1035 /* If possible, try to copy the 0-termination as well */ 1036 if (size > PyUnicode_GET_SIZE(unicode)) 1037 size = PyUnicode_GET_SIZE(unicode) + 1; 1038 1039#ifdef HAVE_USABLE_WCHAR_T 1040 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1041#else 1042 { 1043 register Py_UNICODE *u; 1044 register Py_ssize_t i; 1045 u = PyUnicode_AS_UNICODE(unicode); 1046 for (i = size; i > 0; i--) 1047 *w++ = *u++; 1048 } 1049#endif 1050 1051 if (size > PyUnicode_GET_SIZE(unicode)) 1052 return PyUnicode_GET_SIZE(unicode); 1053 else 1054 return size; 1055} 1056 1057#endif 1058 1059PyObject *PyUnicode_FromOrdinal(int ordinal) 1060{ 1061 
Py_UNICODE s[2]; 1062 1063 if (ordinal < 0 || ordinal > 0x10ffff) { 1064 PyErr_SetString(PyExc_ValueError, 1065 "chr() arg not in range(0x110000)"); 1066 return NULL; 1067 } 1068 1069#ifndef Py_UNICODE_WIDE 1070 if (ordinal > 0xffff) { 1071 ordinal -= 0x10000; 1072 s[0] = 0xD800 | (ordinal >> 10); 1073 s[1] = 0xDC00 | (ordinal & 0x3FF); 1074 return PyUnicode_FromUnicode(s, 2); 1075 } 1076#endif 1077 1078 s[0] = (Py_UNICODE)ordinal; 1079 return PyUnicode_FromUnicode(s, 1); 1080} 1081 1082PyObject *PyUnicode_FromObject(register PyObject *obj) 1083{ 1084 /* XXX Perhaps we should make this API an alias of 1085 PyObject_Str() instead ?! */ 1086 if (PyUnicode_CheckExact(obj)) { 1087 Py_INCREF(obj); 1088 return obj; 1089 } 1090 if (PyUnicode_Check(obj)) { 1091 /* For a Unicode subtype that's not a Unicode object, 1092 return a true Unicode object with the same data. */ 1093 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1094 PyUnicode_GET_SIZE(obj)); 1095 } 1096 PyErr_Format(PyExc_TypeError, 1097 "Can't convert '%.100s' object to str implicitly", 1098 Py_TYPE(obj)->tp_name); 1099 return NULL; 1100} 1101 1102PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1103 const char *encoding, 1104 const char *errors) 1105{ 1106 const char *s = NULL; 1107 Py_ssize_t len; 1108 PyObject *v; 1109 1110 if (obj == NULL) { 1111 PyErr_BadInternalCall(); 1112 return NULL; 1113 } 1114 1115 if (PyUnicode_Check(obj)) { 1116 PyErr_SetString(PyExc_TypeError, 1117 "decoding str is not supported"); 1118 return NULL; 1119 } 1120 1121 /* Coerce object */ 1122 if (PyBytes_Check(obj)) { 1123 s = PyBytes_AS_STRING(obj); 1124 len = PyBytes_GET_SIZE(obj); 1125 } 1126 else if (PyByteArray_Check(obj)) { 1127 s = PyByteArray_AS_STRING(obj); 1128 len = PyByteArray_GET_SIZE(obj); 1129 } 1130 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1131 /* Overwrite the error message with something more useful in 1132 case of a TypeError. 
*/ 1133 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1134 PyErr_Format(PyExc_TypeError, 1135 "coercing to str: need string or buffer, " 1136 "%.80s found", 1137 Py_TYPE(obj)->tp_name); 1138 goto onError; 1139 } 1140 1141 /* Convert to Unicode */ 1142 if (len == 0) { 1143 Py_INCREF(unicode_empty); 1144 v = (PyObject *)unicode_empty; 1145 } 1146 else 1147 v = PyUnicode_Decode(s, len, encoding, errors); 1148 1149 return v; 1150 1151 onError: 1152 return NULL; 1153} 1154 1155PyObject *PyUnicode_Decode(const char *s, 1156 Py_ssize_t size, 1157 const char *encoding, 1158 const char *errors) 1159{ 1160 PyObject *buffer = NULL, *unicode; 1161 Py_buffer info; 1162 char lower[20]; /* Enough for any encoding name we recognize */ 1163 char *l; 1164 const char *e; 1165 1166 if (encoding == NULL) 1167 encoding = PyUnicode_GetDefaultEncoding(); 1168 1169 /* Convert encoding to lower case and replace '_' with '-' in order to 1170 catch e.g. UTF_8 */ 1171 e = encoding; 1172 l = lower; 1173 while (*e && l < &lower[(sizeof lower) - 2]) { 1174 if (ISUPPER(*e)) { 1175 *l++ = TOLOWER(*e++); 1176 } 1177 else if (*e == '_') { 1178 *l++ = '-'; 1179 e++; 1180 } 1181 else { 1182 *l++ = *e++; 1183 } 1184 } 1185 *l = '\0'; 1186 1187 /* Shortcuts for common default encodings */ 1188 if (strcmp(lower, "utf-8") == 0) 1189 return PyUnicode_DecodeUTF8(s, size, errors); 1190 else if ((strcmp(lower, "latin-1") == 0) || 1191 (strcmp(lower, "iso-8859-1") == 0)) 1192 return PyUnicode_DecodeLatin1(s, size, errors); 1193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1194 else if (strcmp(lower, "mbcs") == 0) 1195 return PyUnicode_DecodeMBCS(s, size, errors); 1196#endif 1197 else if (strcmp(lower, "ascii") == 0) 1198 return PyUnicode_DecodeASCII(s, size, errors); 1199 else if (strcmp(lower, "utf-16") == 0) 1200 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1201 else if (strcmp(lower, "utf-32") == 0) 1202 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1203 1204 /* Decode via the codec 
registry */
    buffer = NULL;
    /* Expose the raw bytes read-only through a memoryview for the codec. */
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
        goto onError;
    buffer = PyMemoryView_FromBuffer(&info);
    if (buffer == NULL)
        goto onError;
    unicode = PyCodec_Decode(buffer, encoding, errors);
    if (unicode == NULL)
        goto onError;
    if (!PyUnicode_Check(unicode)) {
        PyErr_Format(PyExc_TypeError,
                     "decoder did not return a str object (type=%.400s)",
                     Py_TYPE(unicode)->tp_name);
        Py_DECREF(unicode);
        goto onError;
    }
    Py_DECREF(buffer);
    return unicode;

  onError:
    Py_XDECREF(buffer);
    return NULL;
}

/* Run a unicode object through the decoder registered for encoding.

   encoding -- codec name; NULL selects PyUnicode_GetDefaultEncoding()
   errors   -- error handler name, passed through to PyCodec_Decode()

   The codec's result is returned without a type check.  A non-unicode
   argument reports PyErr_BadArgument().  Returns NULL on failure. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    return v;

  onError:
    return NULL;
}

/* Same as PyUnicode_AsDecodedObject(), but the codec must return a
   unicode object; any other result type is released and TypeError is
   set.  Returns NULL on failure. */
PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
                                     const char *encoding,
                                     const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyUnicode_Check(v)) {
        PyErr_Format(PyExc_TypeError,
                     "decoder did not return a str object (type=%.400s)",
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return v;

  onError:
    return NULL;
}

/* Encode a Py_UNICODE buffer of the given size: wraps the buffer in a
   temporary unicode object and delegates to PyUnicode_AsEncodedString(). */
PyObject *PyUnicode_Encode(const Py_UNICODE *s,
                           Py_ssize_t size,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v, *unicode;

    unicode = PyUnicode_FromUnicode(s, size);
    if (unicode == NULL)
        return NULL;
    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
    /* The temporary wrapper is no longer needed, whatever the outcome. */
    Py_DECREF(unicode);
    return v;
}

/* Run a unicode object through the encoder registered for encoding.

   encoding -- codec name; NULL selects PyUnicode_GetDefaultEncoding()
   errors   -- error handler name, passed through to PyCodec_Encode()

   The codec's result is returned without a type check.  A non-unicode
   argument reports PyErr_BadArgument().  Returns NULL on failure. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    return v;

  onError:
    return NULL;
}

/* Encode a unicode object and return the result as a bytes object.

   encoding -- codec name; NULL selects PyUnicode_GetDefaultEncoding()
   errors   -- error handler name; the built-in shortcuts below are only
               taken when this is NULL

   Note that the shortcut names are compared verbatim with strcmp(), so
   only the exact spellings listed here bypass the codec registry. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (errors == NULL) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        else if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        else if (strcmp(encoding, "mbcs") == 0)
            return PyUnicode_AsMBCSString(unicode);
#endif
        else if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
        /* During bootstrap, we may need to find the encodings
           package, to load the file system encoding, and require the
           file system encoding in order to load the encodings
           package.

           Break out of this dependency by assuming that the path to
           the encodings module is ASCII-only.  XXX could try wcstombs
           instead, if the file system encoding is the locale's
           encoding. 
*/
        else if (Py_FileSystemDefaultEncoding &&
                 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
                 !PyThreadState_GET()->interp->codecs_initialized)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    /* The normal path */
    if (PyBytes_Check(v))
        return v;

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyByteArray_Check(v)) {
        char msg[100];
        PyObject *b;
        PyOS_snprintf(msg, sizeof(msg),
                      "encoder %s returned buffer instead of bytes",
                      encoding);
        /* The warning may have been turned into an exception. */
        if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
            Py_DECREF(v);
            return NULL;
        }

        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
        Py_DECREF(v);
        return b;
    }

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a bytes object (type=%.400s)",
                 Py_TYPE(v)->tp_name);
    Py_DECREF(v);
    return NULL;
}

/* Run a unicode object through the encoder registered for encoding and
   require the result to be a unicode object as well.

   encoding -- codec name; NULL selects PyUnicode_GetDefaultEncoding()
   errors   -- error handler name, passed through to PyCodec_Encode()

   Any other result type is released and TypeError is set.
   Returns NULL on failure. */
PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
                                     const char *encoding,
                                     const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyUnicode_Check(v)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder did not return an str object (type=%.400s)",
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return v;

  onError:
    return NULL;
}

/* Return the UTF-8 encoded form of a unicode object, caching it in the
   object's defenc slot.

   The result is not INCREF'ed for the caller: the only reference taken
   here is the one stored in defenc.  errors must be NULL; anything else
   is a fatal error.  unicode is cast without a type check. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
    if (v)
        return v;
    if (errors != 
NULL) 1435 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1436 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1437 PyUnicode_GET_SIZE(unicode), 1438 NULL); 1439 if (!v) 1440 return NULL; 1441 ((PyUnicodeObject *)unicode)->defenc = v; 1442 return v; 1443} 1444 1445PyObject* 1446PyUnicode_DecodeFSDefault(const char *s) { 1447 Py_ssize_t size = (Py_ssize_t)strlen(s); 1448 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1449} 1450 1451PyObject* 1452PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1453{ 1454 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1455 can be undefined. If it is case, decode using UTF-8. The following assumes 1456 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1457 bootstrapping process where the codecs aren't ready yet. 1458 */ 1459 if (Py_FileSystemDefaultEncoding) { 1460#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1461 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1462 return PyUnicode_DecodeMBCS(s, size, "replace"); 1463 } 1464#elif defined(__APPLE__) 1465 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1466 return PyUnicode_DecodeUTF8(s, size, "replace"); 1467 } 1468#endif 1469 return PyUnicode_Decode(s, size, 1470 Py_FileSystemDefaultEncoding, 1471 "replace"); 1472 } 1473 else { 1474 return PyUnicode_DecodeUTF8(s, size, "replace"); 1475 } 1476} 1477 1478char* 1479_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1480{ 1481 PyObject *bytes; 1482 if (!PyUnicode_Check(unicode)) { 1483 PyErr_BadArgument(); 1484 return NULL; 1485 } 1486 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1487 if (bytes == NULL) 1488 return NULL; 1489 if (psize != NULL) 1490 *psize = PyBytes_GET_SIZE(bytes); 1491 return PyBytes_AS_STRING(bytes); 1492} 1493 1494char* 1495_PyUnicode_AsString(PyObject *unicode) 1496{ 1497 return _PyUnicode_AsStringAndSize(unicode, NULL); 1498} 1499 1500Py_UNICODE 
*PyUnicode_AsUnicode(PyObject *unicode)
{
    /* Returns the object's internal Py_UNICODE buffer (no copy is made
       here), or NULL with PyErr_BadArgument() for a non-unicode object. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }
    return PyUnicode_AS_UNICODE(unicode);

  onError:
    return NULL;
}

/* Return the size of a unicode object as reported by
   PyUnicode_GET_SIZE(), or -1 with PyErr_BadArgument() for a
   non-unicode object. */
Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }
    return PyUnicode_GET_SIZE(unicode);

  onError:
    return -1;
}

/* Return the name of the default encoding (a pointer to the file-level
   unicode_default_encoding string, not a copy). */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}

/* The default encoding cannot actually be changed: succeed (0) only when
   encoding equals the current default, otherwise set ValueError and
   return -1.  encoding must not be NULL (it is passed to strcmp()). */
int PyUnicode_SetDefaultEncoding(const char *encoding)
{
    if (strcmp(encoding, unicode_default_encoding) != 0) {
        PyErr_Format(PyExc_ValueError,
                     "Can only set default encoding to %s",
                     unicode_default_encoding);
        return -1;
    }
    return 0;
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables. 
1544 return 0 on success, -1 on error 1545*/ 1546 1547static 1548int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1549 const char *encoding, const char *reason, 1550 const char **input, const char **inend, Py_ssize_t *startinpos, 1551 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1552 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1553{ 1554 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 1555 1556 PyObject *restuple = NULL; 1557 PyObject *repunicode = NULL; 1558 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1559 Py_ssize_t insize; 1560 Py_ssize_t requiredsize; 1561 Py_ssize_t newpos; 1562 Py_UNICODE *repptr; 1563 PyObject *inputobj = NULL; 1564 Py_ssize_t repsize; 1565 int res = -1; 1566 1567 if (*errorHandler == NULL) { 1568 *errorHandler = PyCodec_LookupError(errors); 1569 if (*errorHandler == NULL) 1570 goto onError; 1571 } 1572 1573 if (*exceptionObject == NULL) { 1574 *exceptionObject = PyUnicodeDecodeError_Create( 1575 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1576 if (*exceptionObject == NULL) 1577 goto onError; 1578 } 1579 else { 1580 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1581 goto onError; 1582 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1583 goto onError; 1584 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1585 goto onError; 1586 } 1587 1588 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1589 if (restuple == NULL) 1590 goto onError; 1591 if (!PyTuple_Check(restuple)) { 1592 PyErr_Format(PyExc_TypeError, &argparse[4]); 1593 goto onError; 1594 } 1595 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1596 goto onError; 1597 1598 /* Copy back the bytes variables, which might have been modified by the 1599 callback */ 1600 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1601 if (!inputobj) 
1602 goto onError; 1603 if (!PyBytes_Check(inputobj)) { 1604 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1605 } 1606 *input = PyBytes_AS_STRING(inputobj); 1607 insize = PyBytes_GET_SIZE(inputobj); 1608 *inend = *input + insize; 1609 /* we can DECREF safely, as the exception has another reference, 1610 so the object won't go away. */ 1611 Py_DECREF(inputobj); 1612 1613 if (newpos<0) 1614 newpos = insize+newpos; 1615 if (newpos<0 || newpos>insize) { 1616 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1617 goto onError; 1618 } 1619 1620 /* need more space? (at least enough for what we 1621 have+the replacement+the rest of the string (starting 1622 at the new input position), so we won't have to check space 1623 when there are no errors in the rest of the string) */ 1624 repptr = PyUnicode_AS_UNICODE(repunicode); 1625 repsize = PyUnicode_GET_SIZE(repunicode); 1626 requiredsize = *outpos + repsize + insize-newpos; 1627 if (requiredsize > outsize) { 1628 if (requiredsize<2*outsize) 1629 requiredsize = 2*outsize; 1630 if (PyUnicode_Resize(output, requiredsize) < 0) 1631 goto onError; 1632 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1633 } 1634 *endinpos = newpos; 1635 *inptr = *input + newpos; 1636 Py_UNICODE_COPY(*outptr, repptr, repsize); 1637 *outptr += repsize; 1638 *outpos += repsize; 1639 1640 /* we made it! */ 1641 res = 0; 1642 1643 onError: 1644 Py_XDECREF(restuple); 1645 return res; 1646} 1647 1648/* --- UTF-7 Codec -------------------------------------------------------- */ 1649 1650/* see RFC2152 for details */ 1651 1652static 1653char utf7_special[128] = { 1654 /* indicate whether a UTF-7 character is special i.e. 
cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* SPECIAL: true when c must be base64-encoded.  Anything above 127 or
   flagged 1 in utf7_special always is; whitespace (2) and Set O (3) only
   when the corresponding encodeWS / encodeO flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64: base64 digit for the low 6 bits of n.
   B64CHAR: true when c is a base64 digit.
   UB64: value of base64 digit c (caller must have checked B64CHAR). */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: emit base64 digits to out while at least 6 bits are pending in
   ch; bits is decremented accordingly.  out and bits are modified. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE: emit 16-bit units to out while at least 16 bits are pending in
   ch.  Uses the caller's errmsg variable and utf7Error label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. 
Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }

/* Decode size bytes of UTF-7 at s; stateless wrapper that passes
   consumed == NULL to PyUnicode_DecodeUTF7Stateful(). */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* Decode size bytes of UTF-7 at s into a unicode object.

   errors   -- error handler name used for malformed input
   consumed -- if not NULL, receives the number of input bytes consumed;
               an unterminated shift sequence at the end is then not an
               error and is reported as not consumed

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                /* inside a '+...' base64 run */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* pending decoded bits */
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. 
*/

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    *p++ = ch;
                }
            } else {
                /* Another base64 digit: accumulate its 6 bits. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the shift sequence starts for error reports
               and for the stateful "consumed" result. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" is the escape for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            *p++ = ch;
            s++;
        }
        continue;
      utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* Stateless decoding: input that ends inside a shift sequence is an
       error.  The handler may move s back into the input, in which case
       decoding resumes inside the loop body. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        if (s < e)
            goto restart;
    }
    if (consumed) {
        /* An open shift sequence is reported as not yet consumed. */
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Shrink the result to the number of units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

Py_XDECREF(errorHandler); 1857 Py_XDECREF(exc); 1858 return (PyObject *)unicode; 1859 1860onError: 1861 Py_XDECREF(errorHandler); 1862 Py_XDECREF(exc); 1863 Py_DECREF(unicode); 1864 return NULL; 1865} 1866 1867 1868PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1869 Py_ssize_t size, 1870 int encodeSetO, 1871 int encodeWhiteSpace, 1872 const char *errors) 1873{ 1874 PyObject *v, *result; 1875 /* It might be possible to tighten this worst case */ 1876 Py_ssize_t cbAllocated = 5 * size; 1877 int inShift = 0; 1878 Py_ssize_t i = 0; 1879 unsigned int bitsleft = 0; 1880 unsigned long charsleft = 0; 1881 char * out; 1882 char * start; 1883 1884 if (size == 0) 1885 return PyBytes_FromStringAndSize(NULL, 0); 1886 1887 if (cbAllocated / 5 != size) 1888 return PyErr_NoMemory(); 1889 1890 v = PyByteArray_FromStringAndSize(NULL, cbAllocated); 1891 if (v == NULL) 1892 return NULL; 1893 1894 start = out = PyByteArray_AS_STRING(v); 1895 for (;i < size; ++i) { 1896 Py_UNICODE ch = s[i]; 1897 1898 if (!inShift) { 1899 if (ch == '+') { 1900 *out++ = '+'; 1901 *out++ = '-'; 1902 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1903 charsleft = ch; 1904 bitsleft = 16; 1905 *out++ = '+'; 1906 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1907 inShift = bitsleft > 0; 1908 } else { 1909 *out++ = (char) ch; 1910 } 1911 } else { 1912 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1913 *out++ = B64(charsleft << (6-bitsleft)); 1914 charsleft = 0; 1915 bitsleft = 0; 1916 /* Characters not in the BASE64 set implicitly unshift the sequence 1917 so no '-' is required, except if the character is itself a '-' */ 1918 if (B64CHAR(ch) || ch == '-') { 1919 *out++ = '-'; 1920 } 1921 inShift = 0; 1922 *out++ = (char) ch; 1923 } else { 1924 bitsleft += 16; 1925 charsleft = (charsleft << 16) | ch; 1926 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1927 1928 /* If the next character is special then we dont' need to terminate 1929 the shift sequence. 
If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                            /* Stay in the shift sequence. */

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        /* End of input: close the sequence explicitly. */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }
    /* Flush bits still pending at the end of the input and close the
       shift sequence. */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* Copy the bytes actually produced into an exact-size bytes object. */
    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
    Py_DECREF(v);
    return result;
}

#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE

/* --- UTF-8 Codec -------------------------------------------------------- */

static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix. 
see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/* Decode size bytes of UTF-8 at s; stateless wrapper that passes
   consumed == NULL to PyUnicode_DecodeUTF8Stateful(). */
PyObject *PyUnicode_DecodeUTF8(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* Decode size bytes of UTF-8 at s into a unicode object.

   errors   -- error handler name used for malformed input
   consumed -- if not NULL, receives the number of input bytes consumed;
               a truncated sequence at the end of the input is then left
               undecoded instead of being reported as an error

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: ASCII bytes map one to one. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

n = utf8_code_length[ch];

        /* Sequence would run past the end of the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* Byte that cannot start a sequence. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* ASCII was handled above, so a length of 1 cannot occur here. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject overlong forms of values below 0x80. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences;

                   XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit. 
*/
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

      utf8Error:
        /* The handler appends the replacement and repositions s and p. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject 
*)unicode; 2182 2183onError: 2184 Py_XDECREF(errorHandler); 2185 Py_XDECREF(exc); 2186 Py_DECREF(unicode); 2187 return NULL; 2188} 2189 2190/* Allocation strategy: if the string is short, convert into a stack buffer 2191 and allocate exactly as much space needed at the end. Else allocate the 2192 maximum possible needed (4 result bytes per Unicode character), and return 2193 the excess memory at the end. 2194*/ 2195PyObject * 2196PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2197 Py_ssize_t size, 2198 const char *errors) 2199{ 2200#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2201 2202 Py_ssize_t i; /* index into s of next input byte */ 2203 PyObject *result; /* result string object */ 2204 char *p; /* next free byte in output buffer */ 2205 Py_ssize_t nallocated; /* number of result bytes allocated */ 2206 Py_ssize_t nneeded; /* number of result bytes needed */ 2207 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2208 2209 assert(s != NULL); 2210 assert(size >= 0); 2211 2212 if (size <= MAX_SHORT_UNICHARS) { 2213 /* Write into the stack buffer; nallocated can't overflow. 2214 * At the end, we'll allocate exactly as much heap space as it 2215 * turns out we need. 2216 */ 2217 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2218 result = NULL; /* will allocate after we're done */ 2219 p = stackbuf; 2220 } 2221 else { 2222 /* Overallocate on the heap, and give the excess back at the end. */ 2223 nallocated = size * 4; 2224 if (nallocated / 4 != size) /* overflow! 
*/
            return PyErr_NoMemory();
        result = PyBytes_FromStringAndSize(NULL, nallocated);
        if (result == NULL)
            return NULL;
        p = PyBytes_AS_STRING(result);
    }

    for (i = 0; i < size;) {
        Py_UCS4 ch = s[i++];

        if (ch < 0x80)
            /* Encode ASCII */
            *p++ = (char) ch;

        else if (ch < 0x0800) {
            /* Encode Latin-1 */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {
            /* Encode UCS2 Unicode ordinals */
            if (ch < 0x10000) {
                /* Special case: check for high surrogate */
                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                    Py_UCS4 ch2 = s[i];
                    /* Check for low surrogate and combine the two to
                       form a UCS4 value */
                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
                        i++;
                        goto encodeUCS4;
                    }
                    /* Fall through: handles isolated high surrogates */
                }
                /* Three-byte form (also used for unpaired surrogates). */
                *p++ = (char)(0xe0 | (ch >> 12));
                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                *p++ = (char)(0x80 | (ch & 0x3f));
                continue;
            }
          encodeUCS4:
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }

    if (result == NULL) {
        /* This was stack allocated. */
        nneeded = p - stackbuf;
        assert(nneeded <= nallocated);
        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
    }
    else {
        /* Cut back to size actually needed. 
*/
        nneeded = p - PyBytes_AS_STRING(result);
        assert(nneeded <= nallocated);
        /* The return value is not checked here; result is returned
           as-is afterwards.  NOTE(review): presumably result is set to
           NULL when the resize fails -- confirm against _PyBytes_Resize. */
        _PyBytes_Resize(&result, nneeded);
    }
    return result;

#undef MAX_SHORT_UNICHARS
}

/* Encode a unicode object as UTF-8 and return a new bytes object.
   A non-unicode argument reports PyErr_BadArgument() and returns NULL. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

/* --- UTF-32 Codec ------------------------------------------------------- */

/* Decode size bytes of UTF-32 at s; stateless wrapper that passes
   consumed == NULL to PyUnicode_DecodeUTF32Stateful(). */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}

/* Decode size bytes of UTF-32 at s into a unicode object.

   errors    -- error handler name used for malformed input
   byteorder -- if not NULL, *byteorder selects the byte order on entry
                (-1 little endian, 1 big endian, 0 native with BOM
                detection) and receives the byte order used on exit
   consumed  -- if not NULL, receives the number of input bytes consumed;
                trailing bytes that do not form a full 4-byte unit are
                then left undecoded instead of being an error

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
    int i, pairs;
#else
    const int pairs = 0;
#endif
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving bytes in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int iorder[] = {0, 1, 2, 3};
#else
    int iorder[] = {3, 2, 1, 0};
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* On narrow builds we split characters outside the BMP into two
       codepoints => count how much extra space we need. 
*/ 2343#ifndef Py_UNICODE_WIDE 2344 for (i = pairs = 0; i < size/4; i++) 2345 if (((Py_UCS4 *)s)[i] >= 0x10000) 2346 pairs++; 2347#endif 2348 2349 /* This might be one to much, because of a BOM */ 2350 unicode = _PyUnicode_New((size+3)/4+pairs); 2351 if (!unicode) 2352 return NULL; 2353 if (size == 0) 2354 return (PyObject *)unicode; 2355 2356 /* Unpack UTF-32 encoded data */ 2357 p = unicode->str; 2358 q = (unsigned char *)s; 2359 e = q + size; 2360 2361 if (byteorder) 2362 bo = *byteorder; 2363 2364 /* Check for BOM marks (U+FEFF) in the input and adjust current 2365 byte order setting accordingly. In native mode, the leading BOM 2366 mark is skipped, in all other modes, it is copied to the output 2367 stream as-is (giving a ZWNBSP character). */ 2368 if (bo == 0) { 2369 if (size >= 4) { 2370 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2371 (q[iorder[1]] << 8) | q[iorder[0]]; 2372#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2373 if (bom == 0x0000FEFF) { 2374 q += 4; 2375 bo = -1; 2376 } 2377 else if (bom == 0xFFFE0000) { 2378 q += 4; 2379 bo = 1; 2380 } 2381#else 2382 if (bom == 0x0000FEFF) { 2383 q += 4; 2384 bo = 1; 2385 } 2386 else if (bom == 0xFFFE0000) { 2387 q += 4; 2388 bo = -1; 2389 } 2390#endif 2391 } 2392 } 2393 2394 if (bo == -1) { 2395 /* force LE */ 2396 iorder[0] = 0; 2397 iorder[1] = 1; 2398 iorder[2] = 2; 2399 iorder[3] = 3; 2400 } 2401 else if (bo == 1) { 2402 /* force BE */ 2403 iorder[0] = 3; 2404 iorder[1] = 2; 2405 iorder[2] = 1; 2406 iorder[3] = 0; 2407 } 2408 2409 while (q < e) { 2410 Py_UCS4 ch; 2411 /* remaining bytes at the end? 
(size should be divisible by 4) */ 2412 if (e-q<4) { 2413 if (consumed) 2414 break; 2415 errmsg = "truncated data"; 2416 startinpos = ((const char *)q)-starts; 2417 endinpos = ((const char *)e)-starts; 2418 goto utf32Error; 2419 /* The remaining input chars are ignored if the callback 2420 chooses to skip the input */ 2421 } 2422 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2423 (q[iorder[1]] << 8) | q[iorder[0]]; 2424 2425 if (ch >= 0x110000) 2426 { 2427 errmsg = "codepoint not in range(0x110000)"; 2428 startinpos = ((const char *)q)-starts; 2429 endinpos = startinpos+4; 2430 goto utf32Error; 2431 } 2432#ifndef Py_UNICODE_WIDE 2433 if (ch >= 0x10000) 2434 { 2435 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2436 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2437 } 2438 else 2439#endif 2440 *p++ = ch; 2441 q += 4; 2442 continue; 2443 utf32Error: 2444 outpos = p-PyUnicode_AS_UNICODE(unicode); 2445 if (unicode_decode_call_errorhandler( 2446 errors, &errorHandler, 2447 "utf32", errmsg, 2448 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2449 (PyObject **)&unicode, &outpos, &p)) 2450 goto onError; 2451 } 2452 2453 if (byteorder) 2454 *byteorder = bo; 2455 2456 if (consumed) 2457 *consumed = (const char *)q-starts; 2458 2459 /* Adjust length */ 2460 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2461 goto onError; 2462 2463 Py_XDECREF(errorHandler); 2464 Py_XDECREF(exc); 2465 return (PyObject *)unicode; 2466 2467onError: 2468 Py_DECREF(unicode); 2469 Py_XDECREF(errorHandler); 2470 Py_XDECREF(exc); 2471 return NULL; 2472} 2473 2474PyObject * 2475PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2476 Py_ssize_t size, 2477 const char *errors, 2478 int byteorder) 2479{ 2480 PyObject *v, *result; 2481 unsigned char *p; 2482 Py_ssize_t nsize, bytesize; 2483#ifndef Py_UNICODE_WIDE 2484 Py_ssize_t i, pairs; 2485#else 2486 const int pairs = 0; 2487#endif 2488 /* Offsets from p for storing byte pairs in the right order. 
*/ 2489#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2490 int iorder[] = {0, 1, 2, 3}; 2491#else 2492 int iorder[] = {3, 2, 1, 0}; 2493#endif 2494 2495#define STORECHAR(CH) \ 2496 do { \ 2497 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2498 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2499 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2500 p[iorder[0]] = (CH) & 0xff; \ 2501 p += 4; \ 2502 } while(0) 2503 2504 /* In narrow builds we can output surrogate pairs as one codepoint, 2505 so we need less space. */ 2506#ifndef Py_UNICODE_WIDE 2507 for (i = pairs = 0; i < size-1; i++) 2508 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2509 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2510 pairs++; 2511#endif 2512 nsize = (size - pairs + (byteorder == 0)); 2513 bytesize = nsize * 4; 2514 if (bytesize / 4 != nsize) 2515 return PyErr_NoMemory(); 2516 v = PyByteArray_FromStringAndSize(NULL, bytesize); 2517 if (v == NULL) 2518 return NULL; 2519 2520 p = (unsigned char *)PyByteArray_AS_STRING(v); 2521 if (byteorder == 0) 2522 STORECHAR(0xFEFF); 2523 if (size == 0) 2524 goto done; 2525 2526 if (byteorder == -1) { 2527 /* force LE */ 2528 iorder[0] = 0; 2529 iorder[1] = 1; 2530 iorder[2] = 2; 2531 iorder[3] = 3; 2532 } 2533 else if (byteorder == 1) { 2534 /* force BE */ 2535 iorder[0] = 3; 2536 iorder[1] = 2; 2537 iorder[2] = 1; 2538 iorder[3] = 0; 2539 } 2540 2541 while (size-- > 0) { 2542 Py_UCS4 ch = *s++; 2543#ifndef Py_UNICODE_WIDE 2544 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2545 Py_UCS4 ch2 = *s; 2546 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2547 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2548 s++; 2549 size--; 2550 } 2551 } 2552#endif 2553 STORECHAR(ch); 2554 } 2555 2556 done: 2557 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2558 Py_DECREF(v); 2559 return result; 2560#undef STORECHAR 2561} 2562 2563PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2564{ 2565 if (!PyUnicode_Check(unicode)) { 2566 PyErr_BadArgument(); 2567 return NULL; 2568 } 2569 return 
PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2570 PyUnicode_GET_SIZE(unicode), 2571 NULL, 2572 0); 2573} 2574 2575/* --- UTF-16 Codec ------------------------------------------------------- */ 2576 2577PyObject * 2578PyUnicode_DecodeUTF16(const char *s, 2579 Py_ssize_t size, 2580 const char *errors, 2581 int *byteorder) 2582{ 2583 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2584} 2585 2586PyObject * 2587PyUnicode_DecodeUTF16Stateful(const char *s, 2588 Py_ssize_t size, 2589 const char *errors, 2590 int *byteorder, 2591 Py_ssize_t *consumed) 2592{ 2593 const char *starts = s; 2594 Py_ssize_t startinpos; 2595 Py_ssize_t endinpos; 2596 Py_ssize_t outpos; 2597 PyUnicodeObject *unicode; 2598 Py_UNICODE *p; 2599 const unsigned char *q, *e; 2600 int bo = 0; /* assume native ordering by default */ 2601 const char *errmsg = ""; 2602 /* Offsets from q for retrieving byte pairs in the right order. */ 2603#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2604 int ihi = 1, ilo = 0; 2605#else 2606 int ihi = 0, ilo = 1; 2607#endif 2608 PyObject *errorHandler = NULL; 2609 PyObject *exc = NULL; 2610 2611 /* Note: size will always be longer than the resulting Unicode 2612 character count */ 2613 unicode = _PyUnicode_New(size); 2614 if (!unicode) 2615 return NULL; 2616 if (size == 0) 2617 return (PyObject *)unicode; 2618 2619 /* Unpack UTF-16 encoded data */ 2620 p = unicode->str; 2621 q = (unsigned char *)s; 2622 e = q + size; 2623 2624 if (byteorder) 2625 bo = *byteorder; 2626 2627 /* Check for BOM marks (U+FEFF) in the input and adjust current 2628 byte order setting accordingly. In native mode, the leading BOM 2629 mark is skipped, in all other modes, it is copied to the output 2630 stream as-is (giving a ZWNBSP character). 
*/ 2631 if (bo == 0) { 2632 if (size >= 2) { 2633 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2634#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2635 if (bom == 0xFEFF) { 2636 q += 2; 2637 bo = -1; 2638 } 2639 else if (bom == 0xFFFE) { 2640 q += 2; 2641 bo = 1; 2642 } 2643#else 2644 if (bom == 0xFEFF) { 2645 q += 2; 2646 bo = 1; 2647 } 2648 else if (bom == 0xFFFE) { 2649 q += 2; 2650 bo = -1; 2651 } 2652#endif 2653 } 2654 } 2655 2656 if (bo == -1) { 2657 /* force LE */ 2658 ihi = 1; 2659 ilo = 0; 2660 } 2661 else if (bo == 1) { 2662 /* force BE */ 2663 ihi = 0; 2664 ilo = 1; 2665 } 2666 2667 while (q < e) { 2668 Py_UNICODE ch; 2669 /* remaining bytes at the end? (size should be even) */ 2670 if (e-q<2) { 2671 if (consumed) 2672 break; 2673 errmsg = "truncated data"; 2674 startinpos = ((const char *)q)-starts; 2675 endinpos = ((const char *)e)-starts; 2676 goto utf16Error; 2677 /* The remaining input chars are ignored if the callback 2678 chooses to skip the input */ 2679 } 2680 ch = (q[ihi] << 8) | q[ilo]; 2681 2682 q += 2; 2683 2684 if (ch < 0xD800 || ch > 0xDFFF) { 2685 *p++ = ch; 2686 continue; 2687 } 2688 2689 /* UTF-16 code pair: */ 2690 if (q >= e) { 2691 errmsg = "unexpected end of data"; 2692 startinpos = (((const char *)q)-2)-starts; 2693 endinpos = ((const char *)e)-starts; 2694 goto utf16Error; 2695 } 2696 if (0xD800 <= ch && ch <= 0xDBFF) { 2697 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2698 q += 2; 2699 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2700#ifndef Py_UNICODE_WIDE 2701 *p++ = ch; 2702 *p++ = ch2; 2703#else 2704 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2705#endif 2706 continue; 2707 } 2708 else { 2709 errmsg = "illegal UTF-16 surrogate"; 2710 startinpos = (((const char *)q)-4)-starts; 2711 endinpos = startinpos+2; 2712 goto utf16Error; 2713 } 2714 2715 } 2716 errmsg = "illegal encoding"; 2717 startinpos = (((const char *)q)-2)-starts; 2718 endinpos = startinpos+2; 2719 /* Fall through to report the error */ 2720 2721 utf16Error: 2722 outpos = 
p-PyUnicode_AS_UNICODE(unicode); 2723 if (unicode_decode_call_errorhandler( 2724 errors, &errorHandler, 2725 "utf16", errmsg, 2726 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2727 (PyObject **)&unicode, &outpos, &p)) 2728 goto onError; 2729 } 2730 2731 if (byteorder) 2732 *byteorder = bo; 2733 2734 if (consumed) 2735 *consumed = (const char *)q-starts; 2736 2737 /* Adjust length */ 2738 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2739 goto onError; 2740 2741 Py_XDECREF(errorHandler); 2742 Py_XDECREF(exc); 2743 return (PyObject *)unicode; 2744 2745onError: 2746 Py_DECREF(unicode); 2747 Py_XDECREF(errorHandler); 2748 Py_XDECREF(exc); 2749 return NULL; 2750} 2751 2752PyObject * 2753PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2754 Py_ssize_t size, 2755 const char *errors, 2756 int byteorder) 2757{ 2758 PyObject *v, *result; 2759 unsigned char *p; 2760 Py_ssize_t nsize, bytesize; 2761#ifdef Py_UNICODE_WIDE 2762 Py_ssize_t i, pairs; 2763#else 2764 const int pairs = 0; 2765#endif 2766 /* Offsets from p for storing byte pairs in the right order. 
*/ 2767#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2768 int ihi = 1, ilo = 0; 2769#else 2770 int ihi = 0, ilo = 1; 2771#endif 2772 2773#define STORECHAR(CH) \ 2774 do { \ 2775 p[ihi] = ((CH) >> 8) & 0xff; \ 2776 p[ilo] = (CH) & 0xff; \ 2777 p += 2; \ 2778 } while(0) 2779 2780#ifdef Py_UNICODE_WIDE 2781 for (i = pairs = 0; i < size; i++) 2782 if (s[i] >= 0x10000) 2783 pairs++; 2784#endif 2785 /* 2 * (size + pairs + (byteorder == 0)) */ 2786 if (size > PY_SSIZE_T_MAX || 2787 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 2788 return PyErr_NoMemory(); 2789 nsize = size + pairs + (byteorder == 0); 2790 bytesize = nsize * 2; 2791 if (bytesize / 2 != nsize) 2792 return PyErr_NoMemory(); 2793 v = PyByteArray_FromStringAndSize(NULL, bytesize); 2794 if (v == NULL) 2795 return NULL; 2796 2797 p = (unsigned char *)PyByteArray_AS_STRING(v); 2798 if (byteorder == 0) 2799 STORECHAR(0xFEFF); 2800 if (size == 0) 2801 goto done; 2802 2803 if (byteorder == -1) { 2804 /* force LE */ 2805 ihi = 1; 2806 ilo = 0; 2807 } 2808 else if (byteorder == 1) { 2809 /* force BE */ 2810 ihi = 0; 2811 ilo = 1; 2812 } 2813 2814 while (size-- > 0) { 2815 Py_UNICODE ch = *s++; 2816 Py_UNICODE ch2 = 0; 2817#ifdef Py_UNICODE_WIDE 2818 if (ch >= 0x10000) { 2819 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2820 ch = 0xD800 | ((ch-0x10000) >> 10); 2821 } 2822#endif 2823 STORECHAR(ch); 2824 if (ch2) 2825 STORECHAR(ch2); 2826 } 2827 2828 done: 2829 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2830 Py_DECREF(v); 2831 return result; 2832#undef STORECHAR 2833} 2834 2835PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2836{ 2837 if (!PyUnicode_Check(unicode)) { 2838 PyErr_BadArgument(); 2839 return NULL; 2840 } 2841 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2842 PyUnicode_GET_SIZE(unicode), 2843 NULL, 2844 0); 2845} 2846 2847/* --- Unicode Escape Codec ----------------------------------------------- */ 2848 2849static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2850 
2851PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2852 Py_ssize_t size, 2853 const char *errors) 2854{ 2855 const char *starts = s; 2856 Py_ssize_t startinpos; 2857 Py_ssize_t endinpos; 2858 Py_ssize_t outpos; 2859 int i; 2860 PyUnicodeObject *v; 2861 Py_UNICODE *p; 2862 const char *end; 2863 char* message; 2864 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2865 PyObject *errorHandler = NULL; 2866 PyObject *exc = NULL; 2867 2868 /* Escaped strings will always be longer than the resulting 2869 Unicode string, so we start with size here and then reduce the 2870 length after conversion to the true value. 2871 (but if the error callback returns a long replacement string 2872 we'll have to allocate more space) */ 2873 v = _PyUnicode_New(size); 2874 if (v == NULL) 2875 goto onError; 2876 if (size == 0) 2877 return (PyObject *)v; 2878 2879 p = PyUnicode_AS_UNICODE(v); 2880 end = s + size; 2881 2882 while (s < end) { 2883 unsigned char c; 2884 Py_UNICODE x; 2885 int digits; 2886 2887 /* Non-escape characters are interpreted as Unicode ordinals */ 2888 if (*s != '\\') { 2889 *p++ = (unsigned char) *s++; 2890 continue; 2891 } 2892 2893 startinpos = s-starts; 2894 /* \ - Escapes */ 2895 s++; 2896 c = *s++; 2897 if (s > end) 2898 c = '\0'; /* Invalid after \ */ 2899 switch (c) { 2900 2901 /* \x escapes */ 2902 case '\n': break; 2903 case '\\': *p++ = '\\'; break; 2904 case '\'': *p++ = '\''; break; 2905 case '\"': *p++ = '\"'; break; 2906 case 'b': *p++ = '\b'; break; 2907 case 'f': *p++ = '\014'; break; /* FF */ 2908 case 't': *p++ = '\t'; break; 2909 case 'n': *p++ = '\n'; break; 2910 case 'r': *p++ = '\r'; break; 2911 case 'v': *p++ = '\013'; break; /* VT */ 2912 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2913 2914 /* \OOO (octal) escapes */ 2915 case '0': case '1': case '2': case '3': 2916 case '4': case '5': case '6': case '7': 2917 x = s[-1] - '0'; 2918 if (s < end && '0' <= *s && *s <= '7') { 2919 x = (x<<3) + *s++ - '0'; 2920 if (s 
< end && '0' <= *s && *s <= '7') 2921 x = (x<<3) + *s++ - '0'; 2922 } 2923 *p++ = x; 2924 break; 2925 2926 /* hex escapes */ 2927 /* \xXX */ 2928 case 'x': 2929 digits = 2; 2930 message = "truncated \\xXX escape"; 2931 goto hexescape; 2932 2933 /* \uXXXX */ 2934 case 'u': 2935 digits = 4; 2936 message = "truncated \\uXXXX escape"; 2937 goto hexescape; 2938 2939 /* \UXXXXXXXX */ 2940 case 'U': 2941 digits = 8; 2942 message = "truncated \\UXXXXXXXX escape"; 2943 hexescape: 2944 chr = 0; 2945 outpos = p-PyUnicode_AS_UNICODE(v); 2946 if (s+digits>end) { 2947 endinpos = size; 2948 if (unicode_decode_call_errorhandler( 2949 errors, &errorHandler, 2950 "unicodeescape", "end of string in escape sequence", 2951 &starts, &end, &startinpos, &endinpos, &exc, &s, 2952 (PyObject **)&v, &outpos, &p)) 2953 goto onError; 2954 goto nextByte; 2955 } 2956 for (i = 0; i < digits; ++i) { 2957 c = (unsigned char) s[i]; 2958 if (!ISXDIGIT(c)) { 2959 endinpos = (s+i+1)-starts; 2960 if (unicode_decode_call_errorhandler( 2961 errors, &errorHandler, 2962 "unicodeescape", message, 2963 &starts, &end, &startinpos, &endinpos, &exc, &s, 2964 (PyObject **)&v, &outpos, &p)) 2965 goto onError; 2966 goto nextByte; 2967 } 2968 chr = (chr<<4) & ~0xF; 2969 if (c >= '0' && c <= '9') 2970 chr += c - '0'; 2971 else if (c >= 'a' && c <= 'f') 2972 chr += 10 + c - 'a'; 2973 else 2974 chr += 10 + c - 'A'; 2975 } 2976 s += i; 2977 if (chr == 0xffffffff && PyErr_Occurred()) 2978 /* _decoding_error will have already written into the 2979 target buffer. */ 2980 break; 2981 store: 2982 /* when we get here, chr is a 32-bit unicode character */ 2983 if (chr <= 0xffff) 2984 /* UCS-2 character */ 2985 *p++ = (Py_UNICODE) chr; 2986 else if (chr <= 0x10ffff) { 2987 /* UCS-4 character. Either store directly, or as 2988 surrogate pair. 
*/ 2989#ifdef Py_UNICODE_WIDE 2990 *p++ = chr; 2991#else 2992 chr -= 0x10000L; 2993 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2994 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2995#endif 2996 } else { 2997 endinpos = s-starts; 2998 outpos = p-PyUnicode_AS_UNICODE(v); 2999 if (unicode_decode_call_errorhandler( 3000 errors, &errorHandler, 3001 "unicodeescape", "illegal Unicode character", 3002 &starts, &end, &startinpos, &endinpos, &exc, &s, 3003 (PyObject **)&v, &outpos, &p)) 3004 goto onError; 3005 } 3006 break; 3007 3008 /* \N{name} */ 3009 case 'N': 3010 message = "malformed \\N character escape"; 3011 if (ucnhash_CAPI == NULL) { 3012 /* load the unicode data module */ 3013 PyObject *m, *api; 3014 m = PyImport_ImportModuleNoBlock("unicodedata"); 3015 if (m == NULL) 3016 goto ucnhashError; 3017 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 3018 Py_DECREF(m); 3019 if (api == NULL) 3020 goto ucnhashError; 3021 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 3022 Py_DECREF(api); 3023 if (ucnhash_CAPI == NULL) 3024 goto ucnhashError; 3025 } 3026 if (*s == '{') { 3027 const char *start = s+1; 3028 /* look for the closing brace */ 3029 while (*s != '}' && s < end) 3030 s++; 3031 if (s > start && s < end && *s == '}') { 3032 /* found a name. 
look it up in the unicode database */ 3033 message = "unknown Unicode character name"; 3034 s++; 3035 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3036 goto store; 3037 } 3038 } 3039 endinpos = s-starts; 3040 outpos = p-PyUnicode_AS_UNICODE(v); 3041 if (unicode_decode_call_errorhandler( 3042 errors, &errorHandler, 3043 "unicodeescape", message, 3044 &starts, &end, &startinpos, &endinpos, &exc, &s, 3045 (PyObject **)&v, &outpos, &p)) 3046 goto onError; 3047 break; 3048 3049 default: 3050 if (s > end) { 3051 message = "\\ at end of string"; 3052 s--; 3053 endinpos = s-starts; 3054 outpos = p-PyUnicode_AS_UNICODE(v); 3055 if (unicode_decode_call_errorhandler( 3056 errors, &errorHandler, 3057 "unicodeescape", message, 3058 &starts, &end, &startinpos, &endinpos, &exc, &s, 3059 (PyObject **)&v, &outpos, &p)) 3060 goto onError; 3061 } 3062 else { 3063 *p++ = '\\'; 3064 *p++ = (unsigned char)s[-1]; 3065 } 3066 break; 3067 } 3068 nextByte: 3069 ; 3070 } 3071 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3072 goto onError; 3073 Py_XDECREF(errorHandler); 3074 Py_XDECREF(exc); 3075 return (PyObject *)v; 3076 3077ucnhashError: 3078 PyErr_SetString( 3079 PyExc_UnicodeError, 3080 "\\N escapes not supported (can't load unicodedata module)" 3081 ); 3082 Py_XDECREF(v); 3083 Py_XDECREF(errorHandler); 3084 Py_XDECREF(exc); 3085 return NULL; 3086 3087onError: 3088 Py_XDECREF(v); 3089 Py_XDECREF(errorHandler); 3090 Py_XDECREF(exc); 3091 return NULL; 3092} 3093 3094/* Return a Unicode-Escape string version of the Unicode object. 3095 3096 If quotes is true, the string is enclosed in u"" or u'' quotes as 3097 appropriate. 
3098 3099*/ 3100 3101Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3102 Py_ssize_t size, 3103 Py_UNICODE ch) 3104{ 3105 /* like wcschr, but doesn't stop at NULL characters */ 3106 3107 while (size-- > 0) { 3108 if (*s == ch) 3109 return s; 3110 s++; 3111 } 3112 3113 return NULL; 3114} 3115 3116static const char *hexdigits = "0123456789abcdef"; 3117 3118PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3119 Py_ssize_t size) 3120{ 3121 PyObject *repr, *result; 3122 char *p; 3123 3124#ifdef Py_UNICODE_WIDE 3125 const Py_ssize_t expandsize = 10; 3126#else 3127 const Py_ssize_t expandsize = 6; 3128#endif 3129 3130 /* XXX(nnorwitz): rather than over-allocating, it would be 3131 better to choose a different scheme. Perhaps scan the 3132 first N-chars of the string and allocate based on that size. 3133 */ 3134 /* Initial allocation is based on the longest-possible unichr 3135 escape. 3136 3137 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3138 unichr, so in this case it's the longest unichr escape. In 3139 narrow (UTF-16) builds this is five chars per source unichr 3140 since there are two unichrs in the surrogate pair, so in narrow 3141 (UTF-16) builds it's not the longest unichr escape. 3142 3143 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3144 so in the narrow (UTF-16) build case it's the longest unichr 3145 escape. 
3146 */ 3147 3148 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3149 return PyErr_NoMemory(); 3150 3151 repr = PyByteArray_FromStringAndSize(NULL, 3152 2 3153 + expandsize*size 3154 + 1); 3155 if (repr == NULL) 3156 return NULL; 3157 3158 p = PyByteArray_AS_STRING(repr); 3159 3160 while (size-- > 0) { 3161 Py_UNICODE ch = *s++; 3162 3163 /* Escape backslashes */ 3164 if (ch == '\\') { 3165 *p++ = '\\'; 3166 *p++ = (char) ch; 3167 continue; 3168 } 3169 3170#ifdef Py_UNICODE_WIDE 3171 /* Map 21-bit characters to '\U00xxxxxx' */ 3172 else if (ch >= 0x10000) { 3173 *p++ = '\\'; 3174 *p++ = 'U'; 3175 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3176 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3177 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3178 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3179 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3180 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3181 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3182 *p++ = hexdigits[ch & 0x0000000F]; 3183 continue; 3184 } 3185#else 3186 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3187 else if (ch >= 0xD800 && ch < 0xDC00) { 3188 Py_UNICODE ch2; 3189 Py_UCS4 ucs; 3190 3191 ch2 = *s++; 3192 size--; 3193 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3194 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3195 *p++ = '\\'; 3196 *p++ = 'U'; 3197 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3198 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3199 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3200 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3201 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3202 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3203 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3204 *p++ = hexdigits[ucs & 0x0000000F]; 3205 continue; 3206 } 3207 /* Fall through: isolated surrogates are copied as-is */ 3208 s--; 3209 size++; 3210 } 3211#endif 3212 3213 /* Map 16-bit characters to '\uxxxx' */ 3214 if (ch >= 256) { 3215 *p++ = '\\'; 3216 *p++ = 'u'; 3217 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3218 *p++ = 
hexdigits[(ch >> 8) & 0x000F]; 3219 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3220 *p++ = hexdigits[ch & 0x000F]; 3221 } 3222 3223 /* Map special whitespace to '\t', \n', '\r' */ 3224 else if (ch == '\t') { 3225 *p++ = '\\'; 3226 *p++ = 't'; 3227 } 3228 else if (ch == '\n') { 3229 *p++ = '\\'; 3230 *p++ = 'n'; 3231 } 3232 else if (ch == '\r') { 3233 *p++ = '\\'; 3234 *p++ = 'r'; 3235 } 3236 3237 /* Map non-printable US ASCII to '\xhh' */ 3238 else if (ch < ' ' || ch >= 0x7F) { 3239 *p++ = '\\'; 3240 *p++ = 'x'; 3241 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3242 *p++ = hexdigits[ch & 0x000F]; 3243 } 3244 3245 /* Copy everything else as-is */ 3246 else 3247 *p++ = (char) ch; 3248 } 3249 3250 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), 3251 p - PyByteArray_AS_STRING(repr)); 3252 Py_DECREF(repr); 3253 return result; 3254} 3255 3256PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3257{ 3258 PyObject *s, *result; 3259 if (!PyUnicode_Check(unicode)) { 3260 PyErr_BadArgument(); 3261 return NULL; 3262 } 3263 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3264 PyUnicode_GET_SIZE(unicode)); 3265 3266 if (!s) 3267 return NULL; 3268 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3269 PyByteArray_GET_SIZE(s)); 3270 Py_DECREF(s); 3271 return result; 3272} 3273 3274/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3275 3276PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3277 Py_ssize_t size, 3278 const char *errors) 3279{ 3280 const char *starts = s; 3281 Py_ssize_t startinpos; 3282 Py_ssize_t endinpos; 3283 Py_ssize_t outpos; 3284 PyUnicodeObject *v; 3285 Py_UNICODE *p; 3286 const char *end; 3287 const char *bs; 3288 PyObject *errorHandler = NULL; 3289 PyObject *exc = NULL; 3290 3291 /* Escaped strings will always be longer than the resulting 3292 Unicode string, so we start with size here and then reduce the 3293 length after conversion to the true value. 
(But decoding error 3294 handler might have to resize the string) */ 3295 v = _PyUnicode_New(size); 3296 if (v == NULL) 3297 goto onError; 3298 if (size == 0) 3299 return (PyObject *)v; 3300 p = PyUnicode_AS_UNICODE(v); 3301 end = s + size; 3302 while (s < end) { 3303 unsigned char c; 3304 Py_UCS4 x; 3305 int i; 3306 int count; 3307 3308 /* Non-escape characters are interpreted as Unicode ordinals */ 3309 if (*s != '\\') { 3310 *p++ = (unsigned char)*s++; 3311 continue; 3312 } 3313 startinpos = s-starts; 3314 3315 /* \u-escapes are only interpreted iff the number of leading 3316 backslashes if odd */ 3317 bs = s; 3318 for (;s < end;) { 3319 if (*s != '\\') 3320 break; 3321 *p++ = (unsigned char)*s++; 3322 } 3323 if (((s - bs) & 1) == 0 || 3324 s >= end || 3325 (*s != 'u' && *s != 'U')) { 3326 continue; 3327 } 3328 p--; 3329 count = *s=='u' ? 4 : 8; 3330 s++; 3331 3332 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3333 outpos = p-PyUnicode_AS_UNICODE(v); 3334 for (x = 0, i = 0; i < count; ++i, ++s) { 3335 c = (unsigned char)*s; 3336 if (!ISXDIGIT(c)) { 3337 endinpos = s-starts; 3338 if (unicode_decode_call_errorhandler( 3339 errors, &errorHandler, 3340 "rawunicodeescape", "truncated \\uXXXX", 3341 &starts, &end, &startinpos, &endinpos, &exc, &s, 3342 (PyObject **)&v, &outpos, &p)) 3343 goto onError; 3344 goto nextByte; 3345 } 3346 x = (x<<4) & ~0xF; 3347 if (c >= '0' && c <= '9') 3348 x += c - '0'; 3349 else if (c >= 'a' && c <= 'f') 3350 x += 10 + c - 'a'; 3351 else 3352 x += 10 + c - 'A'; 3353 } 3354 if (x <= 0xffff) 3355 /* UCS-2 character */ 3356 *p++ = (Py_UNICODE) x; 3357 else if (x <= 0x10ffff) { 3358 /* UCS-4 character. Either store directly, or as 3359 surrogate pair. 
*/ 3360#ifdef Py_UNICODE_WIDE 3361 *p++ = (Py_UNICODE) x; 3362#else 3363 x -= 0x10000L; 3364 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3365 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3366#endif 3367 } else { 3368 endinpos = s-starts; 3369 outpos = p-PyUnicode_AS_UNICODE(v); 3370 if (unicode_decode_call_errorhandler( 3371 errors, &errorHandler, 3372 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3373 &starts, &end, &startinpos, &endinpos, &exc, &s, 3374 (PyObject **)&v, &outpos, &p)) 3375 goto onError; 3376 } 3377 nextByte: 3378 ; 3379 } 3380 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3381 goto onError; 3382 Py_XDECREF(errorHandler); 3383 Py_XDECREF(exc); 3384 return (PyObject *)v; 3385 3386 onError: 3387 Py_XDECREF(v); 3388 Py_XDECREF(errorHandler); 3389 Py_XDECREF(exc); 3390 return NULL; 3391} 3392 3393PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3394 Py_ssize_t size) 3395{ 3396 PyObject *repr, *result; 3397 char *p; 3398 char *q; 3399 3400#ifdef Py_UNICODE_WIDE 3401 const Py_ssize_t expandsize = 10; 3402#else 3403 const Py_ssize_t expandsize = 6; 3404#endif 3405 3406 if (size > PY_SSIZE_T_MAX / expandsize) 3407 return PyErr_NoMemory(); 3408 3409 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size); 3410 if (repr == NULL) 3411 return NULL; 3412 if (size == 0) 3413 goto done; 3414 3415 p = q = PyByteArray_AS_STRING(repr); 3416 while (size-- > 0) { 3417 Py_UNICODE ch = *s++; 3418#ifdef Py_UNICODE_WIDE 3419 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3420 if (ch >= 0x10000) { 3421 *p++ = '\\'; 3422 *p++ = 'U'; 3423 *p++ = hexdigits[(ch >> 28) & 0xf]; 3424 *p++ = hexdigits[(ch >> 24) & 0xf]; 3425 *p++ = hexdigits[(ch >> 20) & 0xf]; 3426 *p++ = hexdigits[(ch >> 16) & 0xf]; 3427 *p++ = hexdigits[(ch >> 12) & 0xf]; 3428 *p++ = hexdigits[(ch >> 8) & 0xf]; 3429 *p++ = hexdigits[(ch >> 4) & 0xf]; 3430 *p++ = hexdigits[ch & 15]; 3431 } 3432 else 3433#else 3434 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3435 if (ch >= 0xD800 
&& ch < 0xDC00) { 3436 Py_UNICODE ch2; 3437 Py_UCS4 ucs; 3438 3439 ch2 = *s++; 3440 size--; 3441 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3442 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3443 *p++ = '\\'; 3444 *p++ = 'U'; 3445 *p++ = hexdigits[(ucs >> 28) & 0xf]; 3446 *p++ = hexdigits[(ucs >> 24) & 0xf]; 3447 *p++ = hexdigits[(ucs >> 20) & 0xf]; 3448 *p++ = hexdigits[(ucs >> 16) & 0xf]; 3449 *p++ = hexdigits[(ucs >> 12) & 0xf]; 3450 *p++ = hexdigits[(ucs >> 8) & 0xf]; 3451 *p++ = hexdigits[(ucs >> 4) & 0xf]; 3452 *p++ = hexdigits[ucs & 0xf]; 3453 continue; 3454 } 3455 /* Fall through: isolated surrogates are copied as-is */ 3456 s--; 3457 size++; 3458 } 3459#endif 3460 /* Map 16-bit characters to '\uxxxx' */ 3461 if (ch >= 256) { 3462 *p++ = '\\'; 3463 *p++ = 'u'; 3464 *p++ = hexdigits[(ch >> 12) & 0xf]; 3465 *p++ = hexdigits[(ch >> 8) & 0xf]; 3466 *p++ = hexdigits[(ch >> 4) & 0xf]; 3467 *p++ = hexdigits[ch & 15]; 3468 } 3469 /* Copy everything else as-is */ 3470 else 3471 *p++ = (char) ch; 3472 } 3473 size = p - q; 3474 3475 done: 3476 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); 3477 Py_DECREF(repr); 3478 return result; 3479} 3480 3481PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3482{ 3483 PyObject *s, *result; 3484 if (!PyUnicode_Check(unicode)) { 3485 PyErr_BadArgument(); 3486 return NULL; 3487 } 3488 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3489 PyUnicode_GET_SIZE(unicode)); 3490 3491 if (!s) 3492 return NULL; 3493 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3494 PyByteArray_GET_SIZE(s)); 3495 Py_DECREF(s); 3496 return result; 3497} 3498 3499/* --- Unicode Internal Codec ------------------------------------------- */ 3500 3501PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3502 Py_ssize_t size, 3503 const char *errors) 3504{ 3505 const char *starts = s; 3506 Py_ssize_t startinpos; 3507 Py_ssize_t endinpos; 3508 Py_ssize_t outpos; 3509 
PyUnicodeObject *v; 3510 Py_UNICODE *p; 3511 const char *end; 3512 const char *reason; 3513 PyObject *errorHandler = NULL; 3514 PyObject *exc = NULL; 3515 3516#ifdef Py_UNICODE_WIDE 3517 Py_UNICODE unimax = PyUnicode_GetMax(); 3518#endif 3519 3520 /* XXX overflow detection missing */ 3521 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3522 if (v == NULL) 3523 goto onError; 3524 if (PyUnicode_GetSize((PyObject *)v) == 0) 3525 return (PyObject *)v; 3526 p = PyUnicode_AS_UNICODE(v); 3527 end = s + size; 3528 3529 while (s < end) { 3530 memcpy(p, s, sizeof(Py_UNICODE)); 3531 /* We have to sanity check the raw data, otherwise doom looms for 3532 some malformed UCS-4 data. */ 3533 if ( 3534 #ifdef Py_UNICODE_WIDE 3535 *p > unimax || *p < 0 || 3536 #endif 3537 end-s < Py_UNICODE_SIZE 3538 ) 3539 { 3540 startinpos = s - starts; 3541 if (end-s < Py_UNICODE_SIZE) { 3542 endinpos = end-starts; 3543 reason = "truncated input"; 3544 } 3545 else { 3546 endinpos = s - starts + Py_UNICODE_SIZE; 3547 reason = "illegal code point (> 0x10FFFF)"; 3548 } 3549 outpos = p - PyUnicode_AS_UNICODE(v); 3550 if (unicode_decode_call_errorhandler( 3551 errors, &errorHandler, 3552 "unicode_internal", reason, 3553 &starts, &end, &startinpos, &endinpos, &exc, &s, 3554 (PyObject **)&v, &outpos, &p)) { 3555 goto onError; 3556 } 3557 } 3558 else { 3559 p++; 3560 s += Py_UNICODE_SIZE; 3561 } 3562 } 3563 3564 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3565 goto onError; 3566 Py_XDECREF(errorHandler); 3567 Py_XDECREF(exc); 3568 return (PyObject *)v; 3569 3570 onError: 3571 Py_XDECREF(v); 3572 Py_XDECREF(errorHandler); 3573 Py_XDECREF(exc); 3574 return NULL; 3575} 3576 3577/* --- Latin-1 Codec ------------------------------------------------------ */ 3578 3579PyObject *PyUnicode_DecodeLatin1(const char *s, 3580 Py_ssize_t size, 3581 const char *errors) 3582{ 3583 PyUnicodeObject *v; 3584 Py_UNICODE *p; 3585 3586 /* Latin-1 is equivalent to the first 256 ordinals in 
Unicode. */ 3587 if (size == 1) { 3588 Py_UNICODE r = *(unsigned char*)s; 3589 return PyUnicode_FromUnicode(&r, 1); 3590 } 3591 3592 v = _PyUnicode_New(size); 3593 if (v == NULL) 3594 goto onError; 3595 if (size == 0) 3596 return (PyObject *)v; 3597 p = PyUnicode_AS_UNICODE(v); 3598 while (size-- > 0) 3599 *p++ = (unsigned char)*s++; 3600 return (PyObject *)v; 3601 3602 onError: 3603 Py_XDECREF(v); 3604 return NULL; 3605} 3606 3607/* create or adjust a UnicodeEncodeError */ 3608static void make_encode_exception(PyObject **exceptionObject, 3609 const char *encoding, 3610 const Py_UNICODE *unicode, Py_ssize_t size, 3611 Py_ssize_t startpos, Py_ssize_t endpos, 3612 const char *reason) 3613{ 3614 if (*exceptionObject == NULL) { 3615 *exceptionObject = PyUnicodeEncodeError_Create( 3616 encoding, unicode, size, startpos, endpos, reason); 3617 } 3618 else { 3619 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3620 goto onError; 3621 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3622 goto onError; 3623 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3624 goto onError; 3625 return; 3626 onError: 3627 Py_DECREF(*exceptionObject); 3628 *exceptionObject = NULL; 3629 } 3630} 3631 3632/* raises a UnicodeEncodeError */ 3633static void raise_encode_exception(PyObject **exceptionObject, 3634 const char *encoding, 3635 const Py_UNICODE *unicode, Py_ssize_t size, 3636 Py_ssize_t startpos, Py_ssize_t endpos, 3637 const char *reason) 3638{ 3639 make_encode_exception(exceptionObject, 3640 encoding, unicode, size, startpos, endpos, reason); 3641 if (*exceptionObject != NULL) 3642 PyCodec_StrictErrors(*exceptionObject); 3643} 3644 3645/* error handling callback helper: 3646 build arguments, call the callback and check the arguments, 3647 put the result into newpos and return the replacement string, which 3648 has to be freed by the caller */ 3649static PyObject *unicode_encode_call_errorhandler(const char *errors, 3650 PyObject **errorHandler, 
3651 const char *encoding, const char *reason, 3652 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3653 Py_ssize_t startpos, Py_ssize_t endpos, 3654 Py_ssize_t *newpos) 3655{ 3656 static char *argparse = "O!n;encoding error handler must return (str, int) tuple"; 3657 3658 PyObject *restuple; 3659 PyObject *resunicode; 3660 3661 if (*errorHandler == NULL) { 3662 *errorHandler = PyCodec_LookupError(errors); 3663 if (*errorHandler == NULL) 3664 return NULL; 3665 } 3666 3667 make_encode_exception(exceptionObject, 3668 encoding, unicode, size, startpos, endpos, reason); 3669 if (*exceptionObject == NULL) 3670 return NULL; 3671 3672 restuple = PyObject_CallFunctionObjArgs( 3673 *errorHandler, *exceptionObject, NULL); 3674 if (restuple == NULL) 3675 return NULL; 3676 if (!PyTuple_Check(restuple)) { 3677 PyErr_Format(PyExc_TypeError, &argparse[4]); 3678 Py_DECREF(restuple); 3679 return NULL; 3680 } 3681 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3682 &resunicode, newpos)) { 3683 Py_DECREF(restuple); 3684 return NULL; 3685 } 3686 if (*newpos<0) 3687 *newpos = size+*newpos; 3688 if (*newpos<0 || *newpos>size) { 3689 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3690 Py_DECREF(restuple); 3691 return NULL; 3692 } 3693 Py_INCREF(resunicode); 3694 Py_DECREF(restuple); 3695 return resunicode; 3696} 3697 3698static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3699 Py_ssize_t size, 3700 const char *errors, 3701 int limit) 3702{ 3703 /* output object */ 3704 PyObject *res; 3705 /* pointers to the beginning and end+1 of input */ 3706 const Py_UNICODE *startp = p; 3707 const Py_UNICODE *endp = p + size; 3708 /* pointer to the beginning of the unencodable characters */ 3709 /* const Py_UNICODE *badp = NULL; */ 3710 /* pointer into the output */ 3711 char *str; 3712 /* current output position */ 3713 Py_ssize_t ressize; 3714 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 3715 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3716 PyObject *errorHandler = NULL; 3717 PyObject *exc = NULL; 3718 PyObject *result = NULL; 3719 /* the following variable is used for caching string comparisons 3720 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3721 int known_errorHandler = -1; 3722 3723 /* allocate enough for a simple encoding without 3724 replacements, if we need more, we'll resize */ 3725 if (size == 0) 3726 return PyBytes_FromStringAndSize(NULL, 0); 3727 res = PyByteArray_FromStringAndSize(NULL, size); 3728 if (res == NULL) 3729 return NULL; 3730 str = PyByteArray_AS_STRING(res); 3731 ressize = size; 3732 3733 while (p<endp) { 3734 Py_UNICODE c = *p; 3735 3736 /* can we encode this? */ 3737 if (c<limit) { 3738 /* no overflow check, because we know that the space is enough */ 3739 *str++ = (char)c; 3740 ++p; 3741 } 3742 else { 3743 Py_ssize_t unicodepos = p-startp; 3744 Py_ssize_t requiredsize; 3745 PyObject *repunicode; 3746 Py_ssize_t repsize; 3747 Py_ssize_t newpos; 3748 Py_ssize_t respos; 3749 Py_UNICODE *uni2; 3750 /* startpos for collecting unencodable chars */ 3751 const Py_UNICODE *collstart = p; 3752 const Py_UNICODE *collend = p; 3753 /* find all unecodable characters */ 3754 while ((collend < endp) && ((*collend)>=limit)) 3755 ++collend; 3756 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3757 if (known_errorHandler==-1) { 3758 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3759 known_errorHandler = 1; 3760 else if (!strcmp(errors, "replace")) 3761 known_errorHandler = 2; 3762 else if (!strcmp(errors, "ignore")) 3763 known_errorHandler = 3; 3764 else if (!strcmp(errors, "xmlcharrefreplace")) 3765 known_errorHandler = 4; 3766 else 3767 known_errorHandler = 0; 3768 } 3769 switch (known_errorHandler) { 3770 case 1: /* strict */ 3771 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3772 goto onError; 3773 case 2: /* replace */ 3774 while (collstart++<collend) 3775 *str++ = '?'; /* fall through */ 3776 case 3: /* ignore */ 3777 p = collend; 3778 break; 3779 case 4: /* xmlcharrefreplace */ 3780 respos = str - PyByteArray_AS_STRING(res); 3781 /* determine replacement size (temporarily (mis)uses p) */ 3782 for (p = collstart, repsize = 0; p < collend; ++p) { 3783 if (*p<10) 3784 repsize += 2+1+1; 3785 else if (*p<100) 3786 repsize += 2+2+1; 3787 else if (*p<1000) 3788 repsize += 2+3+1; 3789 else if (*p<10000) 3790 repsize += 2+4+1; 3791#ifndef Py_UNICODE_WIDE 3792 else 3793 repsize += 2+5+1; 3794#else 3795 else if (*p<100000) 3796 repsize += 2+5+1; 3797 else if (*p<1000000) 3798 repsize += 2+6+1; 3799 else 3800 repsize += 2+7+1; 3801#endif 3802 } 3803 requiredsize = respos+repsize+(endp-collend); 3804 if (requiredsize > ressize) { 3805 if (requiredsize<2*ressize) 3806 requiredsize = 2*ressize; 3807 if (PyByteArray_Resize(res, requiredsize)) 3808 goto onError; 3809 str = PyByteArray_AS_STRING(res) + respos; 3810 ressize = requiredsize; 3811 } 3812 /* generate replacement (temporarily (mis)uses p) */ 3813 for (p = collstart; p < collend; ++p) { 3814 str += sprintf(str, "&#%d;", (int)*p); 3815 } 3816 p = collend; 3817 break; 3818 default: 3819 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3820 encoding, reason, startp, size, &exc, 3821 collstart-startp, 
collend-startp, &newpos); 3822 if (repunicode == NULL) 3823 goto onError; 3824 /* need more space? (at least enough for what we 3825 have+the replacement+the rest of the string, so 3826 we won't have to check space for encodable characters) */ 3827 respos = str - PyByteArray_AS_STRING(res); 3828 repsize = PyUnicode_GET_SIZE(repunicode); 3829 requiredsize = respos+repsize+(endp-collend); 3830 if (requiredsize > ressize) { 3831 if (requiredsize<2*ressize) 3832 requiredsize = 2*ressize; 3833 if (PyByteArray_Resize(res, requiredsize)) { 3834 Py_DECREF(repunicode); 3835 goto onError; 3836 } 3837 str = PyByteArray_AS_STRING(res) + respos; 3838 ressize = requiredsize; 3839 } 3840 /* check if there is anything unencodable in the replacement 3841 and copy it to the output */ 3842 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3843 c = *uni2; 3844 if (c >= limit) { 3845 raise_encode_exception(&exc, encoding, startp, size, 3846 unicodepos, unicodepos+1, reason); 3847 Py_DECREF(repunicode); 3848 goto onError; 3849 } 3850 *str = (char)c; 3851 } 3852 p = startp + newpos; 3853 Py_DECREF(repunicode); 3854 } 3855 } 3856 } 3857 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res), 3858 str - PyByteArray_AS_STRING(res)); 3859 onError: 3860 Py_DECREF(res); 3861 Py_XDECREF(errorHandler); 3862 Py_XDECREF(exc); 3863 return result; 3864} 3865 3866PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3867 Py_ssize_t size, 3868 const char *errors) 3869{ 3870 return unicode_encode_ucs1(p, size, errors, 256); 3871} 3872 3873PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3874{ 3875 if (!PyUnicode_Check(unicode)) { 3876 PyErr_BadArgument(); 3877 return NULL; 3878 } 3879 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3880 PyUnicode_GET_SIZE(unicode), 3881 NULL); 3882} 3883 3884/* --- 7-bit ASCII Codec -------------------------------------------------- */ 3885 3886PyObject *PyUnicode_DecodeASCII(const char *s, 3887 Py_ssize_t size, 3888 
const char *errors) 3889{ 3890 const char *starts = s; 3891 PyUnicodeObject *v; 3892 Py_UNICODE *p; 3893 Py_ssize_t startinpos; 3894 Py_ssize_t endinpos; 3895 Py_ssize_t outpos; 3896 const char *e; 3897 PyObject *errorHandler = NULL; 3898 PyObject *exc = NULL; 3899 3900 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3901 if (size == 1 && *(unsigned char*)s < 128) { 3902 Py_UNICODE r = *(unsigned char*)s; 3903 return PyUnicode_FromUnicode(&r, 1); 3904 } 3905 3906 v = _PyUnicode_New(size); 3907 if (v == NULL) 3908 goto onError; 3909 if (size == 0) 3910 return (PyObject *)v; 3911 p = PyUnicode_AS_UNICODE(v); 3912 e = s + size; 3913 while (s < e) { 3914 register unsigned char c = (unsigned char)*s; 3915 if (c < 128) { 3916 *p++ = c; 3917 ++s; 3918 } 3919 else { 3920 startinpos = s-starts; 3921 endinpos = startinpos + 1; 3922 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3923 if (unicode_decode_call_errorhandler( 3924 errors, &errorHandler, 3925 "ascii", "ordinal not in range(128)", 3926 &starts, &e, &startinpos, &endinpos, &exc, &s, 3927 (PyObject **)&v, &outpos, &p)) 3928 goto onError; 3929 } 3930 } 3931 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3932 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3933 goto onError; 3934 Py_XDECREF(errorHandler); 3935 Py_XDECREF(exc); 3936 return (PyObject *)v; 3937 3938 onError: 3939 Py_XDECREF(v); 3940 Py_XDECREF(errorHandler); 3941 Py_XDECREF(exc); 3942 return NULL; 3943} 3944 3945PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3946 Py_ssize_t size, 3947 const char *errors) 3948{ 3949 return unicode_encode_ucs1(p, size, errors, 128); 3950} 3951 3952PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3953{ 3954 if (!PyUnicode_Check(unicode)) { 3955 PyErr_BadArgument(); 3956 return NULL; 3957 } 3958 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3959 PyUnicode_GET_SIZE(unicode), 3960 NULL); 3961} 3962 3963#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3964 
/* --- MBCS codecs for Windows -------------------------------------------- */

/* The Win32 conversion calls below take int lengths. When Py_ssize_t is
   wider than int, the public entry points feed the input through in
   chunks of at most INT_MAX (see the `retry` labels). */
#if SIZEOF_INT < SIZEOF_SSIZE_T
#define NEED_RETRY
#endif

/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] is a DBCS lead byte, taking
   the preceding character (as located by CharPrev) into account; 0
   otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;

    if (IsDBCSLeadByte(*curr)) {
        const char *prev = CharPrev(s, curr);
        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    }
    return 0;
}

/*
 * Decode MBCS string into unicode object. If 'final' is set, converts
 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
 *
 * If *v is NULL a new unicode object is created; otherwise the decoded
 * characters are appended to the existing object.
 */
static int decode_mbcs(PyUnicodeObject **v,
                       const char *s, /* MBCS string */
                       int size, /* sizeof MBCS string */
                       int final)
{
    Py_UNICODE *p;
    Py_ssize_t n = 0;   /* length of *v before appending */
    int usize = 0;      /* number of wide characters to be produced */

    assert(size >= 0);

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
        if (usize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    /* size may have been reduced by one above: report what was used */
    return size;
}

/* Decode `size` bytes at `s` from MBCS. If `consumed` is non-NULL the
   decoder runs in non-final mode (a trailing lead byte is left
   undecoded) and the number of bytes used is stored in *consumed.
   `errors` is accepted but not used by this implementation.
   Returns a new unicode object or NULL on error. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
  retry:
    /* oversized input: decode an INT_MAX chunk in non-final mode */
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    /* more chunks pending: advance by what was actually consumed */
    if (size > INT_MAX) {
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}

/* Stateless MBCS decode: same as the stateful variant with
   consumed == NULL (i.e. final mode). */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 *
 * If *repr is NULL a new bytes object is created; otherwise the encoded
 * bytes are appended to the existing object.
 */
static int encode_mbcs(PyObject **repr,
                       const Py_UNICODE *p, /* unicode */
                       int size) /* size of unicode */
{
    int mbcssize = 0;   /* number of bytes to be produced */
    Py_ssize_t n = 0;   /* length of *repr before appending */

    assert(size >= 0);

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
    }
    else {
        /* Extend string object */
        n = PyBytes_Size(*repr);
        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        char *s = PyBytes_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    return 0;
}

/* Encode `size` characters at `p` to MBCS. `errors` is accepted but
   not used by this implementation. Returns a new bytes object or NULL
   on error. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
  retry:
    /* oversized input: encode INT_MAX characters per pass */
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}

/* Encode a unicode object to MBCS; non-unicode arguments are rejected
   via PyErr_BadArgument(). */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

#undef NEED_RETRY

#endif /* MS_WINDOWS */

/* --- Character Mapping Codec
-------------------------------------------- */

/* Decode `size` bytes at `s` through `mapping`.

   - mapping == NULL: plain Latin-1 decoding.
   - mapping is exactly a unicode string: byte b decodes to mapping[b];
     an index past the end, or the value 0xfffe, counts as undefined.
   - anything else: mapping[b] is looked up as an item and may be an int
     in range(65536), a str (zero, one or several characters), or None.
     None and LookupError both count as undefined.

   Undefined bytes are passed to the error handler named by `errors`.
   Returns a new unicode object or NULL on error. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* output slots already reserved beyond one-per-input-byte */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: index straight into the mapping string */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                /* the handler was given &s and &p; do not advance here */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus 4*targetsize of
                           headroom for later 1-n mappings */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim only if fewer characters were produced than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}

/* Charmap encoding: the lookup table */

/* Three-level trie from a 16-bit character to a byte, built by
   PyUnicode_BuildEncodingMap() and read by encoding_map_lookup().
   The character is split as: top 5 bits -> level 1, next 4 bits ->
   level 2, low 7 bits -> level 3. */
struct encoding_map{
    PyObject_HEAD
    /* level-2 block number per top-5-bits value; 0xFF = no block */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
    int count2, count3;
    /* variable-length tail: 16*count2 level-2 bytes (0xFF = no block)
       followed by 128*count3 level-3 bytes (0 = unmapped) */
    unsigned char level23[1];
};

/* EncodingMap.size(): byte size of the object including the trie tail. */
static PyObject*
encoding_map_size(PyObject *obj, PyObject* args)
{
    struct encoding_map *map = (struct encoding_map*)obj;
    /* the "- 1" accounts for the one-element level23 placeholder */
    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
                           128*map->count3);
}

static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};

/* The object holds no references, so releasing the memory is enough. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}

/* Type object for the trie. Only dealloc and the method table are set;
   instances are created by PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};

/* Build an encoding table from a 256-character decoding string
   (string[i] is the character that byte i decodes to; 0xFFFE marks an
   unmapped byte).

   Returns an EncodingMap trie when the table qualifies, otherwise a
   plain dict {character ordinal: byte}. Returns NULL with an exception
   set on bad arguments or allocation failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* scratch tables used only to count how many blocks are needed */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* block numbers are stored in a byte and 0xFF is the "none" marker */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* note: this `result` shadows the function-level one */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyLong_FromLong(decode[i]);
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* second pass: hand out level-3 block numbers again from zero while
       filling in the real tables */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}

/* Look up character c in an EncodingMap trie. Returns the byte value,
   or -1 if c has no mapping. c == 0 always yields 0. */
static int
encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
{
    struct encoding_map *map = (struct encoding_map*)mapping;
    int l1 = c>>11;
    int l2 = (c>>7) & 0xF;
    int l3 = c & 0x7F;
    int i;

#ifdef Py_UNICODE_WIDE
    /* the trie only covers 16-bit characters; reject before l1 is used
       as an index */
    if (c > 0xFFFF) {
        return -1;
    }
#endif
    if (c == 0)
        return 0;
    /* level 1*/
    i = map->level1[l1];
    if (i == 0xFF) {
        return -1;
    }
    /* level 2*/
    i = map->level23[16*i+l2];
    if (i == 0xFF) {
        return -1;
    }
    /* level 3 */
    i = map->level23[16*map->count2 + 128*i + l3];
    /* 0 marks an empty level-3 slot */
    if (i == 0) {
        return -1;
    }
    return i;
}

/* Lookup the character ch in the mapping. If the character
   can't be found, Py_None is returned (or NULL, if another
   error occurred).
   */
static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
{
    PyObject *w = PyLong_FromLong((long)c);
    PyObject *x;

    if (w == NULL)
        return NULL;
    x = PyObject_GetItem(mapping, w);
    Py_DECREF(w);
    if (x == NULL) {
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
            /* No mapping found means: mapping is undefined. */
            PyErr_Clear();
            x = Py_None;
            Py_INCREF(x);
            return x;
        } else
            return NULL;
    }
    else if (x == Py_None)
        return x;
    else if (PyLong_Check(x)) {
        /* an integer result must be a single byte value */
        long value = PyLong_AS_LONG(x);
        if (value < 0 || value > 255) {
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must be in range(256)");
            Py_DECREF(x);
            return NULL;
        }
        return x;
    }
    else if (PyBytes_Check(x))
        return x;
    else {
        /* wrong return value */
        PyErr_Format(PyExc_TypeError,
                     "character mapping must return integer, bytes or None, not %.400s",
                     x->ob_type->tp_name);
        Py_DECREF(x);
        return NULL;
    }
}

/* Grow the bytes object *outobj to at least `requiredsize` bytes, and to
   no less than twice its current size. Returns 0 on success, -1 if the
   resize fails. `outpos` is accepted but not used here. */
static int
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
{
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
    /* exponentially overallocate to minimize reallocations */
    if (requiredsize < 2*outsize)
        requiredsize = 2*outsize;
    if (_PyBytes_Resize(outobj, requiredsize))
        return -1;
    return 0;
}

/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Resize the output bytes object if not enough
   space is available. Return a new reference to the object that
   was put in the output buffer, or Py_None, if the mapping was undefined
   (in which case no character was written) or NULL, if a
   reallocation error occurred.
The caller must decref the result */ 4622static 4623charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4624 PyObject **outobj, Py_ssize_t *outpos) 4625{ 4626 PyObject *rep; 4627 char *outstart; 4628 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4629 4630 if (Py_TYPE(mapping) == &EncodingMapType) { 4631 int res = encoding_map_lookup(c, mapping); 4632 Py_ssize_t requiredsize = *outpos+1; 4633 if (res == -1) 4634 return enc_FAILED; 4635 if (outsize<requiredsize) 4636 if (charmapencode_resize(outobj, outpos, requiredsize)) 4637 return enc_EXCEPTION; 4638 outstart = PyBytes_AS_STRING(*outobj); 4639 outstart[(*outpos)++] = (char)res; 4640 return enc_SUCCESS; 4641 } 4642 4643 rep = charmapencode_lookup(c, mapping); 4644 if (rep==NULL) 4645 return enc_EXCEPTION; 4646 else if (rep==Py_None) { 4647 Py_DECREF(rep); 4648 return enc_FAILED; 4649 } else { 4650 if (PyLong_Check(rep)) { 4651 Py_ssize_t requiredsize = *outpos+1; 4652 if (outsize<requiredsize) 4653 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4654 Py_DECREF(rep); 4655 return enc_EXCEPTION; 4656 } 4657 outstart = PyBytes_AS_STRING(*outobj); 4658 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 4659 } 4660 else { 4661 const char *repchars = PyBytes_AS_STRING(rep); 4662 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 4663 Py_ssize_t requiredsize = *outpos+repsize; 4664 if (outsize<requiredsize) 4665 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4666 Py_DECREF(rep); 4667 return enc_EXCEPTION; 4668 } 4669 outstart = PyBytes_AS_STRING(*outobj); 4670 memcpy(outstart + *outpos, repchars, repsize); 4671 *outpos += repsize; 4672 } 4673 } 4674 Py_DECREF(rep); 4675 return enc_SUCCESS; 4676} 4677 4678/* handle an error in PyUnicode_EncodeCharmap 4679 Return 0 on success, -1 on error */ 4680static 4681int charmap_encoding_error( 4682 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4683 PyObject **exceptionObject, 4684 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 4685 PyObject **res, Py_ssize_t *respos) 4686{ 4687 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4688 Py_ssize_t repsize; 4689 Py_ssize_t newpos; 4690 Py_UNICODE *uni2; 4691 /* startpos for collecting unencodable chars */ 4692 Py_ssize_t collstartpos = *inpos; 4693 Py_ssize_t collendpos = *inpos+1; 4694 Py_ssize_t collpos; 4695 char *encoding = "charmap"; 4696 char *reason = "character maps to <undefined>"; 4697 charmapencode_result x; 4698 4699 /* find all unencodable characters */ 4700 while (collendpos < size) { 4701 PyObject *rep; 4702 if (Py_TYPE(mapping) == &EncodingMapType) { 4703 int res = encoding_map_lookup(p[collendpos], mapping); 4704 if (res != -1) 4705 break; 4706 ++collendpos; 4707 continue; 4708 } 4709 4710 rep = charmapencode_lookup(p[collendpos], mapping); 4711 if (rep==NULL) 4712 return -1; 4713 else if (rep!=Py_None) { 4714 Py_DECREF(rep); 4715 break; 4716 } 4717 Py_DECREF(rep); 4718 ++collendpos; 4719 } 4720 /* cache callback name lookup 4721 * (if not done yet, i.e. 
it's the first error) */ 4722 if (*known_errorHandler==-1) { 4723 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4724 *known_errorHandler = 1; 4725 else if (!strcmp(errors, "replace")) 4726 *known_errorHandler = 2; 4727 else if (!strcmp(errors, "ignore")) 4728 *known_errorHandler = 3; 4729 else if (!strcmp(errors, "xmlcharrefreplace")) 4730 *known_errorHandler = 4; 4731 else 4732 *known_errorHandler = 0; 4733 } 4734 switch (*known_errorHandler) { 4735 case 1: /* strict */ 4736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4737 return -1; 4738 case 2: /* replace */ 4739 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4740 x = charmapencode_output('?', mapping, res, respos); 4741 if (x==enc_EXCEPTION) { 4742 return -1; 4743 } 4744 else if (x==enc_FAILED) { 4745 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4746 return -1; 4747 } 4748 } 4749 /* fall through */ 4750 case 3: /* ignore */ 4751 *inpos = collendpos; 4752 break; 4753 case 4: /* xmlcharrefreplace */ 4754 /* generate replacement (temporarily (mis)uses p) */ 4755 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4756 char buffer[2+29+1+1]; 4757 char *cp; 4758 sprintf(buffer, "&#%d;", (int)p[collpos]); 4759 for (cp = buffer; *cp; ++cp) { 4760 x = charmapencode_output(*cp, mapping, res, respos); 4761 if (x==enc_EXCEPTION) 4762 return -1; 4763 else if (x==enc_FAILED) { 4764 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4765 return -1; 4766 } 4767 } 4768 } 4769 *inpos = collendpos; 4770 break; 4771 default: 4772 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4773 encoding, reason, p, size, exceptionObject, 4774 collstartpos, collendpos, &newpos); 4775 if (repunicode == NULL) 4776 return -1; 4777 /* generate replacement */ 4778 repsize = PyUnicode_GET_SIZE(repunicode); 4779 for (uni2 = PyUnicode_AS_UNICODE(repunicode); 
repsize-->0; ++uni2) { 4780 x = charmapencode_output(*uni2, mapping, res, respos); 4781 if (x==enc_EXCEPTION) { 4782 return -1; 4783 } 4784 else if (x==enc_FAILED) { 4785 Py_DECREF(repunicode); 4786 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4787 return -1; 4788 } 4789 } 4790 *inpos = newpos; 4791 Py_DECREF(repunicode); 4792 } 4793 return 0; 4794} 4795 4796PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4797 Py_ssize_t size, 4798 PyObject *mapping, 4799 const char *errors) 4800{ 4801 /* output object */ 4802 PyObject *res = NULL; 4803 /* current input position */ 4804 Py_ssize_t inpos = 0; 4805 /* current output position */ 4806 Py_ssize_t respos = 0; 4807 PyObject *errorHandler = NULL; 4808 PyObject *exc = NULL; 4809 /* the following variable is used for caching string comparisons 4810 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4811 * 3=ignore, 4=xmlcharrefreplace */ 4812 int known_errorHandler = -1; 4813 4814 /* Default to Latin-1 */ 4815 if (mapping == NULL) 4816 return PyUnicode_EncodeLatin1(p, size, errors); 4817 4818 /* allocate enough for a simple encoding without 4819 replacements, if we need more, we'll resize */ 4820 res = PyBytes_FromStringAndSize(NULL, size); 4821 if (res == NULL) 4822 goto onError; 4823 if (size == 0) 4824 return res; 4825 4826 while (inpos<size) { 4827 /* try to encode it */ 4828 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 4829 if (x==enc_EXCEPTION) /* error */ 4830 goto onError; 4831 if (x==enc_FAILED) { /* unencodable character */ 4832 if (charmap_encoding_error(p, size, &inpos, mapping, 4833 &exc, 4834 &known_errorHandler, &errorHandler, errors, 4835 &res, &respos)) { 4836 goto onError; 4837 } 4838 } 4839 else 4840 /* done with this character => adjust input position */ 4841 ++inpos; 4842 } 4843 4844 /* Resize if we allocated to much */ 4845 if (respos<PyBytes_GET_SIZE(res)) 4846 _PyBytes_Resize(&res, respos); 4847 4848 
Py_XDECREF(exc); 4849 Py_XDECREF(errorHandler); 4850 return res; 4851 4852 onError: 4853 Py_XDECREF(res); 4854 Py_XDECREF(exc); 4855 Py_XDECREF(errorHandler); 4856 return NULL; 4857} 4858 4859PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4860 PyObject *mapping) 4861{ 4862 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4863 PyErr_BadArgument(); 4864 return NULL; 4865 } 4866 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4867 PyUnicode_GET_SIZE(unicode), 4868 mapping, 4869 NULL); 4870} 4871 4872/* create or adjust a UnicodeTranslateError */ 4873static void make_translate_exception(PyObject **exceptionObject, 4874 const Py_UNICODE *unicode, Py_ssize_t size, 4875 Py_ssize_t startpos, Py_ssize_t endpos, 4876 const char *reason) 4877{ 4878 if (*exceptionObject == NULL) { 4879 *exceptionObject = PyUnicodeTranslateError_Create( 4880 unicode, size, startpos, endpos, reason); 4881 } 4882 else { 4883 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4884 goto onError; 4885 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4886 goto onError; 4887 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4888 goto onError; 4889 return; 4890 onError: 4891 Py_DECREF(*exceptionObject); 4892 *exceptionObject = NULL; 4893 } 4894} 4895 4896/* raises a UnicodeTranslateError */ 4897static void raise_translate_exception(PyObject **exceptionObject, 4898 const Py_UNICODE *unicode, Py_ssize_t size, 4899 Py_ssize_t startpos, Py_ssize_t endpos, 4900 const char *reason) 4901{ 4902 make_translate_exception(exceptionObject, 4903 unicode, size, startpos, endpos, reason); 4904 if (*exceptionObject != NULL) 4905 PyCodec_StrictErrors(*exceptionObject); 4906} 4907 4908/* error handling callback helper: 4909 build arguments, call the callback and check the arguments, 4910 put the result into newpos and return the replacement string, which 4911 has to be freed by the caller */ 4912static PyObject *unicode_translate_call_errorhandler(const 
char *errors, 4913 PyObject **errorHandler, 4914 const char *reason, 4915 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4916 Py_ssize_t startpos, Py_ssize_t endpos, 4917 Py_ssize_t *newpos) 4918{ 4919 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 4920 4921 Py_ssize_t i_newpos; 4922 PyObject *restuple; 4923 PyObject *resunicode; 4924 4925 if (*errorHandler == NULL) { 4926 *errorHandler = PyCodec_LookupError(errors); 4927 if (*errorHandler == NULL) 4928 return NULL; 4929 } 4930 4931 make_translate_exception(exceptionObject, 4932 unicode, size, startpos, endpos, reason); 4933 if (*exceptionObject == NULL) 4934 return NULL; 4935 4936 restuple = PyObject_CallFunctionObjArgs( 4937 *errorHandler, *exceptionObject, NULL); 4938 if (restuple == NULL) 4939 return NULL; 4940 if (!PyTuple_Check(restuple)) { 4941 PyErr_Format(PyExc_TypeError, &argparse[4]); 4942 Py_DECREF(restuple); 4943 return NULL; 4944 } 4945 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4946 &resunicode, &i_newpos)) { 4947 Py_DECREF(restuple); 4948 return NULL; 4949 } 4950 if (i_newpos<0) 4951 *newpos = size+i_newpos; 4952 else 4953 *newpos = i_newpos; 4954 if (*newpos<0 || *newpos>size) { 4955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4956 Py_DECREF(restuple); 4957 return NULL; 4958 } 4959 Py_INCREF(resunicode); 4960 Py_DECREF(restuple); 4961 return resunicode; 4962} 4963 4964/* Lookup the character ch in the mapping and put the result in result, 4965 which must be decrefed by the caller. 
4966 Return 0 on success, -1 on error */ 4967static 4968int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4969{ 4970 PyObject *w = PyLong_FromLong((long)c); 4971 PyObject *x; 4972 4973 if (w == NULL) 4974 return -1; 4975 x = PyObject_GetItem(mapping, w); 4976 Py_DECREF(w); 4977 if (x == NULL) { 4978 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4979 /* No mapping found means: use 1:1 mapping. */ 4980 PyErr_Clear(); 4981 *result = NULL; 4982 return 0; 4983 } else 4984 return -1; 4985 } 4986 else if (x == Py_None) { 4987 *result = x; 4988 return 0; 4989 } 4990 else if (PyLong_Check(x)) { 4991 long value = PyLong_AS_LONG(x); 4992 long max = PyUnicode_GetMax(); 4993 if (value < 0 || value > max) { 4994 PyErr_Format(PyExc_TypeError, 4995 "character mapping must be in range(0x%x)", max+1); 4996 Py_DECREF(x); 4997 return -1; 4998 } 4999 *result = x; 5000 return 0; 5001 } 5002 else if (PyUnicode_Check(x)) { 5003 *result = x; 5004 return 0; 5005 } 5006 else { 5007 /* wrong return value */ 5008 PyErr_SetString(PyExc_TypeError, 5009 "character mapping must return integer, None or str"); 5010 Py_DECREF(x); 5011 return -1; 5012 } 5013} 5014/* ensure that *outobj is at least requiredsize characters long, 5015if not reallocate and adjust various state variables. 
5016Return 0 on success, -1 on error */ 5017static 5018int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 5019 Py_ssize_t requiredsize) 5020{ 5021 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 5022 if (requiredsize > oldsize) { 5023 /* remember old output position */ 5024 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 5025 /* exponentially overallocate to minimize reallocations */ 5026 if (requiredsize < 2 * oldsize) 5027 requiredsize = 2 * oldsize; 5028 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 5029 return -1; 5030 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 5031 } 5032 return 0; 5033} 5034/* lookup the character, put the result in the output string and adjust 5035 various state variables. Return a new reference to the object that 5036 was put in the output buffer in *result, or Py_None, if the mapping was 5037 undefined (in which case no character was written). 5038 The called must decref result. 5039 Return 0 on success, -1 on error. */ 5040static 5041int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5042 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5043 PyObject **res) 5044{ 5045 if (charmaptranslate_lookup(*curinp, mapping, res)) 5046 return -1; 5047 if (*res==NULL) { 5048 /* not found => default to 1:1 mapping */ 5049 *(*outp)++ = *curinp; 5050 } 5051 else if (*res==Py_None) 5052 ; 5053 else if (PyLong_Check(*res)) { 5054 /* no overflow check, because we know that the space is enough */ 5055 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 5056 } 5057 else if (PyUnicode_Check(*res)) { 5058 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5059 if (repsize==1) { 5060 /* no overflow check, because we know that the space is enough */ 5061 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5062 } 5063 else if (repsize!=0) { 5064 /* more than one character */ 5065 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5066 (insize - (curinp-startinp)) + 5067 
repsize - 1; 5068 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5069 return -1; 5070 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5071 *outp += repsize; 5072 } 5073 } 5074 else 5075 return -1; 5076 return 0; 5077} 5078 5079PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5080 Py_ssize_t size, 5081 PyObject *mapping, 5082 const char *errors) 5083{ 5084 /* output object */ 5085 PyObject *res = NULL; 5086 /* pointers to the beginning and end+1 of input */ 5087 const Py_UNICODE *startp = p; 5088 const Py_UNICODE *endp = p + size; 5089 /* pointer into the output */ 5090 Py_UNICODE *str; 5091 /* current output position */ 5092 Py_ssize_t respos = 0; 5093 char *reason = "character maps to <undefined>"; 5094 PyObject *errorHandler = NULL; 5095 PyObject *exc = NULL; 5096 /* the following variable is used for caching string comparisons 5097 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5098 * 3=ignore, 4=xmlcharrefreplace */ 5099 int known_errorHandler = -1; 5100 5101 if (mapping == NULL) { 5102 PyErr_BadArgument(); 5103 return NULL; 5104 } 5105 5106 /* allocate enough for a simple 1:1 translation without 5107 replacements, if we need more, we'll resize */ 5108 res = PyUnicode_FromUnicode(NULL, size); 5109 if (res == NULL) 5110 goto onError; 5111 if (size == 0) 5112 return res; 5113 str = PyUnicode_AS_UNICODE(res); 5114 5115 while (p<endp) { 5116 /* try to encode it */ 5117 PyObject *x = NULL; 5118 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5119 Py_XDECREF(x); 5120 goto onError; 5121 } 5122 Py_XDECREF(x); 5123 if (x!=Py_None) /* it worked => adjust input pointer */ 5124 ++p; 5125 else { /* untranslatable character */ 5126 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5127 Py_ssize_t repsize; 5128 Py_ssize_t newpos; 5129 Py_UNICODE *uni2; 5130 /* startpos for collecting untranslatable chars */ 5131 const Py_UNICODE *collstart = p; 5132 const Py_UNICODE *collend = p+1; 
5133 const Py_UNICODE *coll; 5134 5135 /* find all untranslatable characters */ 5136 while (collend < endp) { 5137 if (charmaptranslate_lookup(*collend, mapping, &x)) 5138 goto onError; 5139 Py_XDECREF(x); 5140 if (x!=Py_None) 5141 break; 5142 ++collend; 5143 } 5144 /* cache callback name lookup 5145 * (if not done yet, i.e. it's the first error) */ 5146 if (known_errorHandler==-1) { 5147 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5148 known_errorHandler = 1; 5149 else if (!strcmp(errors, "replace")) 5150 known_errorHandler = 2; 5151 else if (!strcmp(errors, "ignore")) 5152 known_errorHandler = 3; 5153 else if (!strcmp(errors, "xmlcharrefreplace")) 5154 known_errorHandler = 4; 5155 else 5156 known_errorHandler = 0; 5157 } 5158 switch (known_errorHandler) { 5159 case 1: /* strict */ 5160 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5161 goto onError; 5162 case 2: /* replace */ 5163 /* No need to check for space, this is a 1:1 replacement */ 5164 for (coll = collstart; coll<collend; ++coll) 5165 *str++ = '?'; 5166 /* fall through */ 5167 case 3: /* ignore */ 5168 p = collend; 5169 break; 5170 case 4: /* xmlcharrefreplace */ 5171 /* generate replacement (temporarily (mis)uses p) */ 5172 for (p = collstart; p < collend; ++p) { 5173 char buffer[2+29+1+1]; 5174 char *cp; 5175 sprintf(buffer, "&#%d;", (int)*p); 5176 if (charmaptranslate_makespace(&res, &str, 5177 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5178 goto onError; 5179 for (cp = buffer; *cp; ++cp) 5180 *str++ = *cp; 5181 } 5182 p = collend; 5183 break; 5184 default: 5185 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5186 reason, startp, size, &exc, 5187 collstart-startp, collend-startp, &newpos); 5188 if (repunicode == NULL) 5189 goto onError; 5190 /* generate replacement */ 5191 repsize = PyUnicode_GET_SIZE(repunicode); 5192 if (charmaptranslate_makespace(&res, &str, 5193 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5194 Py_DECREF(repunicode); 5195 goto onError; 5196 } 5197 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5198 *str++ = *uni2; 5199 p = startp + newpos; 5200 Py_DECREF(repunicode); 5201 } 5202 } 5203 } 5204 /* Resize if we allocated to much */ 5205 respos = str-PyUnicode_AS_UNICODE(res); 5206 if (respos<PyUnicode_GET_SIZE(res)) { 5207 if (_PyUnicode_Resize(&res, respos) < 0) 5208 goto onError; 5209 } 5210 Py_XDECREF(exc); 5211 Py_XDECREF(errorHandler); 5212 return res; 5213 5214 onError: 5215 Py_XDECREF(res); 5216 Py_XDECREF(exc); 5217 Py_XDECREF(errorHandler); 5218 return NULL; 5219} 5220 5221PyObject *PyUnicode_Translate(PyObject *str, 5222 PyObject *mapping, 5223 const char *errors) 5224{ 5225 PyObject *result; 5226 5227 str = PyUnicode_FromObject(str); 5228 if (str == NULL) 5229 goto onError; 5230 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5231 PyUnicode_GET_SIZE(str), 5232 mapping, 5233 errors); 5234 Py_DECREF(str); 5235 return result; 5236 5237 onError: 5238 Py_XDECREF(str); 5239 return NULL; 5240} 5241 5242/* --- Decimal Encoder ---------------------------------------------------- */ 5243 5244int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5245 Py_ssize_t length, 5246 char *output, 5247 const char *errors) 5248{ 5249 Py_UNICODE *p, *end; 5250 PyObject *errorHandler = NULL; 5251 PyObject *exc = NULL; 5252 const char *encoding = "decimal"; 5253 const char *reason = "invalid decimal Unicode string"; 5254 /* the following variable is used for caching string comparisons 5255 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5256 int known_errorHandler = -1; 5257 5258 if (output == NULL) { 5259 PyErr_BadArgument(); 5260 return -1; 5261 } 5262 5263 p = s; 5264 end = s + length; 5265 while (p < end) { 5266 register Py_UNICODE ch = *p; 5267 int decimal; 5268 PyObject *repunicode; 5269 Py_ssize_t repsize; 5270 Py_ssize_t newpos; 5271 
Py_UNICODE *uni2; 5272 Py_UNICODE *collstart; 5273 Py_UNICODE *collend; 5274 5275 if (Py_UNICODE_ISSPACE(ch)) { 5276 *output++ = ' '; 5277 ++p; 5278 continue; 5279 } 5280 decimal = Py_UNICODE_TODECIMAL(ch); 5281 if (decimal >= 0) { 5282 *output++ = '0' + decimal; 5283 ++p; 5284 continue; 5285 } 5286 if (0 < ch && ch < 256) { 5287 *output++ = (char)ch; 5288 ++p; 5289 continue; 5290 } 5291 /* All other characters are considered unencodable */ 5292 collstart = p; 5293 collend = p+1; 5294 while (collend < end) { 5295 if ((0 < *collend && *collend < 256) || 5296 !Py_UNICODE_ISSPACE(*collend) || 5297 Py_UNICODE_TODECIMAL(*collend)) 5298 break; 5299 } 5300 /* cache callback name lookup 5301 * (if not done yet, i.e. it's the first error) */ 5302 if (known_errorHandler==-1) { 5303 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5304 known_errorHandler = 1; 5305 else if (!strcmp(errors, "replace")) 5306 known_errorHandler = 2; 5307 else if (!strcmp(errors, "ignore")) 5308 known_errorHandler = 3; 5309 else if (!strcmp(errors, "xmlcharrefreplace")) 5310 known_errorHandler = 4; 5311 else 5312 known_errorHandler = 0; 5313 } 5314 switch (known_errorHandler) { 5315 case 1: /* strict */ 5316 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5317 goto onError; 5318 case 2: /* replace */ 5319 for (p = collstart; p < collend; ++p) 5320 *output++ = '?'; 5321 /* fall through */ 5322 case 3: /* ignore */ 5323 p = collend; 5324 break; 5325 case 4: /* xmlcharrefreplace */ 5326 /* generate replacement (temporarily (mis)uses p) */ 5327 for (p = collstart; p < collend; ++p) 5328 output += sprintf(output, "&#%d;", (int)*p); 5329 p = collend; 5330 break; 5331 default: 5332 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5333 encoding, reason, s, length, &exc, 5334 collstart-s, collend-s, &newpos); 5335 if (repunicode == NULL) 5336 goto onError; 5337 /* generate replacement */ 5338 repsize = PyUnicode_GET_SIZE(repunicode); 5339 for (uni2 = 
PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5340 Py_UNICODE ch = *uni2; 5341 if (Py_UNICODE_ISSPACE(ch)) 5342 *output++ = ' '; 5343 else { 5344 decimal = Py_UNICODE_TODECIMAL(ch); 5345 if (decimal >= 0) 5346 *output++ = '0' + decimal; 5347 else if (0 < ch && ch < 256) 5348 *output++ = (char)ch; 5349 else { 5350 Py_DECREF(repunicode); 5351 raise_encode_exception(&exc, encoding, 5352 s, length, collstart-s, collend-s, reason); 5353 goto onError; 5354 } 5355 } 5356 } 5357 p = s + newpos; 5358 Py_DECREF(repunicode); 5359 } 5360 } 5361 /* 0-terminate the output string */ 5362 *output++ = '\0'; 5363 Py_XDECREF(exc); 5364 Py_XDECREF(errorHandler); 5365 return 0; 5366 5367 onError: 5368 Py_XDECREF(exc); 5369 Py_XDECREF(errorHandler); 5370 return -1; 5371} 5372 5373/* --- Helpers ------------------------------------------------------------ */ 5374 5375#include "stringlib/unicodedefs.h" 5376#include "stringlib/fastsearch.h" 5377#include "stringlib/count.h" 5378/* Include _ParseTupleFinds from find.h */ 5379#define FROM_UNICODE 5380#include "stringlib/find.h" 5381#include "stringlib/partition.h" 5382 5383#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 5384#include "stringlib/localeutil.h" 5385 5386/* helper macro to fixup start/end slice values */ 5387#define FIX_START_END(obj) \ 5388 if (start < 0) \ 5389 start += (obj)->length; \ 5390 if (start < 0) \ 5391 start = 0; \ 5392 if (end > (obj)->length) \ 5393 end = (obj)->length; \ 5394 if (end < 0) \ 5395 end += (obj)->length; \ 5396 if (end < 0) \ 5397 end = 0; 5398 5399Py_ssize_t PyUnicode_Count(PyObject *str, 5400 PyObject *substr, 5401 Py_ssize_t start, 5402 Py_ssize_t end) 5403{ 5404 Py_ssize_t result; 5405 PyUnicodeObject* str_obj; 5406 PyUnicodeObject* sub_obj; 5407 5408 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5409 if (!str_obj) 5410 return -1; 5411 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5412 if (!sub_obj) { 5413 Py_DECREF(str_obj); 5414 return -1; 
5415 } 5416 5417 FIX_START_END(str_obj); 5418 5419 result = stringlib_count( 5420 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5421 ); 5422 5423 Py_DECREF(sub_obj); 5424 Py_DECREF(str_obj); 5425 5426 return result; 5427} 5428 5429Py_ssize_t PyUnicode_Find(PyObject *str, 5430 PyObject *sub, 5431 Py_ssize_t start, 5432 Py_ssize_t end, 5433 int direction) 5434{ 5435 Py_ssize_t result; 5436 5437 str = PyUnicode_FromObject(str); 5438 if (!str) 5439 return -2; 5440 sub = PyUnicode_FromObject(sub); 5441 if (!sub) { 5442 Py_DECREF(str); 5443 return -2; 5444 } 5445 5446 if (direction > 0) 5447 result = stringlib_find_slice( 5448 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5449 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5450 start, end 5451 ); 5452 else 5453 result = stringlib_rfind_slice( 5454 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5455 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5456 start, end 5457 ); 5458 5459 Py_DECREF(str); 5460 Py_DECREF(sub); 5461 5462 return result; 5463} 5464 5465static 5466int tailmatch(PyUnicodeObject *self, 5467 PyUnicodeObject *substring, 5468 Py_ssize_t start, 5469 Py_ssize_t end, 5470 int direction) 5471{ 5472 if (substring->length == 0) 5473 return 1; 5474 5475 FIX_START_END(self); 5476 5477 end -= substring->length; 5478 if (end < start) 5479 return 0; 5480 5481 if (direction > 0) { 5482 if (Py_UNICODE_MATCH(self, end, substring)) 5483 return 1; 5484 } else { 5485 if (Py_UNICODE_MATCH(self, start, substring)) 5486 return 1; 5487 } 5488 5489 return 0; 5490} 5491 5492Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5493 PyObject *substr, 5494 Py_ssize_t start, 5495 Py_ssize_t end, 5496 int direction) 5497{ 5498 Py_ssize_t result; 5499 5500 str = PyUnicode_FromObject(str); 5501 if (str == NULL) 5502 return -1; 5503 substr = PyUnicode_FromObject(substr); 5504 if (substr == NULL) { 5505 Py_DECREF(str); 5506 return -1; 5507 } 5508 5509 result = tailmatch((PyUnicodeObject *)str, 5510 
(PyUnicodeObject *)substr, 5511 start, end, direction); 5512 Py_DECREF(str); 5513 Py_DECREF(substr); 5514 return result; 5515} 5516 5517/* Apply fixfct filter to the Unicode object self and return a 5518 reference to the modified object */ 5519 5520static 5521PyObject *fixup(PyUnicodeObject *self, 5522 int (*fixfct)(PyUnicodeObject *s)) 5523{ 5524 5525 PyUnicodeObject *u; 5526 5527 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5528 if (u == NULL) 5529 return NULL; 5530 5531 Py_UNICODE_COPY(u->str, self->str, self->length); 5532 5533 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5534 /* fixfct should return TRUE if it modified the buffer. If 5535 FALSE, return a reference to the original buffer instead 5536 (to save space, not time) */ 5537 Py_INCREF(self); 5538 Py_DECREF(u); 5539 return (PyObject*) self; 5540 } 5541 return (PyObject*) u; 5542} 5543 5544static 5545int fixupper(PyUnicodeObject *self) 5546{ 5547 Py_ssize_t len = self->length; 5548 Py_UNICODE *s = self->str; 5549 int status = 0; 5550 5551 while (len-- > 0) { 5552 register Py_UNICODE ch; 5553 5554 ch = Py_UNICODE_TOUPPER(*s); 5555 if (ch != *s) { 5556 status = 1; 5557 *s = ch; 5558 } 5559 s++; 5560 } 5561 5562 return status; 5563} 5564 5565static 5566int fixlower(PyUnicodeObject *self) 5567{ 5568 Py_ssize_t len = self->length; 5569 Py_UNICODE *s = self->str; 5570 int status = 0; 5571 5572 while (len-- > 0) { 5573 register Py_UNICODE ch; 5574 5575 ch = Py_UNICODE_TOLOWER(*s); 5576 if (ch != *s) { 5577 status = 1; 5578 *s = ch; 5579 } 5580 s++; 5581 } 5582 5583 return status; 5584} 5585 5586static 5587int fixswapcase(PyUnicodeObject *self) 5588{ 5589 Py_ssize_t len = self->length; 5590 Py_UNICODE *s = self->str; 5591 int status = 0; 5592 5593 while (len-- > 0) { 5594 if (Py_UNICODE_ISUPPER(*s)) { 5595 *s = Py_UNICODE_TOLOWER(*s); 5596 status = 1; 5597 } else if (Py_UNICODE_ISLOWER(*s)) { 5598 *s = Py_UNICODE_TOUPPER(*s); 5599 status = 1; 5600 } 5601 s++; 5602 } 5603 5604 return 
status; 5605} 5606 5607static 5608int fixcapitalize(PyUnicodeObject *self) 5609{ 5610 Py_ssize_t len = self->length; 5611 Py_UNICODE *s = self->str; 5612 int status = 0; 5613 5614 if (len == 0) 5615 return 0; 5616 if (Py_UNICODE_ISLOWER(*s)) { 5617 *s = Py_UNICODE_TOUPPER(*s); 5618 status = 1; 5619 } 5620 s++; 5621 while (--len > 0) { 5622 if (Py_UNICODE_ISUPPER(*s)) { 5623 *s = Py_UNICODE_TOLOWER(*s); 5624 status = 1; 5625 } 5626 s++; 5627 } 5628 return status; 5629} 5630 5631static 5632int fixtitle(PyUnicodeObject *self) 5633{ 5634 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5635 register Py_UNICODE *e; 5636 int previous_is_cased; 5637 5638 /* Shortcut for single character strings */ 5639 if (PyUnicode_GET_SIZE(self) == 1) { 5640 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5641 if (*p != ch) { 5642 *p = ch; 5643 return 1; 5644 } 5645 else 5646 return 0; 5647 } 5648 5649 e = p + PyUnicode_GET_SIZE(self); 5650 previous_is_cased = 0; 5651 for (; p < e; p++) { 5652 register const Py_UNICODE ch = *p; 5653 5654 if (previous_is_cased) 5655 *p = Py_UNICODE_TOLOWER(ch); 5656 else 5657 *p = Py_UNICODE_TOTITLE(ch); 5658 5659 if (Py_UNICODE_ISLOWER(ch) || 5660 Py_UNICODE_ISUPPER(ch) || 5661 Py_UNICODE_ISTITLE(ch)) 5662 previous_is_cased = 1; 5663 else 5664 previous_is_cased = 0; 5665 } 5666 return 1; 5667} 5668 5669PyObject * 5670PyUnicode_Join(PyObject *separator, PyObject *seq) 5671{ 5672 const Py_UNICODE blank = ' '; 5673 const Py_UNICODE *sep = ␣ 5674 Py_ssize_t seplen = 1; 5675 PyUnicodeObject *res = NULL; /* the result */ 5676 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5677 PyObject *fseq; /* PySequence_Fast(seq) */ 5678 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5679 PyObject **items; 5680 PyObject *item; 5681 Py_ssize_t sz, i; 5682 5683 fseq = PySequence_Fast(seq, ""); 5684 if (fseq == NULL) { 5685 return NULL; 5686 } 5687 5688 /* NOTE: the following code can't call back into Python code, 5689 * so we are sure that 
fseq won't be mutated. 5690 */ 5691 5692 seqlen = PySequence_Fast_GET_SIZE(fseq); 5693 /* If empty sequence, return u"". */ 5694 if (seqlen == 0) { 5695 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5696 goto Done; 5697 } 5698 items = PySequence_Fast_ITEMS(fseq); 5699 /* If singleton sequence with an exact Unicode, return that. */ 5700 if (seqlen == 1) { 5701 item = items[0]; 5702 if (PyUnicode_CheckExact(item)) { 5703 Py_INCREF(item); 5704 res = (PyUnicodeObject *)item; 5705 goto Done; 5706 } 5707 } 5708 else { 5709 /* Set up sep and seplen */ 5710 if (separator == NULL) { 5711 sep = ␣ 5712 seplen = 1; 5713 } 5714 else { 5715 if (!PyUnicode_Check(separator)) { 5716 PyErr_Format(PyExc_TypeError, 5717 "separator: expected str instance," 5718 " %.80s found", 5719 Py_TYPE(separator)->tp_name); 5720 goto onError; 5721 } 5722 sep = PyUnicode_AS_UNICODE(separator); 5723 seplen = PyUnicode_GET_SIZE(separator); 5724 } 5725 } 5726 5727 /* There are at least two things to join, or else we have a subclass 5728 * of str in the sequence. 5729 * Do a pre-pass to figure out the total amount of space we'll 5730 * need (sz), and see whether all argument are strings. 5731 */ 5732 sz = 0; 5733 for (i = 0; i < seqlen; i++) { 5734 const Py_ssize_t old_sz = sz; 5735 item = items[i]; 5736 if (!PyUnicode_Check(item)) { 5737 PyErr_Format(PyExc_TypeError, 5738 "sequence item %zd: expected str instance," 5739 " %.80s found", 5740 i, Py_TYPE(item)->tp_name); 5741 goto onError; 5742 } 5743 sz += PyUnicode_GET_SIZE(item); 5744 if (i != 0) 5745 sz += seplen; 5746 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 5747 PyErr_SetString(PyExc_OverflowError, 5748 "join() result is too long for a Python string"); 5749 goto onError; 5750 } 5751 } 5752 5753 res = _PyUnicode_New(sz); 5754 if (res == NULL) 5755 goto onError; 5756 5757 /* Catenate everything. 
*/ 5758 res_p = PyUnicode_AS_UNICODE(res); 5759 for (i = 0; i < seqlen; ++i) { 5760 Py_ssize_t itemlen; 5761 item = items[i]; 5762 itemlen = PyUnicode_GET_SIZE(item); 5763 /* Copy item, and maybe the separator. */ 5764 if (i) { 5765 Py_UNICODE_COPY(res_p, sep, seplen); 5766 res_p += seplen; 5767 } 5768 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5769 res_p += itemlen; 5770 } 5771 5772 Done: 5773 Py_DECREF(fseq); 5774 return (PyObject *)res; 5775 5776 onError: 5777 Py_DECREF(fseq); 5778 Py_XDECREF(res); 5779 return NULL; 5780} 5781 5782static 5783PyUnicodeObject *pad(PyUnicodeObject *self, 5784 Py_ssize_t left, 5785 Py_ssize_t right, 5786 Py_UNICODE fill) 5787{ 5788 PyUnicodeObject *u; 5789 5790 if (left < 0) 5791 left = 0; 5792 if (right < 0) 5793 right = 0; 5794 5795 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5796 Py_INCREF(self); 5797 return self; 5798 } 5799 5800 if (left > PY_SSIZE_T_MAX - self->length || 5801 right > PY_SSIZE_T_MAX - (left + self->length)) { 5802 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 5803 return NULL; 5804 } 5805 u = _PyUnicode_New(left + self->length + right); 5806 if (u) { 5807 if (left) 5808 Py_UNICODE_FILL(u->str, fill, left); 5809 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5810 if (right) 5811 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5812 } 5813 5814 return u; 5815} 5816 5817#define SPLIT_APPEND(data, left, right) \ 5818 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5819 if (!str) \ 5820 goto onError; \ 5821 if (PyList_Append(list, str)) { \ 5822 Py_DECREF(str); \ 5823 goto onError; \ 5824 } \ 5825 else \ 5826 Py_DECREF(str); 5827 5828static 5829PyObject *split_whitespace(PyUnicodeObject *self, 5830 PyObject *list, 5831 Py_ssize_t maxcount) 5832{ 5833 register Py_ssize_t i; 5834 register Py_ssize_t j; 5835 Py_ssize_t len = self->length; 5836 PyObject *str; 5837 register const Py_UNICODE *buf = self->str; 5838 5839 for (i = 
j = 0; i < len; ) { 5840 /* find a token */ 5841 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5842 i++; 5843 j = i; 5844 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) 5845 i++; 5846 if (j < i) { 5847 if (maxcount-- <= 0) 5848 break; 5849 SPLIT_APPEND(buf, j, i); 5850 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5851 i++; 5852 j = i; 5853 } 5854 } 5855 if (j < len) { 5856 SPLIT_APPEND(buf, j, len); 5857 } 5858 return list; 5859 5860 onError: 5861 Py_DECREF(list); 5862 return NULL; 5863} 5864 5865PyObject *PyUnicode_Splitlines(PyObject *string, 5866 int keepends) 5867{ 5868 register Py_ssize_t i; 5869 register Py_ssize_t j; 5870 Py_ssize_t len; 5871 PyObject *list; 5872 PyObject *str; 5873 Py_UNICODE *data; 5874 5875 string = PyUnicode_FromObject(string); 5876 if (string == NULL) 5877 return NULL; 5878 data = PyUnicode_AS_UNICODE(string); 5879 len = PyUnicode_GET_SIZE(string); 5880 5881 list = PyList_New(0); 5882 if (!list) 5883 goto onError; 5884 5885 for (i = j = 0; i < len; ) { 5886 Py_ssize_t eol; 5887 5888 /* Find a line and append it */ 5889 while (i < len && !BLOOM_LINEBREAK(data[i])) 5890 i++; 5891 5892 /* Skip the line break reading CRLF as one line break */ 5893 eol = i; 5894 if (i < len) { 5895 if (data[i] == '\r' && i + 1 < len && 5896 data[i+1] == '\n') 5897 i += 2; 5898 else 5899 i++; 5900 if (keepends) 5901 eol = i; 5902 } 5903 SPLIT_APPEND(data, j, eol); 5904 j = i; 5905 } 5906 if (j < len) { 5907 SPLIT_APPEND(data, j, len); 5908 } 5909 5910 Py_DECREF(string); 5911 return list; 5912 5913 onError: 5914 Py_XDECREF(list); 5915 Py_DECREF(string); 5916 return NULL; 5917} 5918 5919static 5920PyObject *split_char(PyUnicodeObject *self, 5921 PyObject *list, 5922 Py_UNICODE ch, 5923 Py_ssize_t maxcount) 5924{ 5925 register Py_ssize_t i; 5926 register Py_ssize_t j; 5927 Py_ssize_t len = self->length; 5928 PyObject *str; 5929 register const Py_UNICODE *buf = self->str; 5930 5931 for (i = j = 0; i < len; ) { 5932 if (buf[i] == ch) { 5933 if (maxcount-- <= 0) 
5934 break; 5935 SPLIT_APPEND(buf, j, i); 5936 i = j = i + 1; 5937 } else 5938 i++; 5939 } 5940 if (j <= len) { 5941 SPLIT_APPEND(buf, j, len); 5942 } 5943 return list; 5944 5945 onError: 5946 Py_DECREF(list); 5947 return NULL; 5948} 5949 5950static 5951PyObject *split_substring(PyUnicodeObject *self, 5952 PyObject *list, 5953 PyUnicodeObject *substring, 5954 Py_ssize_t maxcount) 5955{ 5956 register Py_ssize_t i; 5957 register Py_ssize_t j; 5958 Py_ssize_t len = self->length; 5959 Py_ssize_t sublen = substring->length; 5960 PyObject *str; 5961 5962 for (i = j = 0; i <= len - sublen; ) { 5963 if (Py_UNICODE_MATCH(self, i, substring)) { 5964 if (maxcount-- <= 0) 5965 break; 5966 SPLIT_APPEND(self->str, j, i); 5967 i = j = i + sublen; 5968 } else 5969 i++; 5970 } 5971 if (j <= len) { 5972 SPLIT_APPEND(self->str, j, len); 5973 } 5974 return list; 5975 5976 onError: 5977 Py_DECREF(list); 5978 return NULL; 5979} 5980 5981static 5982PyObject *rsplit_whitespace(PyUnicodeObject *self, 5983 PyObject *list, 5984 Py_ssize_t maxcount) 5985{ 5986 register Py_ssize_t i; 5987 register Py_ssize_t j; 5988 Py_ssize_t len = self->length; 5989 PyObject *str; 5990 register const Py_UNICODE *buf = self->str; 5991 5992 for (i = j = len - 1; i >= 0; ) { 5993 /* find a token */ 5994 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5995 i--; 5996 j = i; 5997 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) 5998 i--; 5999 if (j > i) { 6000 if (maxcount-- <= 0) 6001 break; 6002 SPLIT_APPEND(buf, i + 1, j + 1); 6003 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 6004 i--; 6005 j = i; 6006 } 6007 } 6008 if (j >= 0) { 6009 SPLIT_APPEND(buf, 0, j + 1); 6010 } 6011 if (PyList_Reverse(list) < 0) 6012 goto onError; 6013 return list; 6014 6015 onError: 6016 Py_DECREF(list); 6017 return NULL; 6018} 6019 6020static 6021PyObject *rsplit_char(PyUnicodeObject *self, 6022 PyObject *list, 6023 Py_UNICODE ch, 6024 Py_ssize_t maxcount) 6025{ 6026 register Py_ssize_t i; 6027 register Py_ssize_t j; 6028 Py_ssize_t len = 
self->length; 6029 PyObject *str; 6030 register const Py_UNICODE *buf = self->str; 6031 6032 for (i = j = len - 1; i >= 0; ) { 6033 if (buf[i] == ch) { 6034 if (maxcount-- <= 0) 6035 break; 6036 SPLIT_APPEND(buf, i + 1, j + 1); 6037 j = i = i - 1; 6038 } else 6039 i--; 6040 } 6041 if (j >= -1) { 6042 SPLIT_APPEND(buf, 0, j + 1); 6043 } 6044 if (PyList_Reverse(list) < 0) 6045 goto onError; 6046 return list; 6047 6048 onError: 6049 Py_DECREF(list); 6050 return NULL; 6051} 6052 6053static 6054PyObject *rsplit_substring(PyUnicodeObject *self, 6055 PyObject *list, 6056 PyUnicodeObject *substring, 6057 Py_ssize_t maxcount) 6058{ 6059 register Py_ssize_t i; 6060 register Py_ssize_t j; 6061 Py_ssize_t len = self->length; 6062 Py_ssize_t sublen = substring->length; 6063 PyObject *str; 6064 6065 for (i = len - sublen, j = len; i >= 0; ) { 6066 if (Py_UNICODE_MATCH(self, i, substring)) { 6067 if (maxcount-- <= 0) 6068 break; 6069 SPLIT_APPEND(self->str, i + sublen, j); 6070 j = i; 6071 i -= sublen; 6072 } else 6073 i--; 6074 } 6075 if (j >= 0) { 6076 SPLIT_APPEND(self->str, 0, j); 6077 } 6078 if (PyList_Reverse(list) < 0) 6079 goto onError; 6080 return list; 6081 6082 onError: 6083 Py_DECREF(list); 6084 return NULL; 6085} 6086 6087#undef SPLIT_APPEND 6088 6089static 6090PyObject *split(PyUnicodeObject *self, 6091 PyUnicodeObject *substring, 6092 Py_ssize_t maxcount) 6093{ 6094 PyObject *list; 6095 6096 if (maxcount < 0) 6097 maxcount = PY_SSIZE_T_MAX; 6098 6099 list = PyList_New(0); 6100 if (!list) 6101 return NULL; 6102 6103 if (substring == NULL) 6104 return split_whitespace(self,list,maxcount); 6105 6106 else if (substring->length == 1) 6107 return split_char(self,list,substring->str[0],maxcount); 6108 6109 else if (substring->length == 0) { 6110 Py_DECREF(list); 6111 PyErr_SetString(PyExc_ValueError, "empty separator"); 6112 return NULL; 6113 } 6114 else 6115 return split_substring(self,list,substring,maxcount); 6116} 6117 6118static 6119PyObject *rsplit(PyUnicodeObject 
*self, 6120 PyUnicodeObject *substring, 6121 Py_ssize_t maxcount) 6122{ 6123 PyObject *list; 6124 6125 if (maxcount < 0) 6126 maxcount = PY_SSIZE_T_MAX; 6127 6128 list = PyList_New(0); 6129 if (!list) 6130 return NULL; 6131 6132 if (substring == NULL) 6133 return rsplit_whitespace(self,list,maxcount); 6134 6135 else if (substring->length == 1) 6136 return rsplit_char(self,list,substring->str[0],maxcount); 6137 6138 else if (substring->length == 0) { 6139 Py_DECREF(list); 6140 PyErr_SetString(PyExc_ValueError, "empty separator"); 6141 return NULL; 6142 } 6143 else 6144 return rsplit_substring(self,list,substring,maxcount); 6145} 6146 6147static 6148PyObject *replace(PyUnicodeObject *self, 6149 PyUnicodeObject *str1, 6150 PyUnicodeObject *str2, 6151 Py_ssize_t maxcount) 6152{ 6153 PyUnicodeObject *u; 6154 6155 if (maxcount < 0) 6156 maxcount = PY_SSIZE_T_MAX; 6157 6158 if (str1->length == str2->length) { 6159 /* same length */ 6160 Py_ssize_t i; 6161 if (str1->length == 1) { 6162 /* replace characters */ 6163 Py_UNICODE u1, u2; 6164 if (!findchar(self->str, self->length, str1->str[0])) 6165 goto nothing; 6166 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6167 if (!u) 6168 return NULL; 6169 Py_UNICODE_COPY(u->str, self->str, self->length); 6170 u1 = str1->str[0]; 6171 u2 = str2->str[0]; 6172 for (i = 0; i < u->length; i++) 6173 if (u->str[i] == u1) { 6174 if (--maxcount < 0) 6175 break; 6176 u->str[i] = u2; 6177 } 6178 } else { 6179 i = fastsearch( 6180 self->str, self->length, str1->str, str1->length, FAST_SEARCH 6181 ); 6182 if (i < 0) 6183 goto nothing; 6184 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6185 if (!u) 6186 return NULL; 6187 Py_UNICODE_COPY(u->str, self->str, self->length); 6188 while (i <= self->length - str1->length) 6189 if (Py_UNICODE_MATCH(self, i, str1)) { 6190 if (--maxcount < 0) 6191 break; 6192 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6193 i += str1->length; 6194 } else 6195 i++; 6196 } 6197 
} else { 6198 6199 Py_ssize_t n, i, j, e; 6200 Py_ssize_t product, new_size, delta; 6201 Py_UNICODE *p; 6202 6203 /* replace strings */ 6204 n = stringlib_count(self->str, self->length, str1->str, str1->length); 6205 if (n > maxcount) 6206 n = maxcount; 6207 if (n == 0) 6208 goto nothing; 6209 /* new_size = self->length + n * (str2->length - str1->length)); */ 6210 delta = (str2->length - str1->length); 6211 if (delta == 0) { 6212 new_size = self->length; 6213 } else { 6214 product = n * (str2->length - str1->length); 6215 if ((product / (str2->length - str1->length)) != n) { 6216 PyErr_SetString(PyExc_OverflowError, 6217 "replace string is too long"); 6218 return NULL; 6219 } 6220 new_size = self->length + product; 6221 if (new_size < 0) { 6222 PyErr_SetString(PyExc_OverflowError, 6223 "replace string is too long"); 6224 return NULL; 6225 } 6226 } 6227 u = _PyUnicode_New(new_size); 6228 if (!u) 6229 return NULL; 6230 i = 0; 6231 p = u->str; 6232 e = self->length - str1->length; 6233 if (str1->length > 0) { 6234 while (n-- > 0) { 6235 /* look for next match */ 6236 j = i; 6237 while (j <= e) { 6238 if (Py_UNICODE_MATCH(self, j, str1)) 6239 break; 6240 j++; 6241 } 6242 if (j > i) { 6243 if (j > e) 6244 break; 6245 /* copy unchanged part [i:j] */ 6246 Py_UNICODE_COPY(p, self->str+i, j-i); 6247 p += j - i; 6248 } 6249 /* copy substitution string */ 6250 if (str2->length > 0) { 6251 Py_UNICODE_COPY(p, str2->str, str2->length); 6252 p += str2->length; 6253 } 6254 i = j + str1->length; 6255 } 6256 if (i < self->length) 6257 /* copy tail [i:] */ 6258 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6259 } else { 6260 /* interleave */ 6261 while (n > 0) { 6262 Py_UNICODE_COPY(p, str2->str, str2->length); 6263 p += str2->length; 6264 if (--n <= 0) 6265 break; 6266 *p++ = self->str[i++]; 6267 } 6268 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6269 } 6270 } 6271 return (PyObject *) u; 6272 6273nothing: 6274 /* nothing to replace; return original string (when possible) */ 
6275 if (PyUnicode_CheckExact(self)) { 6276 Py_INCREF(self); 6277 return (PyObject *) self; 6278 } 6279 return PyUnicode_FromUnicode(self->str, self->length); 6280} 6281 6282/* --- Unicode Object Methods --------------------------------------------- */ 6283 6284PyDoc_STRVAR(title__doc__, 6285"S.title() -> str\n\ 6286\n\ 6287Return a titlecased version of S, i.e. words start with title case\n\ 6288characters, all remaining cased characters have lower case."); 6289 6290static PyObject* 6291unicode_title(PyUnicodeObject *self) 6292{ 6293 return fixup(self, fixtitle); 6294} 6295 6296PyDoc_STRVAR(capitalize__doc__, 6297"S.capitalize() -> str\n\ 6298\n\ 6299Return a capitalized version of S, i.e. make the first character\n\ 6300have upper case."); 6301 6302static PyObject* 6303unicode_capitalize(PyUnicodeObject *self) 6304{ 6305 return fixup(self, fixcapitalize); 6306} 6307 6308#if 0 6309PyDoc_STRVAR(capwords__doc__, 6310"S.capwords() -> str\n\ 6311\n\ 6312Apply .capitalize() to all words in S and return the result with\n\ 6313normalized whitespace (all whitespace strings are replaced by ' ')."); 6314 6315static PyObject* 6316unicode_capwords(PyUnicodeObject *self) 6317{ 6318 PyObject *list; 6319 PyObject *item; 6320 Py_ssize_t i; 6321 6322 /* Split into words */ 6323 list = split(self, NULL, -1); 6324 if (!list) 6325 return NULL; 6326 6327 /* Capitalize each word */ 6328 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6329 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6330 fixcapitalize); 6331 if (item == NULL) 6332 goto onError; 6333 Py_DECREF(PyList_GET_ITEM(list, i)); 6334 PyList_SET_ITEM(list, i, item); 6335 } 6336 6337 /* Join the words to form a new string */ 6338 item = PyUnicode_Join(NULL, list); 6339 6340onError: 6341 Py_DECREF(list); 6342 return (PyObject *)item; 6343} 6344#endif 6345 6346/* Argument converter. 
Coerces to a single unicode character */ 6347 6348static int 6349convert_uc(PyObject *obj, void *addr) 6350{ 6351 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6352 PyObject *uniobj; 6353 Py_UNICODE *unistr; 6354 6355 uniobj = PyUnicode_FromObject(obj); 6356 if (uniobj == NULL) { 6357 PyErr_SetString(PyExc_TypeError, 6358 "The fill character cannot be converted to Unicode"); 6359 return 0; 6360 } 6361 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6362 PyErr_SetString(PyExc_TypeError, 6363 "The fill character must be exactly one character long"); 6364 Py_DECREF(uniobj); 6365 return 0; 6366 } 6367 unistr = PyUnicode_AS_UNICODE(uniobj); 6368 *fillcharloc = unistr[0]; 6369 Py_DECREF(uniobj); 6370 return 1; 6371} 6372 6373PyDoc_STRVAR(center__doc__, 6374"S.center(width[, fillchar]) -> str\n\ 6375\n\ 6376Return S centered in a string of length width. Padding is\n\ 6377done using the specified fill character (default is a space)"); 6378 6379static PyObject * 6380unicode_center(PyUnicodeObject *self, PyObject *args) 6381{ 6382 Py_ssize_t marg, left; 6383 Py_ssize_t width; 6384 Py_UNICODE fillchar = ' '; 6385 6386 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6387 return NULL; 6388 6389 if (self->length >= width && PyUnicode_CheckExact(self)) { 6390 Py_INCREF(self); 6391 return (PyObject*) self; 6392 } 6393 6394 marg = width - self->length; 6395 left = marg / 2 + (marg & width & 1); 6396 6397 return (PyObject*) pad(self, left, marg - left, fillchar); 6398} 6399 6400#if 0 6401 6402/* This code should go into some future Unicode collation support 6403 module. The basic comparison should compare ordinals on a naive 6404 basis (this is what Java does and thus JPython too). 
*/ 6405 6406/* speedy UTF-16 code point order comparison */ 6407/* gleaned from: */ 6408/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6409 6410static short utf16Fixup[32] = 6411{ 6412 0, 0, 0, 0, 0, 0, 0, 0, 6413 0, 0, 0, 0, 0, 0, 0, 0, 6414 0, 0, 0, 0, 0, 0, 0, 0, 6415 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6416}; 6417 6418static int 6419unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6420{ 6421 Py_ssize_t len1, len2; 6422 6423 Py_UNICODE *s1 = str1->str; 6424 Py_UNICODE *s2 = str2->str; 6425 6426 len1 = str1->length; 6427 len2 = str2->length; 6428 6429 while (len1 > 0 && len2 > 0) { 6430 Py_UNICODE c1, c2; 6431 6432 c1 = *s1++; 6433 c2 = *s2++; 6434 6435 if (c1 > (1<<11) * 26) 6436 c1 += utf16Fixup[c1>>11]; 6437 if (c2 > (1<<11) * 26) 6438 c2 += utf16Fixup[c2>>11]; 6439 /* now c1 and c2 are in UTF-32-compatible order */ 6440 6441 if (c1 != c2) 6442 return (c1 < c2) ? -1 : 1; 6443 6444 len1--; len2--; 6445 } 6446 6447 return (len1 < len2) ? -1 : (len1 != len2); 6448} 6449 6450#else 6451 6452static int 6453unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6454{ 6455 register Py_ssize_t len1, len2; 6456 6457 Py_UNICODE *s1 = str1->str; 6458 Py_UNICODE *s2 = str2->str; 6459 6460 len1 = str1->length; 6461 len2 = str2->length; 6462 6463 while (len1 > 0 && len2 > 0) { 6464 Py_UNICODE c1, c2; 6465 6466 c1 = *s1++; 6467 c2 = *s2++; 6468 6469 if (c1 != c2) 6470 return (c1 < c2) ? -1 : 1; 6471 6472 len1--; len2--; 6473 } 6474 6475 return (len1 < len2) ? 
-1 : (len1 != len2); 6476} 6477 6478#endif 6479 6480int PyUnicode_Compare(PyObject *left, 6481 PyObject *right) 6482{ 6483 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6484 return unicode_compare((PyUnicodeObject *)left, 6485 (PyUnicodeObject *)right); 6486 PyErr_Format(PyExc_TypeError, 6487 "Can't compare %.100s and %.100s", 6488 left->ob_type->tp_name, 6489 right->ob_type->tp_name); 6490 return -1; 6491} 6492 6493int 6494PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6495{ 6496 int i; 6497 Py_UNICODE *id; 6498 assert(PyUnicode_Check(uni)); 6499 id = PyUnicode_AS_UNICODE(uni); 6500 /* Compare Unicode string and source character set string */ 6501 for (i = 0; id[i] && str[i]; i++) 6502 if (id[i] != str[i]) 6503 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6504 if (id[i]) 6505 return 1; /* uni is longer */ 6506 if (str[i]) 6507 return -1; /* str is longer */ 6508 return 0; 6509} 6510 6511PyObject *PyUnicode_RichCompare(PyObject *left, 6512 PyObject *right, 6513 int op) 6514{ 6515 int result; 6516 6517 result = PyUnicode_Compare(left, right); 6518 if (result == -1 && PyErr_Occurred()) 6519 goto onError; 6520 6521 /* Convert the return value to a Boolean */ 6522 switch (op) { 6523 case Py_EQ: 6524 result = (result == 0); 6525 break; 6526 case Py_NE: 6527 result = (result != 0); 6528 break; 6529 case Py_LE: 6530 result = (result <= 0); 6531 break; 6532 case Py_GE: 6533 result = (result >= 0); 6534 break; 6535 case Py_LT: 6536 result = (result == -1); 6537 break; 6538 case Py_GT: 6539 result = (result == 1); 6540 break; 6541 } 6542 return PyBool_FromLong(result); 6543 6544 onError: 6545 6546 /* Standard case 6547 6548 Type errors mean that PyUnicode_FromObject() could not convert 6549 one of the arguments (usually the right hand side) to Unicode, 6550 ie. we can't handle the comparison request. 
However, it is 6551 possible that the other object knows a comparison method, which 6552 is why we return Py_NotImplemented to give the other object a 6553 chance. 6554 6555 */ 6556 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6557 PyErr_Clear(); 6558 Py_INCREF(Py_NotImplemented); 6559 return Py_NotImplemented; 6560 } 6561 if (op != Py_EQ && op != Py_NE) 6562 return NULL; 6563 6564 /* Equality comparison. 6565 6566 This is a special case: we silence any PyExc_UnicodeDecodeError 6567 and instead turn it into a PyErr_UnicodeWarning. 6568 6569 */ 6570 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6571 return NULL; 6572 PyErr_Clear(); 6573 if (PyErr_WarnEx(PyExc_UnicodeWarning, 6574 (op == Py_EQ) ? 6575 "equal comparison " 6576 "failed to convert both arguments to str - " 6577 "interpreting them as being unequal" 6578 : 6579 "Unicode unequal comparison " 6580 "failed to convert both arguments to str - " 6581 "interpreting them as being unequal", 6582 1) < 0) 6583 return NULL; 6584 result = (op == Py_NE); 6585 return PyBool_FromLong(result); 6586} 6587 6588int PyUnicode_Contains(PyObject *container, 6589 PyObject *element) 6590{ 6591 PyObject *str, *sub; 6592 int result; 6593 6594 /* Coerce the two arguments */ 6595 sub = PyUnicode_FromObject(element); 6596 if (!sub) { 6597 PyErr_Format(PyExc_TypeError, 6598 "'in <string>' requires string as left operand, not %s", 6599 element->ob_type->tp_name); 6600 return -1; 6601 } 6602 6603 str = PyUnicode_FromObject(container); 6604 if (!str) { 6605 Py_DECREF(sub); 6606 return -1; 6607 } 6608 6609 result = stringlib_contains_obj(str, sub); 6610 6611 Py_DECREF(str); 6612 Py_DECREF(sub); 6613 6614 return result; 6615} 6616 6617/* Concat to string or Unicode object giving a new Unicode object. 
*/ 6618 6619PyObject *PyUnicode_Concat(PyObject *left, 6620 PyObject *right) 6621{ 6622 PyUnicodeObject *u = NULL, *v = NULL, *w; 6623 6624 /* Coerce the two arguments */ 6625 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6626 if (u == NULL) 6627 goto onError; 6628 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6629 if (v == NULL) 6630 goto onError; 6631 6632 /* Shortcuts */ 6633 if (v == unicode_empty) { 6634 Py_DECREF(v); 6635 return (PyObject *)u; 6636 } 6637 if (u == unicode_empty) { 6638 Py_DECREF(u); 6639 return (PyObject *)v; 6640 } 6641 6642 /* Concat the two Unicode strings */ 6643 w = _PyUnicode_New(u->length + v->length); 6644 if (w == NULL) 6645 goto onError; 6646 Py_UNICODE_COPY(w->str, u->str, u->length); 6647 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6648 6649 Py_DECREF(u); 6650 Py_DECREF(v); 6651 return (PyObject *)w; 6652 6653onError: 6654 Py_XDECREF(u); 6655 Py_XDECREF(v); 6656 return NULL; 6657} 6658 6659void 6660PyUnicode_Append(PyObject **pleft, PyObject *right) 6661{ 6662 PyObject *new; 6663 if (*pleft == NULL) 6664 return; 6665 if (right == NULL || !PyUnicode_Check(*pleft)) { 6666 Py_DECREF(*pleft); 6667 *pleft = NULL; 6668 return; 6669 } 6670 new = PyUnicode_Concat(*pleft, right); 6671 Py_DECREF(*pleft); 6672 *pleft = new; 6673} 6674 6675void 6676PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6677{ 6678 PyUnicode_Append(pleft, right); 6679 Py_XDECREF(right); 6680} 6681 6682PyDoc_STRVAR(count__doc__, 6683"S.count(sub[, start[, end]]) -> int\n\ 6684\n\ 6685Return the number of non-overlapping occurrences of substring sub in\n\ 6686string S[start:end]. 
Optional arguments start and end are\n\ 6687interpreted as in slice notation."); 6688 6689static PyObject * 6690unicode_count(PyUnicodeObject *self, PyObject *args) 6691{ 6692 PyUnicodeObject *substring; 6693 Py_ssize_t start = 0; 6694 Py_ssize_t end = PY_SSIZE_T_MAX; 6695 PyObject *result; 6696 6697 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6698 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6699 return NULL; 6700 6701 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6702 (PyObject *)substring); 6703 if (substring == NULL) 6704 return NULL; 6705 6706 FIX_START_END(self); 6707 6708 result = PyLong_FromSsize_t( 6709 stringlib_count(self->str + start, end - start, 6710 substring->str, substring->length) 6711 ); 6712 6713 Py_DECREF(substring); 6714 6715 return result; 6716} 6717 6718PyDoc_STRVAR(encode__doc__, 6719"S.encode([encoding[, errors]]) -> bytes\n\ 6720\n\ 6721Encode S using the codec registered for encoding. encoding defaults\n\ 6722to the default encoding. errors may be given to set a different error\n\ 6723handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6724a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 6725'xmlcharrefreplace' as well as any other name registered with\n\ 6726codecs.register_error that can handle UnicodeEncodeErrors."); 6727 6728static PyObject * 6729unicode_encode(PyUnicodeObject *self, PyObject *args) 6730{ 6731 char *encoding = NULL; 6732 char *errors = NULL; 6733 PyObject *v; 6734 6735 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6736 return NULL; 6737 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 6738 if (v == NULL) 6739 goto onError; 6740 if (!PyBytes_Check(v)) { 6741 PyErr_Format(PyExc_TypeError, 6742 "encoder did not return a bytes object " 6743 "(type=%.400s)", 6744 Py_TYPE(v)->tp_name); 6745 Py_DECREF(v); 6746 return NULL; 6747 } 6748 return v; 6749 6750 onError: 6751 return NULL; 6752} 6753 6754PyDoc_STRVAR(expandtabs__doc__, 6755"S.expandtabs([tabsize]) -> str\n\ 6756\n\ 6757Return a copy of S where all tab characters are expanded using spaces.\n\ 6758If tabsize is not given, a tab size of 8 characters is assumed."); 6759 6760static PyObject* 6761unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6762{ 6763 Py_UNICODE *e; 6764 Py_UNICODE *p; 6765 Py_UNICODE *q; 6766 Py_UNICODE *qe; 6767 Py_ssize_t i, j, incr; 6768 PyUnicodeObject *u; 6769 int tabsize = 8; 6770 6771 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6772 return NULL; 6773 6774 /* First pass: determine size of output string */ 6775 i = 0; /* chars up to and including most recent \n or \r */ 6776 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6777 e = self->str + self->length; /* end of input */ 6778 for (p = self->str; p < e; p++) 6779 if (*p == '\t') { 6780 if (tabsize > 0) { 6781 incr = tabsize - (j % tabsize); /* cannot overflow */ 6782 if (j > PY_SSIZE_T_MAX - incr) 6783 goto overflow1; 6784 j += incr; 6785 } 6786 } 6787 else { 6788 if (j > PY_SSIZE_T_MAX - 1) 6789 goto overflow1; 6790 j++; 6791 if (*p == '\n' || *p == '\r') { 6792 if (i > 
PY_SSIZE_T_MAX - j) 6793 goto overflow1; 6794 i += j; 6795 j = 0; 6796 } 6797 } 6798 6799 if (i > PY_SSIZE_T_MAX - j) 6800 goto overflow1; 6801 6802 /* Second pass: create output string and fill it */ 6803 u = _PyUnicode_New(i + j); 6804 if (!u) 6805 return NULL; 6806 6807 j = 0; /* same as in first pass */ 6808 q = u->str; /* next output char */ 6809 qe = u->str + u->length; /* end of output */ 6810 6811 for (p = self->str; p < e; p++) 6812 if (*p == '\t') { 6813 if (tabsize > 0) { 6814 i = tabsize - (j % tabsize); 6815 j += i; 6816 while (i--) { 6817 if (q >= qe) 6818 goto overflow2; 6819 *q++ = ' '; 6820 } 6821 } 6822 } 6823 else { 6824 if (q >= qe) 6825 goto overflow2; 6826 *q++ = *p; 6827 j++; 6828 if (*p == '\n' || *p == '\r') 6829 j = 0; 6830 } 6831 6832 return (PyObject*) u; 6833 6834 overflow2: 6835 Py_DECREF(u); 6836 overflow1: 6837 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6838 return NULL; 6839} 6840 6841PyDoc_STRVAR(find__doc__, 6842"S.find(sub[, start[, end]]) -> int\n\ 6843\n\ 6844Return the lowest index in S where substring sub is found,\n\ 6845such that sub is contained within s[start:end]. 
Optional\n\ 6846arguments start and end are interpreted as in slice notation.\n\ 6847\n\ 6848Return -1 on failure."); 6849 6850static PyObject * 6851unicode_find(PyUnicodeObject *self, PyObject *args) 6852{ 6853 PyObject *substring; 6854 Py_ssize_t start; 6855 Py_ssize_t end; 6856 Py_ssize_t result; 6857 6858 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6859 return NULL; 6860 6861 result = stringlib_find_slice( 6862 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6863 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6864 start, end 6865 ); 6866 6867 Py_DECREF(substring); 6868 6869 return PyLong_FromSsize_t(result); 6870} 6871 6872static PyObject * 6873unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6874{ 6875 if (index < 0 || index >= self->length) { 6876 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6877 return NULL; 6878 } 6879 6880 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6881} 6882 6883/* Believe it or not, this produces the same value for ASCII strings 6884 as string_hash(). 
*/ 6885static long 6886unicode_hash(PyUnicodeObject *self) 6887{ 6888 Py_ssize_t len; 6889 Py_UNICODE *p; 6890 long x; 6891 6892 if (self->hash != -1) 6893 return self->hash; 6894 len = Py_SIZE(self); 6895 p = self->str; 6896 x = *p << 7; 6897 while (--len >= 0) 6898 x = (1000003*x) ^ *p++; 6899 x ^= Py_SIZE(self); 6900 if (x == -1) 6901 x = -2; 6902 self->hash = x; 6903 return x; 6904} 6905 6906PyDoc_STRVAR(index__doc__, 6907"S.index(sub[, start[, end]]) -> int\n\ 6908\n\ 6909Like S.find() but raise ValueError when the substring is not found."); 6910 6911static PyObject * 6912unicode_index(PyUnicodeObject *self, PyObject *args) 6913{ 6914 Py_ssize_t result; 6915 PyObject *substring; 6916 Py_ssize_t start; 6917 Py_ssize_t end; 6918 6919 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6920 return NULL; 6921 6922 result = stringlib_find_slice( 6923 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6924 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6925 start, end 6926 ); 6927 6928 Py_DECREF(substring); 6929 6930 if (result < 0) { 6931 PyErr_SetString(PyExc_ValueError, "substring not found"); 6932 return NULL; 6933 } 6934 6935 return PyLong_FromSsize_t(result); 6936} 6937 6938PyDoc_STRVAR(islower__doc__, 6939"S.islower() -> bool\n\ 6940\n\ 6941Return True if all cased characters in S are lowercase and there is\n\ 6942at least one cased character in S, False otherwise."); 6943 6944static PyObject* 6945unicode_islower(PyUnicodeObject *self) 6946{ 6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6948 register const Py_UNICODE *e; 6949 int cased; 6950 6951 /* Shortcut for single character strings */ 6952 if (PyUnicode_GET_SIZE(self) == 1) 6953 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6954 6955 /* Special case for empty strings */ 6956 if (PyUnicode_GET_SIZE(self) == 0) 6957 return PyBool_FromLong(0); 6958 6959 e = p + PyUnicode_GET_SIZE(self); 6960 cased = 0; 6961 for (; p < e; p++) { 6962 register const Py_UNICODE ch = 
*p; 6963 6964 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6965 return PyBool_FromLong(0); 6966 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6967 cased = 1; 6968 } 6969 return PyBool_FromLong(cased); 6970} 6971 6972PyDoc_STRVAR(isupper__doc__, 6973"S.isupper() -> bool\n\ 6974\n\ 6975Return True if all cased characters in S are uppercase and there is\n\ 6976at least one cased character in S, False otherwise."); 6977 6978static PyObject* 6979unicode_isupper(PyUnicodeObject *self) 6980{ 6981 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6982 register const Py_UNICODE *e; 6983 int cased; 6984 6985 /* Shortcut for single character strings */ 6986 if (PyUnicode_GET_SIZE(self) == 1) 6987 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6988 6989 /* Special case for empty strings */ 6990 if (PyUnicode_GET_SIZE(self) == 0) 6991 return PyBool_FromLong(0); 6992 6993 e = p + PyUnicode_GET_SIZE(self); 6994 cased = 0; 6995 for (; p < e; p++) { 6996 register const Py_UNICODE ch = *p; 6997 6998 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6999 return PyBool_FromLong(0); 7000 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7001 cased = 1; 7002 } 7003 return PyBool_FromLong(cased); 7004} 7005 7006PyDoc_STRVAR(istitle__doc__, 7007"S.istitle() -> bool\n\ 7008\n\ 7009Return True if S is a titlecased string and there is at least one\n\ 7010character in S, i.e. 
upper- and titlecase characters may only\n\ 7011follow uncased characters and lowercase characters only cased ones.\n\ 7012Return False otherwise."); 7013 7014static PyObject* 7015unicode_istitle(PyUnicodeObject *self) 7016{ 7017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7018 register const Py_UNICODE *e; 7019 int cased, previous_is_cased; 7020 7021 /* Shortcut for single character strings */ 7022 if (PyUnicode_GET_SIZE(self) == 1) 7023 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7024 (Py_UNICODE_ISUPPER(*p) != 0)); 7025 7026 /* Special case for empty strings */ 7027 if (PyUnicode_GET_SIZE(self) == 0) 7028 return PyBool_FromLong(0); 7029 7030 e = p + PyUnicode_GET_SIZE(self); 7031 cased = 0; 7032 previous_is_cased = 0; 7033 for (; p < e; p++) { 7034 register const Py_UNICODE ch = *p; 7035 7036 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7037 if (previous_is_cased) 7038 return PyBool_FromLong(0); 7039 previous_is_cased = 1; 7040 cased = 1; 7041 } 7042 else if (Py_UNICODE_ISLOWER(ch)) { 7043 if (!previous_is_cased) 7044 return PyBool_FromLong(0); 7045 previous_is_cased = 1; 7046 cased = 1; 7047 } 7048 else 7049 previous_is_cased = 0; 7050 } 7051 return PyBool_FromLong(cased); 7052} 7053 7054PyDoc_STRVAR(isspace__doc__, 7055"S.isspace() -> bool\n\ 7056\n\ 7057Return True if all characters in S are whitespace\n\ 7058and there is at least one character in S, False otherwise."); 7059 7060static PyObject* 7061unicode_isspace(PyUnicodeObject *self) 7062{ 7063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7064 register const Py_UNICODE *e; 7065 7066 /* Shortcut for single character strings */ 7067 if (PyUnicode_GET_SIZE(self) == 1 && 7068 Py_UNICODE_ISSPACE(*p)) 7069 return PyBool_FromLong(1); 7070 7071 /* Special case for empty strings */ 7072 if (PyUnicode_GET_SIZE(self) == 0) 7073 return PyBool_FromLong(0); 7074 7075 e = p + PyUnicode_GET_SIZE(self); 7076 for (; p < e; p++) { 7077 if (!Py_UNICODE_ISSPACE(*p)) 7078 
return PyBool_FromLong(0); 7079 } 7080 return PyBool_FromLong(1); 7081} 7082 7083PyDoc_STRVAR(isalpha__doc__, 7084"S.isalpha() -> bool\n\ 7085\n\ 7086Return True if all characters in S are alphabetic\n\ 7087and there is at least one character in S, False otherwise."); 7088 7089static PyObject* 7090unicode_isalpha(PyUnicodeObject *self) 7091{ 7092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7093 register const Py_UNICODE *e; 7094 7095 /* Shortcut for single character strings */ 7096 if (PyUnicode_GET_SIZE(self) == 1 && 7097 Py_UNICODE_ISALPHA(*p)) 7098 return PyBool_FromLong(1); 7099 7100 /* Special case for empty strings */ 7101 if (PyUnicode_GET_SIZE(self) == 0) 7102 return PyBool_FromLong(0); 7103 7104 e = p + PyUnicode_GET_SIZE(self); 7105 for (; p < e; p++) { 7106 if (!Py_UNICODE_ISALPHA(*p)) 7107 return PyBool_FromLong(0); 7108 } 7109 return PyBool_FromLong(1); 7110} 7111 7112PyDoc_STRVAR(isalnum__doc__, 7113"S.isalnum() -> bool\n\ 7114\n\ 7115Return True if all characters in S are alphanumeric\n\ 7116and there is at least one character in S, False otherwise."); 7117 7118static PyObject* 7119unicode_isalnum(PyUnicodeObject *self) 7120{ 7121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7122 register const Py_UNICODE *e; 7123 7124 /* Shortcut for single character strings */ 7125 if (PyUnicode_GET_SIZE(self) == 1 && 7126 Py_UNICODE_ISALNUM(*p)) 7127 return PyBool_FromLong(1); 7128 7129 /* Special case for empty strings */ 7130 if (PyUnicode_GET_SIZE(self) == 0) 7131 return PyBool_FromLong(0); 7132 7133 e = p + PyUnicode_GET_SIZE(self); 7134 for (; p < e; p++) { 7135 if (!Py_UNICODE_ISALNUM(*p)) 7136 return PyBool_FromLong(0); 7137 } 7138 return PyBool_FromLong(1); 7139} 7140 7141PyDoc_STRVAR(isdecimal__doc__, 7142"S.isdecimal() -> bool\n\ 7143\n\ 7144Return True if there are only decimal characters in S,\n\ 7145False otherwise."); 7146 7147static PyObject* 7148unicode_isdecimal(PyUnicodeObject *self) 7149{ 7150 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7151 register const Py_UNICODE *e; 7152 7153 /* Shortcut for single character strings */ 7154 if (PyUnicode_GET_SIZE(self) == 1 && 7155 Py_UNICODE_ISDECIMAL(*p)) 7156 return PyBool_FromLong(1); 7157 7158 /* Special case for empty strings */ 7159 if (PyUnicode_GET_SIZE(self) == 0) 7160 return PyBool_FromLong(0); 7161 7162 e = p + PyUnicode_GET_SIZE(self); 7163 for (; p < e; p++) { 7164 if (!Py_UNICODE_ISDECIMAL(*p)) 7165 return PyBool_FromLong(0); 7166 } 7167 return PyBool_FromLong(1); 7168} 7169 7170PyDoc_STRVAR(isdigit__doc__, 7171"S.isdigit() -> bool\n\ 7172\n\ 7173Return True if all characters in S are digits\n\ 7174and there is at least one character in S, False otherwise."); 7175 7176static PyObject* 7177unicode_isdigit(PyUnicodeObject *self) 7178{ 7179 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7180 register const Py_UNICODE *e; 7181 7182 /* Shortcut for single character strings */ 7183 if (PyUnicode_GET_SIZE(self) == 1 && 7184 Py_UNICODE_ISDIGIT(*p)) 7185 return PyBool_FromLong(1); 7186 7187 /* Special case for empty strings */ 7188 if (PyUnicode_GET_SIZE(self) == 0) 7189 return PyBool_FromLong(0); 7190 7191 e = p + PyUnicode_GET_SIZE(self); 7192 for (; p < e; p++) { 7193 if (!Py_UNICODE_ISDIGIT(*p)) 7194 return PyBool_FromLong(0); 7195 } 7196 return PyBool_FromLong(1); 7197} 7198 7199PyDoc_STRVAR(isnumeric__doc__, 7200"S.isnumeric() -> bool\n\ 7201\n\ 7202Return True if there are only numeric characters in S,\n\ 7203False otherwise."); 7204 7205static PyObject* 7206unicode_isnumeric(PyUnicodeObject *self) 7207{ 7208 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7209 register const Py_UNICODE *e; 7210 7211 /* Shortcut for single character strings */ 7212 if (PyUnicode_GET_SIZE(self) == 1 && 7213 Py_UNICODE_ISNUMERIC(*p)) 7214 return PyBool_FromLong(1); 7215 7216 /* Special case for empty strings */ 7217 if (PyUnicode_GET_SIZE(self) == 0) 7218 return PyBool_FromLong(0); 7219 7220 e = p + 
PyUnicode_GET_SIZE(self); 7221 for (; p < e; p++) { 7222 if (!Py_UNICODE_ISNUMERIC(*p)) 7223 return PyBool_FromLong(0); 7224 } 7225 return PyBool_FromLong(1); 7226} 7227 7228int 7229PyUnicode_IsIdentifier(PyObject *self) 7230{ 7231 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7232 register const Py_UNICODE *e; 7233 7234 /* Special case for empty strings */ 7235 if (PyUnicode_GET_SIZE(self) == 0) 7236 return 0; 7237 7238 /* PEP 3131 says that the first character must be in 7239 XID_Start and subsequent characters in XID_Continue, 7240 and for the ASCII range, the 2.x rules apply (i.e 7241 start with letters and underscore, continue with 7242 letters, digits, underscore). However, given the current 7243 definition of XID_Start and XID_Continue, it is sufficient 7244 to check just for these, except that _ must be allowed 7245 as starting an identifier. */ 7246 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7247 return 0; 7248 7249 e = p + PyUnicode_GET_SIZE(self); 7250 for (p++; p < e; p++) { 7251 if (!_PyUnicode_IsXidContinue(*p)) 7252 return 0; 7253 } 7254 return 1; 7255} 7256 7257PyDoc_STRVAR(isidentifier__doc__, 7258"S.isidentifier() -> bool\n\ 7259\n\ 7260Return True if S is a valid identifier according\n\ 7261to the language definition."); 7262 7263static PyObject* 7264unicode_isidentifier(PyObject *self) 7265{ 7266 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7267} 7268 7269PyDoc_STRVAR(isprintable__doc__, 7270"S.isprintable() -> bool\n\ 7271\n\ 7272Return True if all characters in S are considered\n\ 7273printable in repr() or S is empty, False otherwise."); 7274 7275static PyObject* 7276unicode_isprintable(PyObject *self) 7277{ 7278 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7279 register const Py_UNICODE *e; 7280 7281 /* Shortcut for single character strings */ 7282 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 7283 Py_RETURN_TRUE; 7284 } 7285 7286 e = p + 
PyUnicode_GET_SIZE(self); 7287 for (; p < e; p++) { 7288 if (!Py_UNICODE_ISPRINTABLE(*p)) { 7289 Py_RETURN_FALSE; 7290 } 7291 } 7292 Py_RETURN_TRUE; 7293} 7294 7295PyDoc_STRVAR(join__doc__, 7296"S.join(sequence) -> str\n\ 7297\n\ 7298Return a string which is the concatenation of the strings in the\n\ 7299sequence. The separator between elements is S."); 7300 7301static PyObject* 7302unicode_join(PyObject *self, PyObject *data) 7303{ 7304 return PyUnicode_Join(self, data); 7305} 7306 7307static Py_ssize_t 7308unicode_length(PyUnicodeObject *self) 7309{ 7310 return self->length; 7311} 7312 7313PyDoc_STRVAR(ljust__doc__, 7314"S.ljust(width[, fillchar]) -> str\n\ 7315\n\ 7316Return S left-justified in a Unicode string of length width. Padding is\n\ 7317done using the specified fill character (default is a space)."); 7318 7319static PyObject * 7320unicode_ljust(PyUnicodeObject *self, PyObject *args) 7321{ 7322 Py_ssize_t width; 7323 Py_UNICODE fillchar = ' '; 7324 7325 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7326 return NULL; 7327 7328 if (self->length >= width && PyUnicode_CheckExact(self)) { 7329 Py_INCREF(self); 7330 return (PyObject*) self; 7331 } 7332 7333 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7334} 7335 7336PyDoc_STRVAR(lower__doc__, 7337"S.lower() -> str\n\ 7338\n\ 7339Return a copy of the string S converted to lowercase."); 7340 7341static PyObject* 7342unicode_lower(PyUnicodeObject *self) 7343{ 7344 return fixup(self, fixlower); 7345} 7346 7347#define LEFTSTRIP 0 7348#define RIGHTSTRIP 1 7349#define BOTHSTRIP 2 7350 7351/* Arrays indexed by above */ 7352static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7353 7354#define STRIPNAME(i) (stripformat[i]+3) 7355 7356/* externally visible for str.strip(unicode) */ 7357PyObject * 7358_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7359{ 7360 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7361 Py_ssize_t len = 
PyUnicode_GET_SIZE(self); 7362 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7363 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7364 Py_ssize_t i, j; 7365 7366 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7367 7368 i = 0; 7369 if (striptype != RIGHTSTRIP) { 7370 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7371 i++; 7372 } 7373 } 7374 7375 j = len; 7376 if (striptype != LEFTSTRIP) { 7377 do { 7378 j--; 7379 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7380 j++; 7381 } 7382 7383 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7384 Py_INCREF(self); 7385 return (PyObject*)self; 7386 } 7387 else 7388 return PyUnicode_FromUnicode(s+i, j-i); 7389} 7390 7391 7392static PyObject * 7393do_strip(PyUnicodeObject *self, int striptype) 7394{ 7395 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7396 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7397 7398 i = 0; 7399 if (striptype != RIGHTSTRIP) { 7400 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7401 i++; 7402 } 7403 } 7404 7405 j = len; 7406 if (striptype != LEFTSTRIP) { 7407 do { 7408 j--; 7409 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7410 j++; 7411 } 7412 7413 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7414 Py_INCREF(self); 7415 return (PyObject*)self; 7416 } 7417 else 7418 return PyUnicode_FromUnicode(s+i, j-i); 7419} 7420 7421 7422static PyObject * 7423do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7424{ 7425 PyObject *sep = NULL; 7426 7427 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7428 return NULL; 7429 7430 if (sep != NULL && sep != Py_None) { 7431 if (PyUnicode_Check(sep)) 7432 return _PyUnicode_XStrip(self, striptype, sep); 7433 else { 7434 PyErr_Format(PyExc_TypeError, 7435 "%s arg must be None or str", 7436 STRIPNAME(striptype)); 7437 return NULL; 7438 } 7439 } 7440 7441 return do_strip(self, striptype); 7442} 7443 7444 7445PyDoc_STRVAR(strip__doc__, 7446"S.strip([chars]) -> str\n\ 7447\n\ 7448Return a 
copy of the string S with leading and trailing\n\ 7449whitespace removed.\n\ 7450If chars is given and not None, remove characters in chars instead."); 7451 7452static PyObject * 7453unicode_strip(PyUnicodeObject *self, PyObject *args) 7454{ 7455 if (PyTuple_GET_SIZE(args) == 0) 7456 return do_strip(self, BOTHSTRIP); /* Common case */ 7457 else 7458 return do_argstrip(self, BOTHSTRIP, args); 7459} 7460 7461 7462PyDoc_STRVAR(lstrip__doc__, 7463"S.lstrip([chars]) -> str\n\ 7464\n\ 7465Return a copy of the string S with leading whitespace removed.\n\ 7466If chars is given and not None, remove characters in chars instead."); 7467 7468static PyObject * 7469unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7470{ 7471 if (PyTuple_GET_SIZE(args) == 0) 7472 return do_strip(self, LEFTSTRIP); /* Common case */ 7473 else 7474 return do_argstrip(self, LEFTSTRIP, args); 7475} 7476 7477 7478PyDoc_STRVAR(rstrip__doc__, 7479"S.rstrip([chars]) -> str\n\ 7480\n\ 7481Return a copy of the string S with trailing whitespace removed.\n\ 7482If chars is given and not None, remove characters in chars instead."); 7483 7484static PyObject * 7485unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7486{ 7487 if (PyTuple_GET_SIZE(args) == 0) 7488 return do_strip(self, RIGHTSTRIP); /* Common case */ 7489 else 7490 return do_argstrip(self, RIGHTSTRIP, args); 7491} 7492 7493 7494static PyObject* 7495unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7496{ 7497 PyUnicodeObject *u; 7498 Py_UNICODE *p; 7499 Py_ssize_t nchars; 7500 size_t nbytes; 7501 7502 if (len < 0) 7503 len = 0; 7504 7505 if (len == 1 && PyUnicode_CheckExact(str)) { 7506 /* no repeat, return original string */ 7507 Py_INCREF(str); 7508 return (PyObject*) str; 7509 } 7510 7511 /* ensure # of chars needed doesn't overflow int and # of bytes 7512 * needed doesn't overflow size_t 7513 */ 7514 nchars = len * str->length; 7515 if (len && nchars / len != str->length) { 7516 PyErr_SetString(PyExc_OverflowError, 7517 "repeated 
string is too long"); 7518 return NULL; 7519 } 7520 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7521 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7522 PyErr_SetString(PyExc_OverflowError, 7523 "repeated string is too long"); 7524 return NULL; 7525 } 7526 u = _PyUnicode_New(nchars); 7527 if (!u) 7528 return NULL; 7529 7530 p = u->str; 7531 7532 if (str->length == 1 && len > 0) { 7533 Py_UNICODE_FILL(p, str->str[0], len); 7534 } else { 7535 Py_ssize_t done = 0; /* number of characters copied this far */ 7536 if (done < nchars) { 7537 Py_UNICODE_COPY(p, str->str, str->length); 7538 done = str->length; 7539 } 7540 while (done < nchars) { 7541 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 7542 Py_UNICODE_COPY(p+done, p, n); 7543 done += n; 7544 } 7545 } 7546 7547 return (PyObject*) u; 7548} 7549 7550PyObject *PyUnicode_Replace(PyObject *obj, 7551 PyObject *subobj, 7552 PyObject *replobj, 7553 Py_ssize_t maxcount) 7554{ 7555 PyObject *self; 7556 PyObject *str1; 7557 PyObject *str2; 7558 PyObject *result; 7559 7560 self = PyUnicode_FromObject(obj); 7561 if (self == NULL) 7562 return NULL; 7563 str1 = PyUnicode_FromObject(subobj); 7564 if (str1 == NULL) { 7565 Py_DECREF(self); 7566 return NULL; 7567 } 7568 str2 = PyUnicode_FromObject(replobj); 7569 if (str2 == NULL) { 7570 Py_DECREF(self); 7571 Py_DECREF(str1); 7572 return NULL; 7573 } 7574 result = replace((PyUnicodeObject *)self, 7575 (PyUnicodeObject *)str1, 7576 (PyUnicodeObject *)str2, 7577 maxcount); 7578 Py_DECREF(self); 7579 Py_DECREF(str1); 7580 Py_DECREF(str2); 7581 return result; 7582} 7583 7584PyDoc_STRVAR(replace__doc__, 7585"S.replace (old, new[, count]) -> str\n\ 7586\n\ 7587Return a copy of S with all occurrences of substring\n\ 7588old replaced by new. 
If the optional argument count is\n\ 7589given, only the first count occurrences are replaced."); 7590 7591static PyObject* 7592unicode_replace(PyUnicodeObject *self, PyObject *args) 7593{ 7594 PyUnicodeObject *str1; 7595 PyUnicodeObject *str2; 7596 Py_ssize_t maxcount = -1; 7597 PyObject *result; 7598 7599 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7600 return NULL; 7601 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7602 if (str1 == NULL) 7603 return NULL; 7604 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7605 if (str2 == NULL) { 7606 Py_DECREF(str1); 7607 return NULL; 7608 } 7609 7610 result = replace(self, str1, str2, maxcount); 7611 7612 Py_DECREF(str1); 7613 Py_DECREF(str2); 7614 return result; 7615} 7616 7617static 7618PyObject *unicode_repr(PyObject *unicode) 7619{ 7620 PyObject *repr; 7621 Py_UNICODE *p; 7622 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7623 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7624 7625 /* XXX(nnorwitz): rather than over-allocating, it would be 7626 better to choose a different scheme. Perhaps scan the 7627 first N-chars of the string and allocate based on that size. 7628 */ 7629 /* Initial allocation is based on the longest-possible unichr 7630 escape. 7631 7632 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7633 unichr, so in this case it's the longest unichr escape. In 7634 narrow (UTF-16) builds this is five chars per source unichr 7635 since there are two unichrs in the surrogate pair, so in narrow 7636 (UTF-16) builds it's not the longest unichr escape. 7637 7638 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7639 so in the narrow (UTF-16) build case it's the longest unichr 7640 escape. 
7641 */ 7642 7643 repr = PyUnicode_FromUnicode(NULL, 7644 2 /* quotes */ 7645#ifdef Py_UNICODE_WIDE 7646 + 10*size 7647#else 7648 + 6*size 7649#endif 7650 + 1); 7651 if (repr == NULL) 7652 return NULL; 7653 7654 p = PyUnicode_AS_UNICODE(repr); 7655 7656 /* Add quote */ 7657 *p++ = (findchar(s, size, '\'') && 7658 !findchar(s, size, '"')) ? '"' : '\''; 7659 while (size-- > 0) { 7660 Py_UNICODE ch = *s++; 7661 7662 /* Escape quotes and backslashes */ 7663 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7664 *p++ = '\\'; 7665 *p++ = ch; 7666 continue; 7667 } 7668 7669 /* Map special whitespace to '\t', \n', '\r' */ 7670 if (ch == '\t') { 7671 *p++ = '\\'; 7672 *p++ = 't'; 7673 } 7674 else if (ch == '\n') { 7675 *p++ = '\\'; 7676 *p++ = 'n'; 7677 } 7678 else if (ch == '\r') { 7679 *p++ = '\\'; 7680 *p++ = 'r'; 7681 } 7682 7683 /* Map non-printable US ASCII to '\xhh' */ 7684 else if (ch < ' ' || ch == 0x7F) { 7685 *p++ = '\\'; 7686 *p++ = 'x'; 7687 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7688 *p++ = hexdigits[ch & 0x000F]; 7689 } 7690 7691 /* Copy ASCII characters as-is */ 7692 else if (ch < 0x7F) { 7693 *p++ = ch; 7694 } 7695 7696 /* Non-ASCII characters */ 7697 else { 7698 Py_UCS4 ucs = ch; 7699 7700#ifndef Py_UNICODE_WIDE 7701 Py_UNICODE ch2 = 0; 7702 /* Get code point from surrogate pair */ 7703 if (size > 0) { 7704 ch2 = *s; 7705 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 7706 && ch2 <= 0xDFFF) { 7707 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 7708 + 0x00010000; 7709 s++; 7710 size--; 7711 } 7712 } 7713#endif 7714 /* Map Unicode whitespace and control characters 7715 (categories Z* and C* except ASCII space) 7716 */ 7717 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 7718 /* Map 8-bit characters to '\xhh' */ 7719 if (ucs <= 0xff) { 7720 *p++ = '\\'; 7721 *p++ = 'x'; 7722 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7723 *p++ = hexdigits[ch & 0x000F]; 7724 } 7725 /* Map 21-bit characters to '\U00xxxxxx' */ 7726 else if (ucs >= 0x10000) { 7727 *p++ = '\\'; 7728 
*p++ = 'U'; 7729 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7730 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7731 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7732 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7733 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7734 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7735 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7736 *p++ = hexdigits[ucs & 0x0000000F]; 7737 } 7738 /* Map 16-bit characters to '\uxxxx' */ 7739 else { 7740 *p++ = '\\'; 7741 *p++ = 'u'; 7742 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 7743 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 7744 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 7745 *p++ = hexdigits[ucs & 0x000F]; 7746 } 7747 } 7748 /* Copy characters as-is */ 7749 else { 7750 *p++ = ch; 7751#ifndef Py_UNICODE_WIDE 7752 if (ucs >= 0x10000) 7753 *p++ = ch2; 7754#endif 7755 } 7756 } 7757 } 7758 /* Add quote */ 7759 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7760 7761 *p = '\0'; 7762 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7763 return repr; 7764} 7765 7766PyDoc_STRVAR(rfind__doc__, 7767"S.rfind(sub[, start[, end]]) -> int\n\ 7768\n\ 7769Return the highest index in S where substring sub is found,\n\ 7770such that sub is contained within s[start:end]. 
Optional\n\ 7771arguments start and end are interpreted as in slice notation.\n\ 7772\n\ 7773Return -1 on failure."); 7774 7775static PyObject * 7776unicode_rfind(PyUnicodeObject *self, PyObject *args) 7777{ 7778 PyObject *substring; 7779 Py_ssize_t start; 7780 Py_ssize_t end; 7781 Py_ssize_t result; 7782 7783 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7784 return NULL; 7785 7786 result = stringlib_rfind_slice( 7787 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7788 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7789 start, end 7790 ); 7791 7792 Py_DECREF(substring); 7793 7794 return PyLong_FromSsize_t(result); 7795} 7796 7797PyDoc_STRVAR(rindex__doc__, 7798"S.rindex(sub[, start[, end]]) -> int\n\ 7799\n\ 7800Like S.rfind() but raise ValueError when the substring is not found."); 7801 7802static PyObject * 7803unicode_rindex(PyUnicodeObject *self, PyObject *args) 7804{ 7805 PyObject *substring; 7806 Py_ssize_t start; 7807 Py_ssize_t end; 7808 Py_ssize_t result; 7809 7810 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7811 return NULL; 7812 7813 result = stringlib_rfind_slice( 7814 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7815 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7816 start, end 7817 ); 7818 7819 Py_DECREF(substring); 7820 7821 if (result < 0) { 7822 PyErr_SetString(PyExc_ValueError, "substring not found"); 7823 return NULL; 7824 } 7825 return PyLong_FromSsize_t(result); 7826} 7827 7828PyDoc_STRVAR(rjust__doc__, 7829"S.rjust(width[, fillchar]) -> str\n\ 7830\n\ 7831Return S right-justified in a string of length width. 
Padding is\n\ 7832done using the specified fill character (default is a space)."); 7833 7834static PyObject * 7835unicode_rjust(PyUnicodeObject *self, PyObject *args) 7836{ 7837 Py_ssize_t width; 7838 Py_UNICODE fillchar = ' '; 7839 7840 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7841 return NULL; 7842 7843 if (self->length >= width && PyUnicode_CheckExact(self)) { 7844 Py_INCREF(self); 7845 return (PyObject*) self; 7846 } 7847 7848 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7849} 7850 7851PyObject *PyUnicode_Split(PyObject *s, 7852 PyObject *sep, 7853 Py_ssize_t maxsplit) 7854{ 7855 PyObject *result; 7856 7857 s = PyUnicode_FromObject(s); 7858 if (s == NULL) 7859 return NULL; 7860 if (sep != NULL) { 7861 sep = PyUnicode_FromObject(sep); 7862 if (sep == NULL) { 7863 Py_DECREF(s); 7864 return NULL; 7865 } 7866 } 7867 7868 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7869 7870 Py_DECREF(s); 7871 Py_XDECREF(sep); 7872 return result; 7873} 7874 7875PyDoc_STRVAR(split__doc__, 7876"S.split([sep[, maxsplit]]) -> list of strings\n\ 7877\n\ 7878Return a list of the words in S, using sep as the\n\ 7879delimiter string. If maxsplit is given, at most maxsplit\n\ 7880splits are done. 
If sep is not specified or is None, any\n\ 7881whitespace string is a separator and empty strings are\n\ 7882removed from the result."); 7883 7884static PyObject* 7885unicode_split(PyUnicodeObject *self, PyObject *args) 7886{ 7887 PyObject *substring = Py_None; 7888 Py_ssize_t maxcount = -1; 7889 7890 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7891 return NULL; 7892 7893 if (substring == Py_None) 7894 return split(self, NULL, maxcount); 7895 else if (PyUnicode_Check(substring)) 7896 return split(self, (PyUnicodeObject *)substring, maxcount); 7897 else 7898 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7899} 7900 7901PyObject * 7902PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7903{ 7904 PyObject* str_obj; 7905 PyObject* sep_obj; 7906 PyObject* out; 7907 7908 str_obj = PyUnicode_FromObject(str_in); 7909 if (!str_obj) 7910 return NULL; 7911 sep_obj = PyUnicode_FromObject(sep_in); 7912 if (!sep_obj) { 7913 Py_DECREF(str_obj); 7914 return NULL; 7915 } 7916 7917 out = stringlib_partition( 7918 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7919 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7920 ); 7921 7922 Py_DECREF(sep_obj); 7923 Py_DECREF(str_obj); 7924 7925 return out; 7926} 7927 7928 7929PyObject * 7930PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7931{ 7932 PyObject* str_obj; 7933 PyObject* sep_obj; 7934 PyObject* out; 7935 7936 str_obj = PyUnicode_FromObject(str_in); 7937 if (!str_obj) 7938 return NULL; 7939 sep_obj = PyUnicode_FromObject(sep_in); 7940 if (!sep_obj) { 7941 Py_DECREF(str_obj); 7942 return NULL; 7943 } 7944 7945 out = stringlib_rpartition( 7946 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7947 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7948 ); 7949 7950 Py_DECREF(sep_obj); 7951 Py_DECREF(str_obj); 7952 7953 return out; 7954} 7955 7956PyDoc_STRVAR(partition__doc__, 7957"S.partition(sep) -> (head, 
sep, tail)\n\ 7958\n\ 7959Search for the separator sep in S, and return the part before it,\n\ 7960the separator itself, and the part after it. If the separator is not\n\ 7961found, return S and two empty strings."); 7962 7963static PyObject* 7964unicode_partition(PyUnicodeObject *self, PyObject *separator) 7965{ 7966 return PyUnicode_Partition((PyObject *)self, separator); 7967} 7968 7969PyDoc_STRVAR(rpartition__doc__, 7970"S.rpartition(sep) -> (tail, sep, head)\n\ 7971\n\ 7972Search for the separator sep in S, starting at the end of S, and return\n\ 7973the part before it, the separator itself, and the part after it. If the\n\ 7974separator is not found, return two empty strings and S."); 7975 7976static PyObject* 7977unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7978{ 7979 return PyUnicode_RPartition((PyObject *)self, separator); 7980} 7981 7982PyObject *PyUnicode_RSplit(PyObject *s, 7983 PyObject *sep, 7984 Py_ssize_t maxsplit) 7985{ 7986 PyObject *result; 7987 7988 s = PyUnicode_FromObject(s); 7989 if (s == NULL) 7990 return NULL; 7991 if (sep != NULL) { 7992 sep = PyUnicode_FromObject(sep); 7993 if (sep == NULL) { 7994 Py_DECREF(s); 7995 return NULL; 7996 } 7997 } 7998 7999 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8000 8001 Py_DECREF(s); 8002 Py_XDECREF(sep); 8003 return result; 8004} 8005 8006PyDoc_STRVAR(rsplit__doc__, 8007"S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8008\n\ 8009Return a list of the words in S, using sep as the\n\ 8010delimiter string, starting at the end of the string and\n\ 8011working to the front. If maxsplit is given, at most maxsplit\n\ 8012splits are done. 
If sep is not specified, any whitespace string\n\ 8013is a separator."); 8014 8015static PyObject* 8016unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8017{ 8018 PyObject *substring = Py_None; 8019 Py_ssize_t maxcount = -1; 8020 8021 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8022 return NULL; 8023 8024 if (substring == Py_None) 8025 return rsplit(self, NULL, maxcount); 8026 else if (PyUnicode_Check(substring)) 8027 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8028 else 8029 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8030} 8031 8032PyDoc_STRVAR(splitlines__doc__, 8033"S.splitlines([keepends]]) -> list of strings\n\ 8034\n\ 8035Return a list of the lines in S, breaking at line boundaries.\n\ 8036Line breaks are not included in the resulting list unless keepends\n\ 8037is given and true."); 8038 8039static PyObject* 8040unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8041{ 8042 int keepends = 0; 8043 8044 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8045 return NULL; 8046 8047 return PyUnicode_Splitlines((PyObject *)self, keepends); 8048} 8049 8050static 8051PyObject *unicode_str(PyObject *self) 8052{ 8053 if (PyUnicode_CheckExact(self)) { 8054 Py_INCREF(self); 8055 return self; 8056 } else 8057 /* Subtype -- return genuine unicode string with the same value. 
*/ 8058 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8059 PyUnicode_GET_SIZE(self)); 8060} 8061 8062PyDoc_STRVAR(swapcase__doc__, 8063"S.swapcase() -> str\n\ 8064\n\ 8065Return a copy of S with uppercase characters converted to lowercase\n\ 8066and vice versa."); 8067 8068static PyObject* 8069unicode_swapcase(PyUnicodeObject *self) 8070{ 8071 return fixup(self, fixswapcase); 8072} 8073 8074PyDoc_STRVAR(maketrans__doc__, 8075"str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8076\n\ 8077Return a translation table usable for str.translate().\n\ 8078If there is only one argument, it must be a dictionary mapping Unicode\n\ 8079ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8080Character keys will be then converted to ordinals.\n\ 8081If there are two arguments, they must be strings of equal length, and\n\ 8082in the resulting dictionary, each character in x will be mapped to the\n\ 8083character at the same position in y. If there is a third argument, it\n\ 8084must be a string, whose characters will be mapped to None in the result."); 8085 8086static PyObject* 8087unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8088{ 8089 PyObject *x, *y = NULL, *z = NULL; 8090 PyObject *new = NULL, *key, *value; 8091 Py_ssize_t i = 0; 8092 int res; 8093 8094 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8095 return NULL; 8096 new = PyDict_New(); 8097 if (!new) 8098 return NULL; 8099 if (y != NULL) { 8100 /* x must be a string too, of equal length */ 8101 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8102 if (!PyUnicode_Check(x)) { 8103 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8104 "be a string if there is a second argument"); 8105 goto err; 8106 } 8107 if (PyUnicode_GET_SIZE(x) != ylen) { 8108 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8109 "arguments must have equal length"); 8110 goto err; 8111 } 8112 /* create entries for translating chars in x to those in y */ 8113 for (i = 0; 
i < PyUnicode_GET_SIZE(x); i++) { 8114 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8115 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8116 if (!key || !value) 8117 goto err; 8118 res = PyDict_SetItem(new, key, value); 8119 Py_DECREF(key); 8120 Py_DECREF(value); 8121 if (res < 0) 8122 goto err; 8123 } 8124 /* create entries for deleting chars in z */ 8125 if (z != NULL) { 8126 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8127 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8128 if (!key) 8129 goto err; 8130 res = PyDict_SetItem(new, key, Py_None); 8131 Py_DECREF(key); 8132 if (res < 0) 8133 goto err; 8134 } 8135 } 8136 } else { 8137 /* x must be a dict */ 8138 if (!PyDict_Check(x)) { 8139 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8140 "to maketrans it must be a dict"); 8141 goto err; 8142 } 8143 /* copy entries into the new dict, converting string keys to int keys */ 8144 while (PyDict_Next(x, &i, &key, &value)) { 8145 if (PyUnicode_Check(key)) { 8146 /* convert string keys to integer keys */ 8147 PyObject *newkey; 8148 if (PyUnicode_GET_SIZE(key) != 1) { 8149 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8150 "table must be of length 1"); 8151 goto err; 8152 } 8153 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8154 if (!newkey) 8155 goto err; 8156 res = PyDict_SetItem(new, newkey, value); 8157 Py_DECREF(newkey); 8158 if (res < 0) 8159 goto err; 8160 } else if (PyLong_Check(key)) { 8161 /* just keep integer keys */ 8162 if (PyDict_SetItem(new, key, value) < 0) 8163 goto err; 8164 } else { 8165 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8166 "be strings or integers"); 8167 goto err; 8168 } 8169 } 8170 } 8171 return new; 8172 err: 8173 Py_DECREF(new); 8174 return NULL; 8175} 8176 8177PyDoc_STRVAR(translate__doc__, 8178"S.translate(table) -> str\n\ 8179\n\ 8180Return a copy of the string S, where all characters have been mapped\n\ 8181through the given translation table, which 
must be a mapping of\n\ 8182Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8183Unmapped characters are left untouched. Characters mapped to None\n\ 8184are deleted."); 8185 8186static PyObject* 8187unicode_translate(PyUnicodeObject *self, PyObject *table) 8188{ 8189 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8190} 8191 8192PyDoc_STRVAR(upper__doc__, 8193"S.upper() -> str\n\ 8194\n\ 8195Return a copy of S converted to uppercase."); 8196 8197static PyObject* 8198unicode_upper(PyUnicodeObject *self) 8199{ 8200 return fixup(self, fixupper); 8201} 8202 8203PyDoc_STRVAR(zfill__doc__, 8204"S.zfill(width) -> str\n\ 8205\n\ 8206Pad a numeric string S with zeros on the left, to fill a field\n\ 8207of the specified width. The string S is never truncated."); 8208 8209static PyObject * 8210unicode_zfill(PyUnicodeObject *self, PyObject *args) 8211{ 8212 Py_ssize_t fill; 8213 PyUnicodeObject *u; 8214 8215 Py_ssize_t width; 8216 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8217 return NULL; 8218 8219 if (self->length >= width) { 8220 if (PyUnicode_CheckExact(self)) { 8221 Py_INCREF(self); 8222 return (PyObject*) self; 8223 } 8224 else 8225 return PyUnicode_FromUnicode( 8226 PyUnicode_AS_UNICODE(self), 8227 PyUnicode_GET_SIZE(self) 8228 ); 8229 } 8230 8231 fill = width - self->length; 8232 8233 u = pad(self, fill, 0, '0'); 8234 8235 if (u == NULL) 8236 return NULL; 8237 8238 if (u->str[fill] == '+' || u->str[fill] == '-') { 8239 /* move sign to beginning of string */ 8240 u->str[0] = u->str[fill]; 8241 u->str[fill] = '0'; 8242 } 8243 8244 return (PyObject*) u; 8245} 8246 8247#if 0 8248static PyObject* 8249unicode_freelistsize(PyUnicodeObject *self) 8250{ 8251 return PyLong_FromLong(numfree); 8252} 8253#endif 8254 8255PyDoc_STRVAR(startswith__doc__, 8256"S.startswith(prefix[, start[, end]]) -> bool\n\ 8257\n\ 8258Return True if S starts with the specified prefix, False otherwise.\n\ 8259With optional start, test S beginning at that 
position.\n\ 8260With optional end, stop comparing S at that position.\n\ 8261prefix can also be a tuple of strings to try."); 8262 8263static PyObject * 8264unicode_startswith(PyUnicodeObject *self, 8265 PyObject *args) 8266{ 8267 PyObject *subobj; 8268 PyUnicodeObject *substring; 8269 Py_ssize_t start = 0; 8270 Py_ssize_t end = PY_SSIZE_T_MAX; 8271 int result; 8272 8273 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8274 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8275 return NULL; 8276 if (PyTuple_Check(subobj)) { 8277 Py_ssize_t i; 8278 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8279 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8280 PyTuple_GET_ITEM(subobj, i)); 8281 if (substring == NULL) 8282 return NULL; 8283 result = tailmatch(self, substring, start, end, -1); 8284 Py_DECREF(substring); 8285 if (result) { 8286 Py_RETURN_TRUE; 8287 } 8288 } 8289 /* nothing matched */ 8290 Py_RETURN_FALSE; 8291 } 8292 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8293 if (substring == NULL) 8294 return NULL; 8295 result = tailmatch(self, substring, start, end, -1); 8296 Py_DECREF(substring); 8297 return PyBool_FromLong(result); 8298} 8299 8300 8301PyDoc_STRVAR(endswith__doc__, 8302"S.endswith(suffix[, start[, end]]) -> bool\n\ 8303\n\ 8304Return True if S ends with the specified suffix, False otherwise.\n\ 8305With optional start, test S beginning at that position.\n\ 8306With optional end, stop comparing S at that position.\n\ 8307suffix can also be a tuple of strings to try."); 8308 8309static PyObject * 8310unicode_endswith(PyUnicodeObject *self, 8311 PyObject *args) 8312{ 8313 PyObject *subobj; 8314 PyUnicodeObject *substring; 8315 Py_ssize_t start = 0; 8316 Py_ssize_t end = PY_SSIZE_T_MAX; 8317 int result; 8318 8319 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8320 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8321 return NULL; 8322 if (PyTuple_Check(subobj)) { 8323 Py_ssize_t i; 8324 for (i = 0; i < 
PyTuple_GET_SIZE(subobj); i++) { 8325 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8326 PyTuple_GET_ITEM(subobj, i)); 8327 if (substring == NULL) 8328 return NULL; 8329 result = tailmatch(self, substring, start, end, +1); 8330 Py_DECREF(substring); 8331 if (result) { 8332 Py_RETURN_TRUE; 8333 } 8334 } 8335 Py_RETURN_FALSE; 8336 } 8337 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8338 if (substring == NULL) 8339 return NULL; 8340 8341 result = tailmatch(self, substring, start, end, +1); 8342 Py_DECREF(substring); 8343 return PyBool_FromLong(result); 8344} 8345 8346#include "stringlib/string_format.h" 8347 8348PyDoc_STRVAR(format__doc__, 8349"S.format(*args, **kwargs) -> str\n\ 8350\n\ 8351"); 8352 8353static PyObject * 8354unicode__format__(PyObject* self, PyObject* args) 8355{ 8356 PyObject *format_spec; 8357 8358 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 8359 return NULL; 8360 8361 return _PyUnicode_FormatAdvanced(self, 8362 PyUnicode_AS_UNICODE(format_spec), 8363 PyUnicode_GET_SIZE(format_spec)); 8364} 8365 8366PyDoc_STRVAR(p_format__doc__, 8367"S.__format__(format_spec) -> str\n\ 8368\n\ 8369"); 8370 8371static PyObject * 8372unicode__sizeof__(PyUnicodeObject *v) 8373{ 8374 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 8375 sizeof(Py_UNICODE) * (v->length + 1)); 8376} 8377 8378PyDoc_STRVAR(sizeof__doc__, 8379"S.__sizeof__() -> size of S in memory, in bytes"); 8380 8381static PyObject * 8382unicode_getnewargs(PyUnicodeObject *v) 8383{ 8384 return Py_BuildValue("(u#)", v->str, v->length); 8385} 8386 8387 8388static PyMethodDef unicode_methods[] = { 8389 8390 /* Order is according to common usage: often used methods should 8391 appear first, since lookup is done sequentially. 
*/ 8392 8393 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 8394 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8395 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8396 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8397 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8398 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8399 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8400 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8401 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8402 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8403 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8404 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8405 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8406 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8407 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8408 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8409 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8410 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8411 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8412 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8413 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8414 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8415 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8416 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8417 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8418 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8419 
{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8420 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8421 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8422 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8423 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8424 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8425 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8426 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8427 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8428 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8429 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8430 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8431 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 8432 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8433 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8434 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 8435 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8436 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8437 {"maketrans", (PyCFunction) unicode_maketrans, 8438 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8439 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 8440#if 0 8441 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8442#endif 8443 8444#if 0 8445 /* This one is just used for debugging the implementation. 
*/ 8446 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8447#endif 8448 8449 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8450 {NULL, NULL} 8451}; 8452 8453static PyObject * 8454unicode_mod(PyObject *v, PyObject *w) 8455{ 8456 if (!PyUnicode_Check(v)) { 8457 Py_INCREF(Py_NotImplemented); 8458 return Py_NotImplemented; 8459 } 8460 return PyUnicode_Format(v, w); 8461} 8462 8463static PyNumberMethods unicode_as_number = { 8464 0, /*nb_add*/ 8465 0, /*nb_subtract*/ 8466 0, /*nb_multiply*/ 8467 unicode_mod, /*nb_remainder*/ 8468}; 8469 8470static PySequenceMethods unicode_as_sequence = { 8471 (lenfunc) unicode_length, /* sq_length */ 8472 PyUnicode_Concat, /* sq_concat */ 8473 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8474 (ssizeargfunc) unicode_getitem, /* sq_item */ 8475 0, /* sq_slice */ 8476 0, /* sq_ass_item */ 8477 0, /* sq_ass_slice */ 8478 PyUnicode_Contains, /* sq_contains */ 8479}; 8480 8481static PyObject* 8482unicode_subscript(PyUnicodeObject* self, PyObject* item) 8483{ 8484 if (PyIndex_Check(item)) { 8485 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8486 if (i == -1 && PyErr_Occurred()) 8487 return NULL; 8488 if (i < 0) 8489 i += PyUnicode_GET_SIZE(self); 8490 return unicode_getitem(self, i); 8491 } else if (PySlice_Check(item)) { 8492 Py_ssize_t start, stop, step, slicelength, cur, i; 8493 Py_UNICODE* source_buf; 8494 Py_UNICODE* result_buf; 8495 PyObject* result; 8496 8497 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8498 &start, &stop, &step, &slicelength) < 0) { 8499 return NULL; 8500 } 8501 8502 if (slicelength <= 0) { 8503 return PyUnicode_FromUnicode(NULL, 0); 8504 } else if (start == 0 && step == 1 && slicelength == self->length && 8505 PyUnicode_CheckExact(self)) { 8506 Py_INCREF(self); 8507 return (PyObject *)self; 8508 } else if (step == 1) { 8509 return PyUnicode_FromUnicode(self->str + start, slicelength); 8510 } else { 8511 source_buf = 
PyUnicode_AS_UNICODE((PyObject*)self); 8512 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8513 sizeof(Py_UNICODE)); 8514 8515 if (result_buf == NULL) 8516 return PyErr_NoMemory(); 8517 8518 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8519 result_buf[i] = source_buf[cur]; 8520 } 8521 8522 result = PyUnicode_FromUnicode(result_buf, slicelength); 8523 PyObject_FREE(result_buf); 8524 return result; 8525 } 8526 } else { 8527 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8528 return NULL; 8529 } 8530} 8531 8532static PyMappingMethods unicode_as_mapping = { 8533 (lenfunc)unicode_length, /* mp_length */ 8534 (binaryfunc)unicode_subscript, /* mp_subscript */ 8535 (objobjargproc)0, /* mp_ass_subscript */ 8536}; 8537 8538 8539/* Helpers for PyUnicode_Format() */ 8540 8541static PyObject * 8542getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8543{ 8544 Py_ssize_t argidx = *p_argidx; 8545 if (argidx < arglen) { 8546 (*p_argidx)++; 8547 if (arglen < 0) 8548 return args; 8549 else 8550 return PyTuple_GetItem(args, argidx); 8551 } 8552 PyErr_SetString(PyExc_TypeError, 8553 "not enough arguments for format string"); 8554 return NULL; 8555} 8556 8557static Py_ssize_t 8558strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8559{ 8560 register Py_ssize_t i; 8561 Py_ssize_t len = strlen(charbuffer); 8562 for (i = len - 1; i >= 0; i--) 8563 buffer[i] = (Py_UNICODE) charbuffer[i]; 8564 8565 return len; 8566} 8567 8568static int 8569doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8570{ 8571 Py_ssize_t result; 8572 8573 PyOS_ascii_formatd((char *)buffer, len, format, x); 8574 result = strtounicode(buffer, (char *)buffer); 8575 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8576} 8577 8578#if 0 8579static int 8580longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8581{ 8582 Py_ssize_t result; 8583 8584 PyOS_snprintf((char *)buffer, len, format, x); 8585 result = 
strtounicode(buffer, (char *)buffer); 8586 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8587} 8588#endif 8589 8590/* XXX To save some code duplication, formatfloat/long/int could have been 8591 shared with stringobject.c, converting from 8-bit to Unicode after the 8592 formatting is done. */ 8593 8594static int 8595formatfloat(Py_UNICODE *buf, 8596 size_t buflen, 8597 int flags, 8598 int prec, 8599 int type, 8600 PyObject *v) 8601{ 8602 /* fmt = '%#.' + `prec` + `type` 8603 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8604 char fmt[20]; 8605 double x; 8606 8607 x = PyFloat_AsDouble(v); 8608 if (x == -1.0 && PyErr_Occurred()) 8609 return -1; 8610 if (prec < 0) 8611 prec = 6; 8612 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8613 type = 'g'; 8614 /* Worst case length calc to ensure no buffer overrun: 8615 8616 'g' formats: 8617 fmt = %#.<prec>g 8618 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8619 for any double rep.) 8620 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8621 8622 'f' formats: 8623 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8624 len = 1 + 50 + 1 + prec = 52 + prec 8625 8626 If prec=0 the effective precision is 1 (the leading digit is 8627 always given), therefore increase the length by one. 8628 8629 */ 8630 if (((type == 'g' || type == 'G') && 8631 buflen <= (size_t)10 + (size_t)prec) || 8632 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8633 PyErr_SetString(PyExc_OverflowError, 8634 "formatted float is too long (precision too large?)"); 8635 return -1; 8636 } 8637 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8638 (flags&F_ALT) ? "#" : "", 8639 prec, type); 8640 return doubletounicode(buf, buflen, fmt, x); 8641} 8642 8643static PyObject* 8644formatlong(PyObject *val, int flags, int prec, int type) 8645{ 8646 char *buf; 8647 int len; 8648 PyObject *str; /* temporary string object. 
*/ 8649 PyObject *result; 8650 8651 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 8652 if (!str) 8653 return NULL; 8654 result = PyUnicode_FromStringAndSize(buf, len); 8655 Py_DECREF(str); 8656 return result; 8657} 8658 8659#if 0 8660static int 8661formatint(Py_UNICODE *buf, 8662 size_t buflen, 8663 int flags, 8664 int prec, 8665 int type, 8666 PyObject *v) 8667{ 8668 /* fmt = '%#.' + `prec` + 'l' + `type` 8669 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8670 * + 1 + 1 8671 * = 24 8672 */ 8673 char fmt[64]; /* plenty big enough! */ 8674 char *sign; 8675 long x; 8676 8677 x = PyLong_AsLong(v); 8678 if (x == -1 && PyErr_Occurred()) 8679 return -1; 8680 if (x < 0 && type == 'u') { 8681 type = 'd'; 8682 } 8683 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8684 sign = "-"; 8685 else 8686 sign = ""; 8687 if (prec < 0) 8688 prec = 1; 8689 8690 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8691 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8692 */ 8693 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8694 PyErr_SetString(PyExc_OverflowError, 8695 "formatted integer is too long (precision too large?)"); 8696 return -1; 8697 } 8698 8699 if ((flags & F_ALT) && 8700 (type == 'x' || type == 'X' || type == 'o')) { 8701 /* When converting under %#o, %#x or %#X, there are a number 8702 * of issues that cause pain: 8703 * - for %#o, we want a different base marker than C 8704 * - when 0 is being converted, the C standard leaves off 8705 * the '0x' or '0X', which is inconsistent with other 8706 * %#x/%#X conversions and inconsistent with Python's 8707 * hex() function 8708 * - there are platforms that violate the standard and 8709 * convert 0 with the '0x' or '0X' 8710 * (Metrowerks, Compaq Tru64) 8711 * - there are platforms that give '0x' when converting 8712 * under %#X, but convert 0 in accordance with the 8713 * standard (OS/2 EMX) 8714 * 8715 * We can achieve the desired 
consistency by inserting our 8716 * own '0x' or '0X' prefix, and substituting %x/%X in place 8717 * of %#x/%#X. 8718 * 8719 * Note that this is the same approach as used in 8720 * formatint() in stringobject.c 8721 */ 8722 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8723 sign, type, prec, type); 8724 } 8725 else { 8726 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8727 sign, (flags&F_ALT) ? "#" : "", 8728 prec, type); 8729 } 8730 if (sign[0]) 8731 return longtounicode(buf, buflen, fmt, -x); 8732 else 8733 return longtounicode(buf, buflen, fmt, x); 8734} 8735#endif 8736 8737static int 8738formatchar(Py_UNICODE *buf, 8739 size_t buflen, 8740 PyObject *v) 8741{ 8742 /* presume that the buffer is at least 3 characters long */ 8743 if (PyUnicode_Check(v)) { 8744 if (PyUnicode_GET_SIZE(v) == 1) { 8745 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8746 buf[1] = '\0'; 8747 return 1; 8748 } 8749#ifndef Py_UNICODE_WIDE 8750 if (PyUnicode_GET_SIZE(v) == 2) { 8751 /* Decode a valid surrogate pair */ 8752 int c0 = PyUnicode_AS_UNICODE(v)[0]; 8753 int c1 = PyUnicode_AS_UNICODE(v)[1]; 8754 if (0xD800 <= c0 && c0 <= 0xDBFF && 8755 0xDC00 <= c1 && c1 <= 0xDFFF) { 8756 buf[0] = c0; 8757 buf[1] = c1; 8758 buf[2] = '\0'; 8759 return 2; 8760 } 8761 } 8762#endif 8763 goto onError; 8764 } 8765 else { 8766 /* Integer input truncated to a character */ 8767 long x; 8768 x = PyLong_AsLong(v); 8769 if (x == -1 && PyErr_Occurred()) 8770 goto onError; 8771 8772 if (x < 0 || x > 0x10ffff) { 8773 PyErr_SetString(PyExc_OverflowError, 8774 "%c arg not in range(0x110000)"); 8775 return -1; 8776 } 8777 8778#ifndef Py_UNICODE_WIDE 8779 if (x > 0xffff) { 8780 x -= 0x10000; 8781 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 8782 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 8783 return 2; 8784 } 8785#endif 8786 buf[0] = (Py_UNICODE) x; 8787 buf[1] = '\0'; 8788 return 1; 8789 } 8790 8791 onError: 8792 PyErr_SetString(PyExc_TypeError, 8793 "%c requires int or char"); 8794 return -1; 8795} 8796 8797/* 
fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8798 8799 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8800 chars are formatted. XXX This is a magic number. Each formatting 8801 routine does bounds checking to ensure no overflow, but a better 8802 solution may be to malloc a buffer of appropriate size for each 8803 format. For now, the current solution is sufficient. 8804*/ 8805#define FORMATBUFLEN (size_t)120 8806 8807PyObject *PyUnicode_Format(PyObject *format, 8808 PyObject *args) 8809{ 8810 Py_UNICODE *fmt, *res; 8811 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8812 int args_owned = 0; 8813 PyUnicodeObject *result = NULL; 8814 PyObject *dict = NULL; 8815 PyObject *uformat; 8816 8817 if (format == NULL || args == NULL) { 8818 PyErr_BadInternalCall(); 8819 return NULL; 8820 } 8821 uformat = PyUnicode_FromObject(format); 8822 if (uformat == NULL) 8823 return NULL; 8824 fmt = PyUnicode_AS_UNICODE(uformat); 8825 fmtcnt = PyUnicode_GET_SIZE(uformat); 8826 8827 reslen = rescnt = fmtcnt + 100; 8828 result = _PyUnicode_New(reslen); 8829 if (result == NULL) 8830 goto onError; 8831 res = PyUnicode_AS_UNICODE(result); 8832 8833 if (PyTuple_Check(args)) { 8834 arglen = PyTuple_Size(args); 8835 argidx = 0; 8836 } 8837 else { 8838 arglen = -1; 8839 argidx = -2; 8840 } 8841 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 8842 !PyUnicode_Check(args)) 8843 dict = args; 8844 8845 while (--fmtcnt >= 0) { 8846 if (*fmt != '%') { 8847 if (--rescnt < 0) { 8848 rescnt = fmtcnt + 100; 8849 reslen += rescnt; 8850 if (_PyUnicode_Resize(&result, reslen) < 0) 8851 goto onError; 8852 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8853 --rescnt; 8854 } 8855 *res++ = *fmt++; 8856 } 8857 else { 8858 /* Got a format specifier */ 8859 int flags = 0; 8860 Py_ssize_t width = -1; 8861 int prec = -1; 8862 Py_UNICODE c = '\0'; 8863 Py_UNICODE fill; 8864 int isnumok; 8865 PyObject *v = NULL; 8866 PyObject *temp = NULL; 8867 
Py_UNICODE *pbuf; 8868 Py_UNICODE sign; 8869 Py_ssize_t len; 8870 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8871 8872 fmt++; 8873 if (*fmt == '(') { 8874 Py_UNICODE *keystart; 8875 Py_ssize_t keylen; 8876 PyObject *key; 8877 int pcount = 1; 8878 8879 if (dict == NULL) { 8880 PyErr_SetString(PyExc_TypeError, 8881 "format requires a mapping"); 8882 goto onError; 8883 } 8884 ++fmt; 8885 --fmtcnt; 8886 keystart = fmt; 8887 /* Skip over balanced parentheses */ 8888 while (pcount > 0 && --fmtcnt >= 0) { 8889 if (*fmt == ')') 8890 --pcount; 8891 else if (*fmt == '(') 8892 ++pcount; 8893 fmt++; 8894 } 8895 keylen = fmt - keystart - 1; 8896 if (fmtcnt < 0 || pcount > 0) { 8897 PyErr_SetString(PyExc_ValueError, 8898 "incomplete format key"); 8899 goto onError; 8900 } 8901#if 0 8902 /* keys are converted to strings using UTF-8 and 8903 then looked up since Python uses strings to hold 8904 variables names etc. in its namespaces and we 8905 wouldn't want to break common idioms. 
*/ 8906 key = PyUnicode_EncodeUTF8(keystart, 8907 keylen, 8908 NULL); 8909#else 8910 key = PyUnicode_FromUnicode(keystart, keylen); 8911#endif 8912 if (key == NULL) 8913 goto onError; 8914 if (args_owned) { 8915 Py_DECREF(args); 8916 args_owned = 0; 8917 } 8918 args = PyObject_GetItem(dict, key); 8919 Py_DECREF(key); 8920 if (args == NULL) { 8921 goto onError; 8922 } 8923 args_owned = 1; 8924 arglen = -1; 8925 argidx = -2; 8926 } 8927 while (--fmtcnt >= 0) { 8928 switch (c = *fmt++) { 8929 case '-': flags |= F_LJUST; continue; 8930 case '+': flags |= F_SIGN; continue; 8931 case ' ': flags |= F_BLANK; continue; 8932 case '#': flags |= F_ALT; continue; 8933 case '0': flags |= F_ZERO; continue; 8934 } 8935 break; 8936 } 8937 if (c == '*') { 8938 v = getnextarg(args, arglen, &argidx); 8939 if (v == NULL) 8940 goto onError; 8941 if (!PyLong_Check(v)) { 8942 PyErr_SetString(PyExc_TypeError, 8943 "* wants int"); 8944 goto onError; 8945 } 8946 width = PyLong_AsLong(v); 8947 if (width == -1 && PyErr_Occurred()) 8948 goto onError; 8949 if (width < 0) { 8950 flags |= F_LJUST; 8951 width = -width; 8952 } 8953 if (--fmtcnt >= 0) 8954 c = *fmt++; 8955 } 8956 else if (c >= '0' && c <= '9') { 8957 width = c - '0'; 8958 while (--fmtcnt >= 0) { 8959 c = *fmt++; 8960 if (c < '0' || c > '9') 8961 break; 8962 if ((width*10) / 10 != width) { 8963 PyErr_SetString(PyExc_ValueError, 8964 "width too big"); 8965 goto onError; 8966 } 8967 width = width*10 + (c - '0'); 8968 } 8969 } 8970 if (c == '.') { 8971 prec = 0; 8972 if (--fmtcnt >= 0) 8973 c = *fmt++; 8974 if (c == '*') { 8975 v = getnextarg(args, arglen, &argidx); 8976 if (v == NULL) 8977 goto onError; 8978 if (!PyLong_Check(v)) { 8979 PyErr_SetString(PyExc_TypeError, 8980 "* wants int"); 8981 goto onError; 8982 } 8983 prec = PyLong_AsLong(v); 8984 if (prec == -1 && PyErr_Occurred()) 8985 goto onError; 8986 if (prec < 0) 8987 prec = 0; 8988 if (--fmtcnt >= 0) 8989 c = *fmt++; 8990 } 8991 else if (c >= '0' && c <= '9') { 8992 prec = c - 
'0'; 8993 while (--fmtcnt >= 0) { 8994 c = Py_CHARMASK(*fmt++); 8995 if (c < '0' || c > '9') 8996 break; 8997 if ((prec*10) / 10 != prec) { 8998 PyErr_SetString(PyExc_ValueError, 8999 "prec too big"); 9000 goto onError; 9001 } 9002 prec = prec*10 + (c - '0'); 9003 } 9004 } 9005 } /* prec */ 9006 if (fmtcnt >= 0) { 9007 if (c == 'h' || c == 'l' || c == 'L') { 9008 if (--fmtcnt >= 0) 9009 c = *fmt++; 9010 } 9011 } 9012 if (fmtcnt < 0) { 9013 PyErr_SetString(PyExc_ValueError, 9014 "incomplete format"); 9015 goto onError; 9016 } 9017 if (c != '%') { 9018 v = getnextarg(args, arglen, &argidx); 9019 if (v == NULL) 9020 goto onError; 9021 } 9022 sign = 0; 9023 fill = ' '; 9024 switch (c) { 9025 9026 case '%': 9027 pbuf = formatbuf; 9028 /* presume that buffer length is at least 1 */ 9029 pbuf[0] = '%'; 9030 len = 1; 9031 break; 9032 9033 case 's': 9034 case 'r': 9035 case 'a': 9036 if (PyUnicode_Check(v) && c == 's') { 9037 temp = v; 9038 Py_INCREF(temp); 9039 } 9040 else { 9041 if (c == 's') 9042 temp = PyObject_Str(v); 9043 else if (c == 'r') 9044 temp = PyObject_Repr(v); 9045 else 9046 temp = PyObject_ASCII(v); 9047 if (temp == NULL) 9048 goto onError; 9049 if (PyUnicode_Check(temp)) 9050 /* nothing to do */; 9051 else { 9052 Py_DECREF(temp); 9053 PyErr_SetString(PyExc_TypeError, 9054 "%s argument has non-string str()"); 9055 goto onError; 9056 } 9057 } 9058 pbuf = PyUnicode_AS_UNICODE(temp); 9059 len = PyUnicode_GET_SIZE(temp); 9060 if (prec >= 0 && len > prec) 9061 len = prec; 9062 break; 9063 9064 case 'i': 9065 case 'd': 9066 case 'u': 9067 case 'o': 9068 case 'x': 9069 case 'X': 9070 if (c == 'i') 9071 c = 'd'; 9072 isnumok = 0; 9073 if (PyNumber_Check(v)) { 9074 PyObject *iobj=NULL; 9075 9076 if (PyLong_Check(v)) { 9077 iobj = v; 9078 Py_INCREF(iobj); 9079 } 9080 else { 9081 iobj = PyNumber_Long(v); 9082 } 9083 if (iobj!=NULL) { 9084 if (PyLong_Check(iobj)) { 9085 isnumok = 1; 9086 temp = formatlong(iobj, flags, prec, c); 9087 Py_DECREF(iobj); 9088 if (!temp) 
9089 goto onError; 9090 pbuf = PyUnicode_AS_UNICODE(temp); 9091 len = PyUnicode_GET_SIZE(temp); 9092 sign = 1; 9093 } 9094 else { 9095 Py_DECREF(iobj); 9096 } 9097 } 9098 } 9099 if (!isnumok) { 9100 PyErr_Format(PyExc_TypeError, 9101 "%%%c format: a number is required, " 9102 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9103 goto onError; 9104 } 9105 if (flags & F_ZERO) 9106 fill = '0'; 9107 break; 9108 9109 case 'e': 9110 case 'E': 9111 case 'f': 9112 case 'F': 9113 case 'g': 9114 case 'G': 9115 if (c == 'F') 9116 c = 'f'; 9117 pbuf = formatbuf; 9118 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 9119 flags, prec, c, v); 9120 if (len < 0) 9121 goto onError; 9122 sign = 1; 9123 if (flags & F_ZERO) 9124 fill = '0'; 9125 break; 9126 9127 case 'c': 9128 pbuf = formatbuf; 9129 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9130 if (len < 0) 9131 goto onError; 9132 break; 9133 9134 default: 9135 PyErr_Format(PyExc_ValueError, 9136 "unsupported format character '%c' (0x%x) " 9137 "at index %zd", 9138 (31<=c && c<=126) ? 
(char)c : '?', 9139 (int)c, 9140 (Py_ssize_t)(fmt - 1 - 9141 PyUnicode_AS_UNICODE(uformat))); 9142 goto onError; 9143 } 9144 if (sign) { 9145 if (*pbuf == '-' || *pbuf == '+') { 9146 sign = *pbuf++; 9147 len--; 9148 } 9149 else if (flags & F_SIGN) 9150 sign = '+'; 9151 else if (flags & F_BLANK) 9152 sign = ' '; 9153 else 9154 sign = 0; 9155 } 9156 if (width < len) 9157 width = len; 9158 if (rescnt - (sign != 0) < width) { 9159 reslen -= rescnt; 9160 rescnt = width + fmtcnt + 100; 9161 reslen += rescnt; 9162 if (reslen < 0) { 9163 Py_XDECREF(temp); 9164 PyErr_NoMemory(); 9165 goto onError; 9166 } 9167 if (_PyUnicode_Resize(&result, reslen) < 0) { 9168 Py_XDECREF(temp); 9169 goto onError; 9170 } 9171 res = PyUnicode_AS_UNICODE(result) 9172 + reslen - rescnt; 9173 } 9174 if (sign) { 9175 if (fill != ' ') 9176 *res++ = sign; 9177 rescnt--; 9178 if (width > len) 9179 width--; 9180 } 9181 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9182 assert(pbuf[0] == '0'); 9183 assert(pbuf[1] == c); 9184 if (fill != ' ') { 9185 *res++ = *pbuf++; 9186 *res++ = *pbuf++; 9187 } 9188 rescnt -= 2; 9189 width -= 2; 9190 if (width < 0) 9191 width = 0; 9192 len -= 2; 9193 } 9194 if (width > len && !(flags & F_LJUST)) { 9195 do { 9196 --rescnt; 9197 *res++ = fill; 9198 } while (--width > len); 9199 } 9200 if (fill == ' ') { 9201 if (sign) 9202 *res++ = sign; 9203 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9204 assert(pbuf[0] == '0'); 9205 assert(pbuf[1] == c); 9206 *res++ = *pbuf++; 9207 *res++ = *pbuf++; 9208 } 9209 } 9210 Py_UNICODE_COPY(res, pbuf, len); 9211 res += len; 9212 rescnt -= len; 9213 while (--width >= len) { 9214 --rescnt; 9215 *res++ = ' '; 9216 } 9217 if (dict && (argidx < arglen) && c != '%') { 9218 PyErr_SetString(PyExc_TypeError, 9219 "not all arguments converted during string formatting"); 9220 Py_XDECREF(temp); 9221 goto onError; 9222 } 9223 Py_XDECREF(temp); 9224 } /* '%' */ 9225 } /* until end */ 9226 if (argidx < arglen && !dict) { 
9227 PyErr_SetString(PyExc_TypeError, 9228 "not all arguments converted during string formatting"); 9229 goto onError; 9230 } 9231 9232 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9233 goto onError; 9234 if (args_owned) { 9235 Py_DECREF(args); 9236 } 9237 Py_DECREF(uformat); 9238 return (PyObject *)result; 9239 9240 onError: 9241 Py_XDECREF(result); 9242 Py_DECREF(uformat); 9243 if (args_owned) { 9244 Py_DECREF(args); 9245 } 9246 return NULL; 9247} 9248 9249static PyObject * 9250unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9251 9252static PyObject * 9253unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9254{ 9255 PyObject *x = NULL; 9256 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9257 char *encoding = NULL; 9258 char *errors = NULL; 9259 9260 if (type != &PyUnicode_Type) 9261 return unicode_subtype_new(type, args, kwds); 9262 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9263 kwlist, &x, &encoding, &errors)) 9264 return NULL; 9265 if (x == NULL) 9266 return (PyObject *)_PyUnicode_New(0); 9267 if (encoding == NULL && errors == NULL) 9268 return PyObject_Str(x); 9269 else 9270 return PyUnicode_FromEncodedObject(x, encoding, errors); 9271} 9272 9273static PyObject * 9274unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9275{ 9276 PyUnicodeObject *tmp, *pnew; 9277 Py_ssize_t n; 9278 9279 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9280 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9281 if (tmp == NULL) 9282 return NULL; 9283 assert(PyUnicode_Check(tmp)); 9284 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9285 if (pnew == NULL) { 9286 Py_DECREF(tmp); 9287 return NULL; 9288 } 9289 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9290 if (pnew->str == NULL) { 9291 _Py_ForgetReference((PyObject *)pnew); 9292 PyObject_Del(pnew); 9293 Py_DECREF(tmp); 9294 return PyErr_NoMemory(); 9295 } 9296 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9297 pnew->length = n; 9298 pnew->hash = tmp->hash; 9299 Py_DECREF(tmp); 9300 return (PyObject *)pnew; 9301} 9302 9303PyDoc_STRVAR(unicode_doc, 9304"str(string[, encoding[, errors]]) -> str\n\ 9305\n\ 9306Create a new string object from the given encoded string.\n\ 9307encoding defaults to the current default string encoding.\n\ 9308errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9309 9310static PyObject *unicode_iter(PyObject *seq); 9311 9312PyTypeObject PyUnicode_Type = { 9313 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9314 "str", /* tp_name */ 9315 sizeof(PyUnicodeObject), /* tp_size */ 9316 0, /* tp_itemsize */ 9317 /* Slots */ 9318 (destructor)unicode_dealloc, /* tp_dealloc */ 9319 0, /* tp_print */ 9320 0, /* tp_getattr */ 9321 0, /* tp_setattr */ 9322 0, /* tp_compare */ 9323 unicode_repr, /* tp_repr */ 9324 &unicode_as_number, /* tp_as_number */ 9325 &unicode_as_sequence, /* tp_as_sequence */ 9326 &unicode_as_mapping, /* tp_as_mapping */ 9327 (hashfunc) unicode_hash, /* tp_hash*/ 9328 0, /* tp_call*/ 9329 (reprfunc) unicode_str, /* tp_str */ 9330 PyObject_GenericGetAttr, /* tp_getattro */ 9331 0, /* tp_setattro */ 9332 0, /* tp_as_buffer */ 9333 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9334 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9335 unicode_doc, /* tp_doc */ 9336 0, /* tp_traverse */ 9337 0, /* tp_clear */ 9338 PyUnicode_RichCompare, /* tp_richcompare */ 9339 0, /* tp_weaklistoffset */ 9340 unicode_iter, /* tp_iter */ 9341 0, /* tp_iternext */ 9342 unicode_methods, /* tp_methods */ 9343 0, /* tp_members */ 9344 0, /* tp_getset */ 9345 &PyBaseObject_Type, /* tp_base */ 9346 0, /* tp_dict */ 9347 0, /* tp_descr_get */ 9348 0, /* tp_descr_set */ 9349 0, /* tp_dictoffset */ 9350 0, /* tp_init */ 9351 0, /* tp_alloc */ 9352 unicode_new, /* tp_new */ 9353 PyObject_Del, /* tp_free */ 9354}; 9355 9356/* Initialize the Unicode implementation */ 9357 9358void _PyUnicode_Init(void) 9359{ 9360 
int i; 9361 9362 /* XXX - move this array to unicodectype.c ? */ 9363 Py_UNICODE linebreak[] = { 9364 0x000A, /* LINE FEED */ 9365 0x000D, /* CARRIAGE RETURN */ 9366 0x001C, /* FILE SEPARATOR */ 9367 0x001D, /* GROUP SEPARATOR */ 9368 0x001E, /* RECORD SEPARATOR */ 9369 0x0085, /* NEXT LINE */ 9370 0x2028, /* LINE SEPARATOR */ 9371 0x2029, /* PARAGRAPH SEPARATOR */ 9372 }; 9373 9374 /* Init the implementation */ 9375 free_list = NULL; 9376 numfree = 0; 9377 unicode_empty = _PyUnicode_New(0); 9378 if (!unicode_empty) 9379 return; 9380 9381 for (i = 0; i < 256; i++) 9382 unicode_latin1[i] = NULL; 9383 if (PyType_Ready(&PyUnicode_Type) < 0) 9384 Py_FatalError("Can't initialize 'unicode'"); 9385 9386 /* initialize the linebreak bloom filter */ 9387 bloom_linebreak = make_bloom_mask( 9388 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9389 ); 9390 9391 PyType_Ready(&EncodingMapType); 9392} 9393 9394/* Finalize the Unicode implementation */ 9395 9396int 9397PyUnicode_ClearFreeList(void) 9398{ 9399 int freelist_size = numfree; 9400 PyUnicodeObject *u; 9401 9402 for (u = free_list; u != NULL;) { 9403 PyUnicodeObject *v = u; 9404 u = *(PyUnicodeObject **)u; 9405 if (v->str) 9406 PyObject_DEL(v->str); 9407 Py_XDECREF(v->defenc); 9408 PyObject_Del(v); 9409 numfree--; 9410 } 9411 free_list = NULL; 9412 assert(numfree == 0); 9413 return freelist_size; 9414} 9415 9416void 9417_PyUnicode_Fini(void) 9418{ 9419 int i; 9420 9421 Py_XDECREF(unicode_empty); 9422 unicode_empty = NULL; 9423 9424 for (i = 0; i < 256; i++) { 9425 if (unicode_latin1[i]) { 9426 Py_DECREF(unicode_latin1[i]); 9427 unicode_latin1[i] = NULL; 9428 } 9429 } 9430 (void)PyUnicode_ClearFreeList(); 9431} 9432 9433void 9434PyUnicode_InternInPlace(PyObject **p) 9435{ 9436 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9437 PyObject *t; 9438 if (s == NULL || !PyUnicode_Check(s)) 9439 Py_FatalError( 9440 "PyUnicode_InternInPlace: unicode strings only please!"); 9441 /* If it's a subclass, we don't really 
know what putting 9442 it in the interned dict might do. */ 9443 if (!PyUnicode_CheckExact(s)) 9444 return; 9445 if (PyUnicode_CHECK_INTERNED(s)) 9446 return; 9447 if (interned == NULL) { 9448 interned = PyDict_New(); 9449 if (interned == NULL) { 9450 PyErr_Clear(); /* Don't leave an exception */ 9451 return; 9452 } 9453 } 9454 /* It might be that the GetItem call fails even 9455 though the key is present in the dictionary, 9456 namely when this happens during a stack overflow. */ 9457 Py_ALLOW_RECURSION 9458 t = PyDict_GetItem(interned, (PyObject *)s); 9459 Py_END_ALLOW_RECURSION 9460 9461 if (t) { 9462 Py_INCREF(t); 9463 Py_DECREF(*p); 9464 *p = t; 9465 return; 9466 } 9467 9468 PyThreadState_GET()->recursion_critical = 1; 9469 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9470 PyErr_Clear(); 9471 PyThreadState_GET()->recursion_critical = 0; 9472 return; 9473 } 9474 PyThreadState_GET()->recursion_critical = 0; 9475 /* The two references in interned are not counted by refcnt. 
9476 The deallocator will take care of this */ 9477 Py_REFCNT(s) -= 2; 9478 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9479} 9480 9481void 9482PyUnicode_InternImmortal(PyObject **p) 9483{ 9484 PyUnicode_InternInPlace(p); 9485 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9486 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9487 Py_INCREF(*p); 9488 } 9489} 9490 9491PyObject * 9492PyUnicode_InternFromString(const char *cp) 9493{ 9494 PyObject *s = PyUnicode_FromString(cp); 9495 if (s == NULL) 9496 return NULL; 9497 PyUnicode_InternInPlace(&s); 9498 return s; 9499} 9500 9501void _Py_ReleaseInternedUnicodeStrings(void) 9502{ 9503 PyObject *keys; 9504 PyUnicodeObject *s; 9505 Py_ssize_t i, n; 9506 Py_ssize_t immortal_size = 0, mortal_size = 0; 9507 9508 if (interned == NULL || !PyDict_Check(interned)) 9509 return; 9510 keys = PyDict_Keys(interned); 9511 if (keys == NULL || !PyList_Check(keys)) { 9512 PyErr_Clear(); 9513 return; 9514 } 9515 9516 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9517 detector, interned unicode strings are not forcibly deallocated; 9518 rather, we give them their stolen references back, and then clear 9519 and DECREF the interned dict. 
*/ 9520 9521 n = PyList_GET_SIZE(keys); 9522 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9523 n); 9524 for (i = 0; i < n; i++) { 9525 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9526 switch (s->state) { 9527 case SSTATE_NOT_INTERNED: 9528 /* XXX Shouldn't happen */ 9529 break; 9530 case SSTATE_INTERNED_IMMORTAL: 9531 Py_REFCNT(s) += 1; 9532 immortal_size += s->length; 9533 break; 9534 case SSTATE_INTERNED_MORTAL: 9535 Py_REFCNT(s) += 2; 9536 mortal_size += s->length; 9537 break; 9538 default: 9539 Py_FatalError("Inconsistent interned string state."); 9540 } 9541 s->state = SSTATE_NOT_INTERNED; 9542 } 9543 fprintf(stderr, "total size of all interned strings: " 9544 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9545 "mortal/immortal\n", mortal_size, immortal_size); 9546 Py_DECREF(keys); 9547 PyDict_Clear(interned); 9548 Py_DECREF(interned); 9549 interned = NULL; 9550} 9551 9552 9553/********************* Unicode Iterator **************************/ 9554 9555typedef struct { 9556 PyObject_HEAD 9557 Py_ssize_t it_index; 9558 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9559} unicodeiterobject; 9560 9561static void 9562unicodeiter_dealloc(unicodeiterobject *it) 9563{ 9564 _PyObject_GC_UNTRACK(it); 9565 Py_XDECREF(it->it_seq); 9566 PyObject_GC_Del(it); 9567} 9568 9569static int 9570unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9571{ 9572 Py_VISIT(it->it_seq); 9573 return 0; 9574} 9575 9576static PyObject * 9577unicodeiter_next(unicodeiterobject *it) 9578{ 9579 PyUnicodeObject *seq; 9580 PyObject *item; 9581 9582 assert(it != NULL); 9583 seq = it->it_seq; 9584 if (seq == NULL) 9585 return NULL; 9586 assert(PyUnicode_Check(seq)); 9587 9588 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9589 item = PyUnicode_FromUnicode( 9590 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9591 if (item != NULL) 9592 ++it->it_index; 9593 return item; 9594 } 9595 9596 Py_DECREF(seq); 9597 it->it_seq = NULL; 
9598 return NULL; 9599} 9600 9601static PyObject * 9602unicodeiter_len(unicodeiterobject *it) 9603{ 9604 Py_ssize_t len = 0; 9605 if (it->it_seq) 9606 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9607 return PyLong_FromSsize_t(len); 9608} 9609 9610PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9611 9612static PyMethodDef unicodeiter_methods[] = { 9613 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9614 length_hint_doc}, 9615 {NULL, NULL} /* sentinel */ 9616}; 9617 9618PyTypeObject PyUnicodeIter_Type = { 9619 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9620 "str_iterator", /* tp_name */ 9621 sizeof(unicodeiterobject), /* tp_basicsize */ 9622 0, /* tp_itemsize */ 9623 /* methods */ 9624 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9625 0, /* tp_print */ 9626 0, /* tp_getattr */ 9627 0, /* tp_setattr */ 9628 0, /* tp_compare */ 9629 0, /* tp_repr */ 9630 0, /* tp_as_number */ 9631 0, /* tp_as_sequence */ 9632 0, /* tp_as_mapping */ 9633 0, /* tp_hash */ 9634 0, /* tp_call */ 9635 0, /* tp_str */ 9636 PyObject_GenericGetAttr, /* tp_getattro */ 9637 0, /* tp_setattro */ 9638 0, /* tp_as_buffer */ 9639 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9640 0, /* tp_doc */ 9641 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9642 0, /* tp_clear */ 9643 0, /* tp_richcompare */ 9644 0, /* tp_weaklistoffset */ 9645 PyObject_SelfIter, /* tp_iter */ 9646 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9647 unicodeiter_methods, /* tp_methods */ 9648 0, 9649}; 9650 9651static PyObject * 9652unicode_iter(PyObject *seq) 9653{ 9654 unicodeiterobject *it; 9655 9656 if (!PyUnicode_Check(seq)) { 9657 PyErr_BadInternalCall(); 9658 return NULL; 9659 } 9660 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9661 if (it == NULL) 9662 return NULL; 9663 it->it_index = 0; 9664 Py_INCREF(seq); 9665 it->it_seq = (PyUnicodeObject *)seq; 9666 _PyObject_GC_TRACK(it); 9667 return (PyObject *)it; 9668} 9669 
9670size_t 9671Py_UNICODE_strlen(const Py_UNICODE *u) 9672{ 9673 int res = 0; 9674 while(*u++) 9675 res++; 9676 return res; 9677} 9678 9679Py_UNICODE* 9680Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9681{ 9682 Py_UNICODE *u = s1; 9683 while ((*u++ = *s2++)); 9684 return s1; 9685} 9686 9687Py_UNICODE* 9688Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9689{ 9690 Py_UNICODE *u = s1; 9691 while ((*u++ = *s2++)) 9692 if (n-- == 0) 9693 break; 9694 return s1; 9695} 9696 9697int 9698Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9699{ 9700 while (*s1 && *s2 && *s1 == *s2) 9701 s1++, s2++; 9702 if (*s1 && *s2) 9703 return (*s1 < *s2) ? -1 : +1; 9704 if (*s1) 9705 return 1; 9706 if (*s2) 9707 return -1; 9708 return 0; 9709} 9710 9711Py_UNICODE* 9712Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9713{ 9714 const Py_UNICODE *p; 9715 for (p = s; *p; p++) 9716 if (*p == c) 9717 return (Py_UNICODE*)p; 9718 return NULL; 9719} 9720 9721 9722#ifdef __cplusplus 9723} 9724#endif 9725 9726 9727/* 9728Local variables: 9729c-basic-offset: 4 9730indent-tabs-mode: nil 9731End: 9732*/ 9733