unicodeobject.c revision 72b710a59617ebe6dd1c41613d2c7eb81702efd9
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#include "formatter_unicode.h" 50 51#ifdef MS_WINDOWS 52#include <windows.h> 53#endif 54 55/* Limit for the Unicode object free list */ 56 57#define PyUnicode_MAXFREELIST 1024 58 59/* Limit for the Unicode object free list stay alive optimization. 60 61 The implementation will keep allocated Unicode memory intact for 62 all objects on the free list having a size less than this 63 limit. This reduces malloc() overhead for small Unicode objects. 64 65 At worst this will result in PyUnicode_MAXFREELIST * 66 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 67 malloc()-overhead) bytes of unused garbage. 68 69 Setting the limit to 0 effectively turns the feature off. 70 71 Note: This is an experimental feature ! If you get core dumps when 72 using Unicode objects, turn this feature off. 73 74*/ 75 76#define KEEPALIVE_SIZE_LIMIT 9 77 78/* Endianness switches; defaults to little endian */ 79 80#ifdef WORDS_BIGENDIAN 81# define BYTEORDER_IS_BIG_ENDIAN 82#else 83# define BYTEORDER_IS_LITTLE_ENDIAN 84#endif 85 86/* --- Globals ------------------------------------------------------------ 87 88 The globals are initialized by the _PyUnicode_Init() API and should 89 not be used before calling that API. 90 91*/ 92 93 94#ifdef __cplusplus 95extern "C" { 96#endif 97 98/* This dictionary holds all interned unicode strings. Note that references 99 to strings in this dictionary are *not* counted in the string's ob_refcnt. 100 When the interned string reaches a refcnt of 0 the string deallocation 101 function will delete the reference from this dictionary. 102 103 Another way to look at this is that to say that the actual reference 104 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 105*/ 106static PyObject *interned; 107 108/* Free list for Unicode objects */ 109static PyUnicodeObject *free_list; 110static int numfree; 111 112/* The empty Unicode object is shared to improve performance. */ 113static PyUnicodeObject *unicode_empty; 114 115/* Single character Unicode strings in the Latin-1 range are being 116 shared as well. */ 117static PyUnicodeObject *unicode_latin1[256]; 118 119/* Default encoding to use and assume when NULL is passed as encoding 120 parameter; it is fixed to "utf-8". Always use the 121 PyUnicode_GetDefaultEncoding() API to access this global. 122 123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the 124 hard coded default! 125*/ 126static const char unicode_default_encoding[] = "utf-8"; 127 128/* Fast detection of the most frequent whitespace characters */ 129const unsigned char _Py_ascii_whitespace[] = { 130 0, 0, 0, 0, 0, 0, 0, 0, 131// case 0x0009: /* HORIZONTAL TABULATION */ 132// case 0x000A: /* LINE FEED */ 133// case 0x000B: /* VERTICAL TABULATION */ 134// case 0x000C: /* FORM FEED */ 135// case 0x000D: /* CARRIAGE RETURN */ 136 0, 1, 1, 1, 1, 1, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138// case 0x001C: /* FILE SEPARATOR */ 139// case 0x001D: /* GROUP SEPARATOR */ 140// case 0x001E: /* RECORD SEPARATOR */ 141// case 0x001F: /* UNIT SEPARATOR */ 142 0, 0, 0, 0, 1, 1, 1, 1, 143// case 0x0020: /* SPACE */ 144 1, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 0, 0, 0, 0, 0, 0, 0, 0, 147 0, 0, 0, 0, 0, 0, 0, 0, 148 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0, 155 0, 0, 0, 0, 0, 0, 0, 0, 156 0, 0, 0, 0, 0, 0, 0, 0 157}; 158 159/* Same for linebreaks */ 160static unsigned char ascii_linebreak[] = { 161 0, 0, 0, 0, 0, 0, 0, 0, 162// 0x000A, /* LINE FEED */ 163// 0x000D, /* CARRIAGE RETURN */ 164 0, 0, 1, 0, 0, 1, 0, 0, 165 0, 0, 0, 0, 0, 0, 0, 0, 166// 0x001C, /* FILE 
SEPARATOR */ 167// 0x001D, /* GROUP SEPARATOR */ 168// 0x001E, /* RECORD SEPARATOR */ 169 0, 0, 0, 0, 1, 1, 1, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0 183}; 184 185 186Py_UNICODE 187PyUnicode_GetMax(void) 188{ 189#ifdef Py_UNICODE_WIDE 190 return 0x10FFFF; 191#else 192 /* This is actually an illegal character, so it should 193 not be passed to unichr. */ 194 return 0xFFFF; 195#endif 196} 197 198/* --- Bloom Filters ----------------------------------------------------- */ 199 200/* stuff to implement simple "bloom filters" for Unicode characters. 201 to keep things simple, we use a single bitmask, using the least 5 202 bits from each unicode characters as the bit index. */ 203 204/* the linebreak mask is set up by Unicode_Init below */ 205 206#define BLOOM_MASK unsigned long 207 208static BLOOM_MASK bloom_linebreak; 209 210#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 211 212#define BLOOM_LINEBREAK(ch) \ 213 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 215 216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 217{ 218 /* calculate simple bloom-style bitmask for a given unicode string */ 219 220 long mask; 221 Py_ssize_t i; 222 223 mask = 0; 224 for (i = 0; i < len; i++) 225 mask |= (1 << (ptr[i] & 0x1F)); 226 227 return mask; 228} 229 230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 231{ 232 Py_ssize_t i; 233 234 for (i = 0; i < setlen; i++) 235 if (set[i] == chr) 236 return 1; 237 238 return 0; 239} 240 241#define BLOOM_MEMBER(mask, chr, set, setlen)\ 242 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 243 244/* --- Unicode Object ----------------------------------------------------- */ 245 246static 247int unicode_resize(register PyUnicodeObject *unicode, 248 Py_ssize_t length) 249{ 250 void *oldstr; 251 252 /* Shortcut if there's nothing much to do. */ 253 if (unicode->length == length) 254 goto reset; 255 256 /* Resizing shared object (unicode_empty or single character 257 objects) in-place is not allowed. Use PyUnicode_Resize() 258 instead ! */ 259 260 if (unicode == unicode_empty || 261 (unicode->length == 1 && 262 unicode->str[0] < 256U && 263 unicode_latin1[unicode->str[0]] == unicode)) { 264 PyErr_SetString(PyExc_SystemError, 265 "can't resize shared unicode objects"); 266 return -1; 267 } 268 269 /* We allocate one more byte to make sure the string is Ux0000 terminated. 270 The overallocation is also used by fastsearch, which assumes that it's 271 safe to look at str[length] (without making any assumptions about what 272 it contains). 
*/ 273 274 oldstr = unicode->str; 275 unicode->str = PyObject_REALLOC(unicode->str, 276 sizeof(Py_UNICODE) * (length + 1)); 277 if (!unicode->str) { 278 unicode->str = (Py_UNICODE *)oldstr; 279 PyErr_NoMemory(); 280 return -1; 281 } 282 unicode->str[length] = 0; 283 unicode->length = length; 284 285 reset: 286 /* Reset the object caches */ 287 if (unicode->defenc) { 288 Py_DECREF(unicode->defenc); 289 unicode->defenc = NULL; 290 } 291 unicode->hash = -1; 292 293 return 0; 294} 295 296/* We allocate one more byte to make sure the string is 297 Ux0000 terminated; some code (e.g. new_identifier) 298 relies on that. 299 300 XXX This allocator could further be enhanced by assuring that the 301 free list never reduces its size below 1. 302 303*/ 304 305static 306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 307{ 308 register PyUnicodeObject *unicode; 309 310 /* Optimization for empty strings */ 311 if (length == 0 && unicode_empty != NULL) { 312 Py_INCREF(unicode_empty); 313 return unicode_empty; 314 } 315 316 /* Unicode freelist & memory allocation */ 317 if (free_list) { 318 unicode = free_list; 319 free_list = *(PyUnicodeObject **)unicode; 320 numfree--; 321 if (unicode->str) { 322 /* Keep-Alive optimization: we only upsize the buffer, 323 never downsize it. 
*/ 324 if ((unicode->length < length) && 325 unicode_resize(unicode, length) < 0) { 326 PyObject_DEL(unicode->str); 327 goto onError; 328 } 329 } 330 else { 331 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 333 } 334 PyObject_INIT(unicode, &PyUnicode_Type); 335 } 336 else { 337 size_t new_size; 338 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 339 if (unicode == NULL) 340 return NULL; 341 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 342 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 343 } 344 345 if (!unicode->str) { 346 PyErr_NoMemory(); 347 goto onError; 348 } 349 /* Initialize the first element to guard against cases where 350 * the caller fails before initializing str -- unicode_resize() 351 * reads str[0], and the Keep-Alive optimization can keep memory 352 * allocated for str alive across a call to unicode_dealloc(unicode). 353 * We don't want unicode_resize to read uninitialized memory in 354 * that case. 
355 */ 356 unicode->str[0] = 0; 357 unicode->str[length] = 0; 358 unicode->length = length; 359 unicode->hash = -1; 360 unicode->state = 0; 361 unicode->defenc = NULL; 362 return unicode; 363 364 onError: 365 _Py_ForgetReference((PyObject *)unicode); 366 PyObject_Del(unicode); 367 return NULL; 368} 369 370static 371void unicode_dealloc(register PyUnicodeObject *unicode) 372{ 373 switch (PyUnicode_CHECK_INTERNED(unicode)) { 374 case SSTATE_NOT_INTERNED: 375 break; 376 377 case SSTATE_INTERNED_MORTAL: 378 /* revive dead object temporarily for DelItem */ 379 Py_REFCNT(unicode) = 3; 380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 381 Py_FatalError( 382 "deletion of interned unicode string failed"); 383 break; 384 385 case SSTATE_INTERNED_IMMORTAL: 386 Py_FatalError("Immortal interned unicode string died."); 387 388 default: 389 Py_FatalError("Inconsistent interned unicode string state."); 390 } 391 392 if (PyUnicode_CheckExact(unicode) && 393 numfree < PyUnicode_MAXFREELIST) { 394 /* Keep-Alive optimization */ 395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 396 PyObject_DEL(unicode->str); 397 unicode->str = NULL; 398 unicode->length = 0; 399 } 400 if (unicode->defenc) { 401 Py_DECREF(unicode->defenc); 402 unicode->defenc = NULL; 403 } 404 /* Add to free list */ 405 *(PyUnicodeObject **)unicode = free_list; 406 free_list = unicode; 407 numfree++; 408 } 409 else { 410 PyObject_DEL(unicode->str); 411 Py_XDECREF(unicode->defenc); 412 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 413 } 414} 415 416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 417{ 418 register PyUnicodeObject *v; 419 420 /* Argument checks */ 421 if (unicode == NULL) { 422 PyErr_BadInternalCall(); 423 return -1; 424 } 425 v = (PyUnicodeObject *)*unicode; 426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 427 PyErr_BadInternalCall(); 428 return -1; 429 } 430 431 /* Resizing unicode_empty and single character objects is not 432 possible since these 
are being shared. We simply return a fresh 433 copy with the same Unicode content. */ 434 if (v->length != length && 435 (v == unicode_empty || v->length == 1)) { 436 PyUnicodeObject *w = _PyUnicode_New(length); 437 if (w == NULL) 438 return -1; 439 Py_UNICODE_COPY(w->str, v->str, 440 length < v->length ? length : v->length); 441 Py_DECREF(*unicode); 442 *unicode = (PyObject *)w; 443 return 0; 444 } 445 446 /* Note that we don't have to modify *unicode for unshared Unicode 447 objects, since we can modify them in-place. */ 448 return unicode_resize(v, length); 449} 450 451/* Internal API for use in unicodeobject.c only ! */ 452#define _PyUnicode_Resize(unicodevar, length) \ 453 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 454 455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 456 Py_ssize_t size) 457{ 458 PyUnicodeObject *unicode; 459 460 /* If the Unicode data is known at construction time, we can apply 461 some optimizations which share commonly used objects. */ 462 if (u != NULL) { 463 464 /* Optimization for empty strings */ 465 if (size == 0 && unicode_empty != NULL) { 466 Py_INCREF(unicode_empty); 467 return (PyObject *)unicode_empty; 468 } 469 470 /* Single character Unicode objects in the Latin-1 range are 471 shared when using this constructor */ 472 if (size == 1 && *u < 256) { 473 unicode = unicode_latin1[*u]; 474 if (!unicode) { 475 unicode = _PyUnicode_New(1); 476 if (!unicode) 477 return NULL; 478 unicode->str[0] = *u; 479 unicode_latin1[*u] = unicode; 480 } 481 Py_INCREF(unicode); 482 return (PyObject *)unicode; 483 } 484 } 485 486 unicode = _PyUnicode_New(size); 487 if (!unicode) 488 return NULL; 489 490 /* Copy the Unicode data into the new object */ 491 if (u != NULL) 492 Py_UNICODE_COPY(unicode->str, u, size); 493 494 return (PyObject *)unicode; 495} 496 497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 498{ 499 PyUnicodeObject *unicode; 500 501 if (size < 0) { 502 PyErr_SetString(PyExc_SystemError, 503 
"Negative size passed to PyUnicode_FromStringAndSize"); 504 return NULL; 505 } 506 507 /* If the Unicode data is known at construction time, we can apply 508 some optimizations which share commonly used objects. 509 Also, this means the input must be UTF-8, so fall back to the 510 UTF-8 decoder at the end. */ 511 if (u != NULL) { 512 513 /* Optimization for empty strings */ 514 if (size == 0 && unicode_empty != NULL) { 515 Py_INCREF(unicode_empty); 516 return (PyObject *)unicode_empty; 517 } 518 519 /* Single characters are shared when using this constructor. 520 Restrict to ASCII, since the input must be UTF-8. */ 521 if (size == 1 && Py_CHARMASK(*u) < 128) { 522 unicode = unicode_latin1[Py_CHARMASK(*u)]; 523 if (!unicode) { 524 unicode = _PyUnicode_New(1); 525 if (!unicode) 526 return NULL; 527 unicode->str[0] = Py_CHARMASK(*u); 528 unicode_latin1[Py_CHARMASK(*u)] = unicode; 529 } 530 Py_INCREF(unicode); 531 return (PyObject *)unicode; 532 } 533 534 return PyUnicode_DecodeUTF8(u, size, NULL); 535 } 536 537 unicode = _PyUnicode_New(size); 538 if (!unicode) 539 return NULL; 540 541 return (PyObject *)unicode; 542} 543 544PyObject *PyUnicode_FromString(const char *u) 545{ 546 size_t size = strlen(u); 547 if (size > PY_SSIZE_T_MAX) { 548 PyErr_SetString(PyExc_OverflowError, "input too long"); 549 return NULL; 550 } 551 552 return PyUnicode_FromStringAndSize(u, size); 553} 554 555#ifdef HAVE_WCHAR_H 556 557PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 558 Py_ssize_t size) 559{ 560 PyUnicodeObject *unicode; 561 562 if (w == NULL) { 563 if (size == 0) 564 return PyUnicode_FromStringAndSize(NULL, 0); 565 PyErr_BadInternalCall(); 566 return NULL; 567 } 568 569 if (size == -1) { 570 size = wcslen(w); 571 } 572 573 unicode = _PyUnicode_New(size); 574 if (!unicode) 575 return NULL; 576 577 /* Copy the wchar_t data into the new object */ 578#ifdef HAVE_USABLE_WCHAR_T 579 memcpy(unicode->str, w, size * sizeof(wchar_t)); 580#else 581 { 582 register Py_UNICODE *u; 
583 register Py_ssize_t i; 584 u = PyUnicode_AS_UNICODE(unicode); 585 for (i = size; i > 0; i--) 586 *u++ = *w++; 587 } 588#endif 589 590 return (PyObject *)unicode; 591} 592 593static void 594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 595{ 596 *fmt++ = '%'; 597 if (width) { 598 if (zeropad) 599 *fmt++ = '0'; 600 fmt += sprintf(fmt, "%d", width); 601 } 602 if (precision) 603 fmt += sprintf(fmt, ".%d", precision); 604 if (longflag) 605 *fmt++ = 'l'; 606 else if (size_tflag) { 607 char *f = PY_FORMAT_SIZE_T; 608 while (*f) 609 *fmt++ = *f++; 610 } 611 *fmt++ = c; 612 *fmt = '\0'; 613} 614 615#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 616 617PyObject * 618PyUnicode_FromFormatV(const char *format, va_list vargs) 619{ 620 va_list count; 621 Py_ssize_t callcount = 0; 622 PyObject **callresults = NULL; 623 PyObject **callresult = NULL; 624 Py_ssize_t n = 0; 625 int width = 0; 626 int precision = 0; 627 int zeropad; 628 const char* f; 629 Py_UNICODE *s; 630 PyObject *string; 631 /* used by sprintf */ 632 char buffer[21]; 633 /* use abuffer instead of buffer, if we need more space 634 * (which can happen if there's a format specifier with width). 
*/ 635 char *abuffer = NULL; 636 char *realbuffer; 637 Py_ssize_t abuffersize = 0; 638 char fmt[60]; /* should be enough for %0width.precisionld */ 639 const char *copy; 640 641#ifdef VA_LIST_IS_ARRAY 642 Py_MEMCPY(count, vargs, sizeof(va_list)); 643#else 644#ifdef __va_copy 645 __va_copy(count, vargs); 646#else 647 count = vargs; 648#endif 649#endif 650 /* step 1: count the number of %S/%R format specifications 651 * (we call PyObject_Str()/PyObject_Repr() for these objects 652 * once during step 3 and put the result in an array) */ 653 for (f = format; *f; f++) { 654 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) 655 ++callcount; 656 } 657 /* step 2: allocate memory for the results of 658 * PyObject_Str()/PyObject_Repr() calls */ 659 if (callcount) { 660 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 661 if (!callresults) { 662 PyErr_NoMemory(); 663 return NULL; 664 } 665 callresult = callresults; 666 } 667 /* step 3: figure out how large a buffer we need */ 668 for (f = format; *f; f++) { 669 if (*f == '%') { 670 const char* p = f; 671 width = 0; 672 while (ISDIGIT((unsigned)*f)) 673 width = (width*10) + *f++ - '0'; 674 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 675 ; 676 677 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 678 * they don't affect the amount of space we reserve. 679 */ 680 if ((*f == 'l' || *f == 'z') && 681 (f[1] == 'd' || f[1] == 'u')) 682 ++f; 683 684 switch (*f) { 685 case 'c': 686 (void)va_arg(count, int); 687 /* fall through... */ 688 case '%': 689 n++; 690 break; 691 case 'd': case 'u': case 'i': case 'x': 692 (void) va_arg(count, int); 693 /* 20 bytes is enough to hold a 64-bit 694 integer. Decimal takes the most space. 695 This isn't enough for octal. 696 If a width is specified we need more 697 (which we allocate later). 
*/ 698 if (width < 20) 699 width = 20; 700 n += width; 701 if (abuffersize < width) 702 abuffersize = width; 703 break; 704 case 's': 705 { 706 /* UTF-8 */ 707 unsigned char*s; 708 s = va_arg(count, unsigned char*); 709 while (*s) { 710 if (*s < 128) { 711 n++; s++; 712 } else if (*s < 0xc0) { 713 /* invalid UTF-8 */ 714 n++; s++; 715 } else if (*s < 0xc0) { 716 n++; 717 s++; if(!*s)break; 718 s++; 719 } else if (*s < 0xe0) { 720 n++; 721 s++; if(!*s)break; 722 s++; if(!*s)break; 723 s++; 724 } else { 725 #ifdef Py_UNICODE_WIDE 726 n++; 727 #else 728 n+=2; 729 #endif 730 s++; if(!*s)break; 731 s++; if(!*s)break; 732 s++; if(!*s)break; 733 s++; 734 } 735 } 736 break; 737 } 738 case 'U': 739 { 740 PyObject *obj = va_arg(count, PyObject *); 741 assert(obj && PyUnicode_Check(obj)); 742 n += PyUnicode_GET_SIZE(obj); 743 break; 744 } 745 case 'V': 746 { 747 PyObject *obj = va_arg(count, PyObject *); 748 const char *str = va_arg(count, const char *); 749 assert(obj || str); 750 assert(!obj || PyUnicode_Check(obj)); 751 if (obj) 752 n += PyUnicode_GET_SIZE(obj); 753 else 754 n += strlen(str); 755 break; 756 } 757 case 'S': 758 { 759 PyObject *obj = va_arg(count, PyObject *); 760 PyObject *str; 761 assert(obj); 762 str = PyObject_Str(obj); 763 if (!str) 764 goto fail; 765 n += PyUnicode_GET_SIZE(str); 766 /* Remember the str and switch to the next slot */ 767 *callresult++ = str; 768 break; 769 } 770 case 'R': 771 { 772 PyObject *obj = va_arg(count, PyObject *); 773 PyObject *repr; 774 assert(obj); 775 repr = PyObject_Repr(obj); 776 if (!repr) 777 goto fail; 778 n += PyUnicode_GET_SIZE(repr); 779 /* Remember the repr and switch to the next slot */ 780 *callresult++ = repr; 781 break; 782 } 783 case 'p': 784 (void) va_arg(count, int); 785 /* maximum 64-bit pointer representation: 786 * 0xffffffffffffffff 787 * so 19 characters is enough. 788 * XXX I count 18 -- what's the extra for? 
789 */ 790 n += 19; 791 break; 792 default: 793 /* if we stumble upon an unknown 794 formatting code, copy the rest of 795 the format string to the output 796 string. (we cannot just skip the 797 code, since there's no way to know 798 what's in the argument list) */ 799 n += strlen(p); 800 goto expand; 801 } 802 } else 803 n++; 804 } 805 expand: 806 if (abuffersize > 20) { 807 abuffer = PyObject_Malloc(abuffersize); 808 if (!abuffer) { 809 PyErr_NoMemory(); 810 goto fail; 811 } 812 realbuffer = abuffer; 813 } 814 else 815 realbuffer = buffer; 816 /* step 4: fill the buffer */ 817 /* Since we've analyzed how much space we need for the worst case, 818 we don't have to resize the string. 819 There can be no errors beyond this point. */ 820 string = PyUnicode_FromUnicode(NULL, n); 821 if (!string) 822 goto fail; 823 824 s = PyUnicode_AS_UNICODE(string); 825 callresult = callresults; 826 827 for (f = format; *f; f++) { 828 if (*f == '%') { 829 const char* p = f++; 830 int longflag = 0; 831 int size_tflag = 0; 832 zeropad = (*f == '0'); 833 /* parse the width.precision part */ 834 width = 0; 835 while (ISDIGIT((unsigned)*f)) 836 width = (width*10) + *f++ - '0'; 837 precision = 0; 838 if (*f == '.') { 839 f++; 840 while (ISDIGIT((unsigned)*f)) 841 precision = (precision*10) + *f++ - '0'; 842 } 843 /* handle the long flag, but only for %ld and %lu. 844 others can be added when necessary. */ 845 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 846 longflag = 1; 847 ++f; 848 } 849 /* handle the size_t flag. 
*/ 850 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 851 size_tflag = 1; 852 ++f; 853 } 854 855 switch (*f) { 856 case 'c': 857 *s++ = va_arg(vargs, int); 858 break; 859 case 'd': 860 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 861 if (longflag) 862 sprintf(realbuffer, fmt, va_arg(vargs, long)); 863 else if (size_tflag) 864 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 865 else 866 sprintf(realbuffer, fmt, va_arg(vargs, int)); 867 appendstring(realbuffer); 868 break; 869 case 'u': 870 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 871 if (longflag) 872 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 873 else if (size_tflag) 874 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 875 else 876 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 877 appendstring(realbuffer); 878 break; 879 case 'i': 880 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 881 sprintf(realbuffer, fmt, va_arg(vargs, int)); 882 appendstring(realbuffer); 883 break; 884 case 'x': 885 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 886 sprintf(realbuffer, fmt, va_arg(vargs, int)); 887 appendstring(realbuffer); 888 break; 889 case 's': 890 { 891 /* Parameter must be UTF-8 encoded. 892 In case of encoding errors, use 893 the replacement character. 
*/ 894 PyObject *u; 895 p = va_arg(vargs, char*); 896 u = PyUnicode_DecodeUTF8(p, strlen(p), 897 "replace"); 898 if (!u) 899 goto fail; 900 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 901 PyUnicode_GET_SIZE(u)); 902 s += PyUnicode_GET_SIZE(u); 903 Py_DECREF(u); 904 break; 905 } 906 case 'U': 907 { 908 PyObject *obj = va_arg(vargs, PyObject *); 909 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 910 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 911 s += size; 912 break; 913 } 914 case 'V': 915 { 916 PyObject *obj = va_arg(vargs, PyObject *); 917 const char *str = va_arg(vargs, const char *); 918 if (obj) { 919 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 920 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 921 s += size; 922 } else { 923 appendstring(str); 924 } 925 break; 926 } 927 case 'S': 928 case 'R': 929 { 930 Py_UNICODE *ucopy; 931 Py_ssize_t usize; 932 Py_ssize_t upos; 933 /* unused, since we already have the result */ 934 (void) va_arg(vargs, PyObject *); 935 ucopy = PyUnicode_AS_UNICODE(*callresult); 936 usize = PyUnicode_GET_SIZE(*callresult); 937 for (upos = 0; upos<usize;) 938 *s++ = ucopy[upos++]; 939 /* We're done with the unicode()/repr() => forget it */ 940 Py_DECREF(*callresult); 941 /* switch to next unicode()/repr() result */ 942 ++callresult; 943 break; 944 } 945 case 'p': 946 sprintf(buffer, "%p", va_arg(vargs, void*)); 947 /* %p is ill-defined: ensure leading 0x. 
*/ 948 if (buffer[1] == 'X') 949 buffer[1] = 'x'; 950 else if (buffer[1] != 'x') { 951 memmove(buffer+2, buffer, strlen(buffer)+1); 952 buffer[0] = '0'; 953 buffer[1] = 'x'; 954 } 955 appendstring(buffer); 956 break; 957 case '%': 958 *s++ = '%'; 959 break; 960 default: 961 appendstring(p); 962 goto end; 963 } 964 } else 965 *s++ = *f; 966 } 967 968 end: 969 if (callresults) 970 PyObject_Free(callresults); 971 if (abuffer) 972 PyObject_Free(abuffer); 973 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 974 return string; 975 fail: 976 if (callresults) { 977 PyObject **callresult2 = callresults; 978 while (callresult2 < callresult) { 979 Py_DECREF(*callresult2); 980 ++callresult2; 981 } 982 PyObject_Free(callresults); 983 } 984 if (abuffer) 985 PyObject_Free(abuffer); 986 return NULL; 987} 988 989#undef appendstring 990 991PyObject * 992PyUnicode_FromFormat(const char *format, ...) 993{ 994 PyObject* ret; 995 va_list vargs; 996 997#ifdef HAVE_STDARG_PROTOTYPES 998 va_start(vargs, format); 999#else 1000 va_start(vargs); 1001#endif 1002 ret = PyUnicode_FromFormatV(format, vargs); 1003 va_end(vargs); 1004 return ret; 1005} 1006 1007Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1008 wchar_t *w, 1009 Py_ssize_t size) 1010{ 1011 if (unicode == NULL) { 1012 PyErr_BadInternalCall(); 1013 return -1; 1014 } 1015 1016 /* If possible, try to copy the 0-termination as well */ 1017 if (size > PyUnicode_GET_SIZE(unicode)) 1018 size = PyUnicode_GET_SIZE(unicode) + 1; 1019 1020#ifdef HAVE_USABLE_WCHAR_T 1021 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1022#else 1023 { 1024 register Py_UNICODE *u; 1025 register Py_ssize_t i; 1026 u = PyUnicode_AS_UNICODE(unicode); 1027 for (i = size; i > 0; i--) 1028 *w++ = *u++; 1029 } 1030#endif 1031 1032 if (size > PyUnicode_GET_SIZE(unicode)) 1033 return PyUnicode_GET_SIZE(unicode); 1034 else 1035 return size; 1036} 1037 1038#endif 1039 1040PyObject *PyUnicode_FromOrdinal(int ordinal) 1041{ 1042 Py_UNICODE s[2]; 1043 
1044 if (ordinal < 0 || ordinal > 0x10ffff) { 1045 PyErr_SetString(PyExc_ValueError, 1046 "chr() arg not in range(0x110000)"); 1047 return NULL; 1048 } 1049 1050#ifndef Py_UNICODE_WIDE 1051 if (ordinal > 0xffff) { 1052 ordinal -= 0x10000; 1053 s[0] = 0xD800 | (ordinal >> 10); 1054 s[1] = 0xDC00 | (ordinal & 0x3FF); 1055 return PyUnicode_FromUnicode(s, 2); 1056 } 1057#endif 1058 1059 s[0] = (Py_UNICODE)ordinal; 1060 return PyUnicode_FromUnicode(s, 1); 1061} 1062 1063PyObject *PyUnicode_FromObject(register PyObject *obj) 1064{ 1065 /* XXX Perhaps we should make this API an alias of 1066 PyObject_Str() instead ?! */ 1067 if (PyUnicode_CheckExact(obj)) { 1068 Py_INCREF(obj); 1069 return obj; 1070 } 1071 if (PyUnicode_Check(obj)) { 1072 /* For a Unicode subtype that's not a Unicode object, 1073 return a true Unicode object with the same data. */ 1074 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1075 PyUnicode_GET_SIZE(obj)); 1076 } 1077 PyErr_Format(PyExc_TypeError, 1078 "Can't convert '%.100s' object to str implicitly", 1079 Py_TYPE(obj)->tp_name); 1080 return NULL; 1081} 1082 1083PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1084 const char *encoding, 1085 const char *errors) 1086{ 1087 const char *s = NULL; 1088 Py_ssize_t len; 1089 PyObject *v; 1090 1091 if (obj == NULL) { 1092 PyErr_BadInternalCall(); 1093 return NULL; 1094 } 1095 1096 if (PyUnicode_Check(obj)) { 1097 PyErr_SetString(PyExc_TypeError, 1098 "decoding Unicode is not supported"); 1099 return NULL; 1100 } 1101 1102 /* Coerce object */ 1103 if (PyBytes_Check(obj)) { 1104 s = PyBytes_AS_STRING(obj); 1105 len = PyBytes_GET_SIZE(obj); 1106 } 1107 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1108 /* Overwrite the error message with something more useful in 1109 case of a TypeError. 
*/ 1110 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1111 PyErr_Format(PyExc_TypeError, 1112 "coercing to Unicode: need string or buffer, " 1113 "%.80s found", 1114 Py_TYPE(obj)->tp_name); 1115 goto onError; 1116 } 1117 1118 /* Convert to Unicode */ 1119 if (len == 0) { 1120 Py_INCREF(unicode_empty); 1121 v = (PyObject *)unicode_empty; 1122 } 1123 else 1124 v = PyUnicode_Decode(s, len, encoding, errors); 1125 1126 return v; 1127 1128 onError: 1129 return NULL; 1130} 1131 1132PyObject *PyUnicode_Decode(const char *s, 1133 Py_ssize_t size, 1134 const char *encoding, 1135 const char *errors) 1136{ 1137 PyObject *buffer = NULL, *unicode; 1138 Py_buffer info; 1139 char lower[20]; /* Enough for any encoding name we recognize */ 1140 char *l; 1141 const char *e; 1142 1143 if (encoding == NULL) 1144 encoding = PyUnicode_GetDefaultEncoding(); 1145 1146 /* Convert encoding to lower case and replace '_' with '-' in order to 1147 catch e.g. UTF_8 */ 1148 e = encoding; 1149 l = lower; 1150 while (*e && l < &lower[(sizeof lower) - 2]) { 1151 if (ISUPPER(*e)) { 1152 *l++ = TOLOWER(*e++); 1153 } 1154 else if (*e == '_') { 1155 *l++ = '-'; 1156 e++; 1157 } 1158 else { 1159 *l++ = *e++; 1160 } 1161 } 1162 *l = '\0'; 1163 1164 /* Shortcuts for common default encodings */ 1165 if (strcmp(lower, "utf-8") == 0) 1166 return PyUnicode_DecodeUTF8(s, size, errors); 1167 else if ((strcmp(lower, "latin-1") == 0) || 1168 (strcmp(lower, "iso-8859-1") == 0)) 1169 return PyUnicode_DecodeLatin1(s, size, errors); 1170#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1171 else if (strcmp(lower, "mbcs") == 0) 1172 return PyUnicode_DecodeMBCS(s, size, errors); 1173#endif 1174 else if (strcmp(lower, "ascii") == 0) 1175 return PyUnicode_DecodeASCII(s, size, errors); 1176 else if (strcmp(lower, "utf-16") == 0) 1177 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1178 else if (strcmp(lower, "utf-32") == 0) 1179 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1180 1181 /* Decode via the codec 
registry */ 1182 buffer = NULL; 1183 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0) 1184 goto onError; 1185 buffer = PyMemoryView_FromMemory(&info); 1186 if (buffer == NULL) 1187 goto onError; 1188 unicode = PyCodec_Decode(buffer, encoding, errors); 1189 if (unicode == NULL) 1190 goto onError; 1191 if (!PyUnicode_Check(unicode)) { 1192 PyErr_Format(PyExc_TypeError, 1193 "decoder did not return an unicode object (type=%.400s)", 1194 Py_TYPE(unicode)->tp_name); 1195 Py_DECREF(unicode); 1196 goto onError; 1197 } 1198 Py_DECREF(buffer); 1199 return unicode; 1200 1201 onError: 1202 Py_XDECREF(buffer); 1203 return NULL; 1204} 1205 1206PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1207 const char *encoding, 1208 const char *errors) 1209{ 1210 PyObject *v; 1211 1212 if (!PyUnicode_Check(unicode)) { 1213 PyErr_BadArgument(); 1214 goto onError; 1215 } 1216 1217 if (encoding == NULL) 1218 encoding = PyUnicode_GetDefaultEncoding(); 1219 1220 /* Decode via the codec registry */ 1221 v = PyCodec_Decode(unicode, encoding, errors); 1222 if (v == NULL) 1223 goto onError; 1224 return v; 1225 1226 onError: 1227 return NULL; 1228} 1229 1230PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1231 Py_ssize_t size, 1232 const char *encoding, 1233 const char *errors) 1234{ 1235 PyObject *v, *unicode; 1236 1237 unicode = PyUnicode_FromUnicode(s, size); 1238 if (unicode == NULL) 1239 return NULL; 1240 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1241 Py_DECREF(unicode); 1242 return v; 1243} 1244 1245PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1246 const char *encoding, 1247 const char *errors) 1248{ 1249 PyObject *v; 1250 1251 if (!PyUnicode_Check(unicode)) { 1252 PyErr_BadArgument(); 1253 goto onError; 1254 } 1255 1256 if (encoding == NULL) 1257 encoding = PyUnicode_GetDefaultEncoding(); 1258 1259 /* Encode via the codec registry */ 1260 v = PyCodec_Encode(unicode, encoding, errors); 1261 if (v == NULL) 1262 goto onError; 1263 return v; 
1264 1265 onError: 1266 return NULL; 1267} 1268 1269PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1270 const char *encoding, 1271 const char *errors) 1272{ 1273 PyObject *v; 1274 1275 if (!PyUnicode_Check(unicode)) { 1276 PyErr_BadArgument(); 1277 goto onError; 1278 } 1279 1280 if (encoding == NULL) 1281 encoding = PyUnicode_GetDefaultEncoding(); 1282 1283 /* Shortcuts for common default encodings */ 1284 if (errors == NULL) { 1285 if (strcmp(encoding, "utf-8") == 0) 1286 return PyUnicode_AsUTF8String(unicode); 1287 else if (strcmp(encoding, "latin-1") == 0) 1288 return PyUnicode_AsLatin1String(unicode); 1289#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1290 else if (strcmp(encoding, "mbcs") == 0) 1291 return PyUnicode_AsMBCSString(unicode); 1292#endif 1293 else if (strcmp(encoding, "ascii") == 0) 1294 return PyUnicode_AsASCIIString(unicode); 1295 } 1296 1297 /* Encode via the codec registry */ 1298 v = PyCodec_Encode(unicode, encoding, errors); 1299 if (v == NULL) 1300 goto onError; 1301 assert(PyBytes_Check(v)); 1302 return v; 1303 1304 onError: 1305 return NULL; 1306} 1307 1308PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1309 const char *errors) 1310{ 1311 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1312 if (v) 1313 return v; 1314 if (errors != NULL) 1315 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1316 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1317 PyUnicode_GET_SIZE(unicode), 1318 NULL); 1319 if (!v) 1320 return NULL; 1321 ((PyUnicodeObject *)unicode)->defenc = v; 1322 return v; 1323} 1324 1325PyObject* 1326PyUnicode_DecodeFSDefault(const char *s) { 1327 Py_ssize_t size = (Py_ssize_t)strlen(s); 1328 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1329} 1330 1331PyObject* 1332PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1333{ 1334 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1335 can be undefined. 
If it is case, decode using UTF-8. The following assumes 1336 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1337 bootstrapping process where the codecs aren't ready yet. 1338 */ 1339 if (Py_FileSystemDefaultEncoding) { 1340#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1341 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1342 return PyUnicode_DecodeMBCS(s, size, "replace"); 1343 } 1344#elif defined(__APPLE__) 1345 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1346 return PyUnicode_DecodeUTF8(s, size, "replace"); 1347 } 1348#endif 1349 return PyUnicode_Decode(s, size, 1350 Py_FileSystemDefaultEncoding, 1351 "replace"); 1352 } 1353 else { 1354 return PyUnicode_DecodeUTF8(s, size, "replace"); 1355 } 1356} 1357 1358char* 1359PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1360{ 1361 PyObject *bytes; 1362 if (!PyUnicode_Check(unicode)) { 1363 PyErr_BadArgument(); 1364 return NULL; 1365 } 1366 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1367 if (bytes == NULL) 1368 return NULL; 1369 if (psize != NULL) 1370 *psize = PyBytes_GET_SIZE(bytes); 1371 return PyBytes_AS_STRING(bytes); 1372} 1373 1374char* 1375PyUnicode_AsString(PyObject *unicode) 1376{ 1377 return PyUnicode_AsStringAndSize(unicode, NULL); 1378} 1379 1380Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1381{ 1382 if (!PyUnicode_Check(unicode)) { 1383 PyErr_BadArgument(); 1384 goto onError; 1385 } 1386 return PyUnicode_AS_UNICODE(unicode); 1387 1388 onError: 1389 return NULL; 1390} 1391 1392Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1393{ 1394 if (!PyUnicode_Check(unicode)) { 1395 PyErr_BadArgument(); 1396 goto onError; 1397 } 1398 return PyUnicode_GET_SIZE(unicode); 1399 1400 onError: 1401 return -1; 1402} 1403 1404const char *PyUnicode_GetDefaultEncoding(void) 1405{ 1406 return unicode_default_encoding; 1407} 1408 1409int PyUnicode_SetDefaultEncoding(const char *encoding) 1410{ 1411 if (strcmp(encoding, 
unicode_default_encoding) != 0) { 1412 PyErr_Format(PyExc_ValueError, 1413 "Can only set default encoding to %s", 1414 unicode_default_encoding); 1415 return -1; 1416 } 1417 return 0; 1418} 1419 1420/* error handling callback helper: 1421 build arguments, call the callback and check the arguments, 1422 if no exception occurred, copy the replacement to the output 1423 and adjust various state variables. 1424 return 0 on success, -1 on error 1425*/ 1426 1427static 1428int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1429 const char *encoding, const char *reason, 1430 const char **input, const char **inend, Py_ssize_t *startinpos, 1431 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1432 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1433{ 1434 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1435 1436 PyObject *restuple = NULL; 1437 PyObject *repunicode = NULL; 1438 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1439 Py_ssize_t insize; 1440 Py_ssize_t requiredsize; 1441 Py_ssize_t newpos; 1442 Py_UNICODE *repptr; 1443 PyObject *inputobj = NULL; 1444 Py_ssize_t repsize; 1445 int res = -1; 1446 1447 if (*errorHandler == NULL) { 1448 *errorHandler = PyCodec_LookupError(errors); 1449 if (*errorHandler == NULL) 1450 goto onError; 1451 } 1452 1453 if (*exceptionObject == NULL) { 1454 *exceptionObject = PyUnicodeDecodeError_Create( 1455 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1456 if (*exceptionObject == NULL) 1457 goto onError; 1458 } 1459 else { 1460 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1461 goto onError; 1462 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1463 goto onError; 1464 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1465 goto onError; 1466 } 1467 1468 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1469 if (restuple == NULL) 1470 goto 
onError; 1471 if (!PyTuple_Check(restuple)) { 1472 PyErr_Format(PyExc_TypeError, &argparse[4]); 1473 goto onError; 1474 } 1475 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1476 goto onError; 1477 1478 /* Copy back the bytes variables, which might have been modified by the 1479 callback */ 1480 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1481 if (!inputobj) 1482 goto onError; 1483 if (!PyBytes_Check(inputobj)) { 1484 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1485 } 1486 *input = PyBytes_AS_STRING(inputobj); 1487 insize = PyBytes_GET_SIZE(inputobj); 1488 *inend = *input + insize; 1489 /* we can DECREF safely, as the exception has another reference, 1490 so the object won't go away. */ 1491 Py_DECREF(inputobj); 1492 1493 if (newpos<0) 1494 newpos = insize+newpos; 1495 if (newpos<0 || newpos>insize) { 1496 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1497 goto onError; 1498 } 1499 1500 /* need more space? (at least enough for what we 1501 have+the replacement+the rest of the string (starting 1502 at the new input position), so we won't have to check space 1503 when there are no errors in the rest of the string) */ 1504 repptr = PyUnicode_AS_UNICODE(repunicode); 1505 repsize = PyUnicode_GET_SIZE(repunicode); 1506 requiredsize = *outpos + repsize + insize-newpos; 1507 if (requiredsize > outsize) { 1508 if (requiredsize<2*outsize) 1509 requiredsize = 2*outsize; 1510 if (PyUnicode_Resize(output, requiredsize) < 0) 1511 goto onError; 1512 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1513 } 1514 *endinpos = newpos; 1515 *inptr = *input + newpos; 1516 Py_UNICODE_COPY(*outptr, repptr, repsize); 1517 *outptr += repsize; 1518 *outpos += repsize; 1519 1520 /* we made it! 
*/ 1521 res = 0; 1522 1523 onError: 1524 Py_XDECREF(restuple); 1525 return res; 1526} 1527 1528/* --- UTF-7 Codec -------------------------------------------------------- */ 1529 1530/* see RFC2152 for details */ 1531 1532static 1533char utf7_special[128] = { 1534 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1535 encoded: 1536 0 - not special 1537 1 - special 1538 2 - whitespace (optional) 1539 3 - RFC2152 Set O (optional) */ 1540 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1541 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1542 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1544 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1546 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1548 1549}; 1550 1551/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1552 warnings about the comparison always being false; since 1553 utf7_special[0] is 1, we can safely make that one comparison 1554 true */ 1555 1556#define SPECIAL(c, encodeO, encodeWS) \ 1557 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1558 (encodeWS && (utf7_special[(c)] == 2)) || \ 1559 (encodeO && (utf7_special[(c)] == 3))) 1560 1561#define B64(n) \ 1562 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1563#define B64CHAR(c) \ 1564 (ISALNUM(c) || (c) == '+' || (c) == '/') 1565#define UB64(c) \ 1566 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1567 (c) - 71 : (c) >= 'A' ? 
(c) - 65 : (c) + 4 ) 1568 1569#define ENCODE(out, ch, bits) \ 1570 while (bits >= 6) { \ 1571 *out++ = B64(ch >> (bits-6)); \ 1572 bits -= 6; \ 1573 } 1574 1575#define DECODE(out, ch, bits, surrogate) \ 1576 while (bits >= 16) { \ 1577 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1578 bits -= 16; \ 1579 if (surrogate) { \ 1580 /* We have already generated an error for the high surrogate \ 1581 so let's not bother seeing if the low surrogate is correct or not */ \ 1582 surrogate = 0; \ 1583 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1584 /* This is a surrogate pair. Unfortunately we can't represent \ 1585 it in a 16-bit character */ \ 1586 surrogate = 1; \ 1587 errmsg = "code pairs are not supported"; \ 1588 goto utf7Error; \ 1589 } else { \ 1590 *out++ = outCh; \ 1591 } \ 1592 } 1593 1594PyObject *PyUnicode_DecodeUTF7(const char *s, 1595 Py_ssize_t size, 1596 const char *errors) 1597{ 1598 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1599} 1600 1601PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1602 Py_ssize_t size, 1603 const char *errors, 1604 Py_ssize_t *consumed) 1605{ 1606 const char *starts = s; 1607 Py_ssize_t startinpos; 1608 Py_ssize_t endinpos; 1609 Py_ssize_t outpos; 1610 const char *e; 1611 PyUnicodeObject *unicode; 1612 Py_UNICODE *p; 1613 const char *errmsg = ""; 1614 int inShift = 0; 1615 unsigned int bitsleft = 0; 1616 unsigned long charsleft = 0; 1617 int surrogate = 0; 1618 PyObject *errorHandler = NULL; 1619 PyObject *exc = NULL; 1620 1621 unicode = _PyUnicode_New(size); 1622 if (!unicode) 1623 return NULL; 1624 if (size == 0) { 1625 if (consumed) 1626 *consumed = 0; 1627 return (PyObject *)unicode; 1628 } 1629 1630 p = unicode->str; 1631 e = s + size; 1632 1633 while (s < e) { 1634 Py_UNICODE ch; 1635 restart: 1636 ch = *s; 1637 1638 if (inShift) { 1639 if ((ch == '-') || !B64CHAR(ch)) { 1640 inShift = 0; 1641 s++; 1642 1643 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, 
surrogate); 1644 if (bitsleft >= 6) { 1645 /* The shift sequence has a partial character in it. If 1646 bitsleft < 6 then we could just classify it as padding 1647 but that is not the case here */ 1648 1649 errmsg = "partial character in shift sequence"; 1650 goto utf7Error; 1651 } 1652 /* According to RFC2152 the remaining bits should be zero. We 1653 choose to signal an error/insert a replacement character 1654 here so indicate the potential of a misencoded character. */ 1655 1656 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1657 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1658 errmsg = "non-zero padding bits in shift sequence"; 1659 goto utf7Error; 1660 } 1661 1662 if (ch == '-') { 1663 if ((s < e) && (*(s) == '-')) { 1664 *p++ = '-'; 1665 inShift = 1; 1666 } 1667 } else if (SPECIAL(ch,0,0)) { 1668 errmsg = "unexpected special character"; 1669 goto utf7Error; 1670 } else { 1671 *p++ = ch; 1672 } 1673 } else { 1674 charsleft = (charsleft << 6) | UB64(ch); 1675 bitsleft += 6; 1676 s++; 1677 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1678 } 1679 } 1680 else if ( ch == '+' ) { 1681 startinpos = s-starts; 1682 s++; 1683 if (s < e && *s == '-') { 1684 s++; 1685 *p++ = '+'; 1686 } else 1687 { 1688 inShift = 1; 1689 bitsleft = 0; 1690 } 1691 } 1692 else if (SPECIAL(ch,0,0)) { 1693 startinpos = s-starts; 1694 errmsg = "unexpected special character"; 1695 s++; 1696 goto utf7Error; 1697 } 1698 else { 1699 *p++ = ch; 1700 s++; 1701 } 1702 continue; 1703 utf7Error: 1704 outpos = p-PyUnicode_AS_UNICODE(unicode); 1705 endinpos = s-starts; 1706 if (unicode_decode_call_errorhandler( 1707 errors, &errorHandler, 1708 "utf7", errmsg, 1709 &starts, &e, &startinpos, &endinpos, &exc, &s, 1710 (PyObject **)&unicode, &outpos, &p)) 1711 goto onError; 1712 } 1713 1714 if (inShift && !consumed) { 1715 outpos = p-PyUnicode_AS_UNICODE(unicode); 1716 endinpos = size; 1717 if 
(unicode_decode_call_errorhandler( 1718 errors, &errorHandler, 1719 "utf7", "unterminated shift sequence", 1720 &starts, &e, &startinpos, &endinpos, &exc, &s, 1721 (PyObject **)&unicode, &outpos, &p)) 1722 goto onError; 1723 if (s < e) 1724 goto restart; 1725 } 1726 if (consumed) { 1727 if(inShift) 1728 *consumed = startinpos; 1729 else 1730 *consumed = s-starts; 1731 } 1732 1733 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1734 goto onError; 1735 1736 Py_XDECREF(errorHandler); 1737 Py_XDECREF(exc); 1738 return (PyObject *)unicode; 1739 1740onError: 1741 Py_XDECREF(errorHandler); 1742 Py_XDECREF(exc); 1743 Py_DECREF(unicode); 1744 return NULL; 1745} 1746 1747 1748PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1749 Py_ssize_t size, 1750 int encodeSetO, 1751 int encodeWhiteSpace, 1752 const char *errors) 1753{ 1754 PyObject *v, *result; 1755 /* It might be possible to tighten this worst case */ 1756 Py_ssize_t cbAllocated = 5 * size; 1757 int inShift = 0; 1758 Py_ssize_t i = 0; 1759 unsigned int bitsleft = 0; 1760 unsigned long charsleft = 0; 1761 char * out; 1762 char * start; 1763 1764 if (size == 0) 1765 return PyBytes_FromStringAndSize(NULL, 0); 1766 1767 v = PyByteArray_FromStringAndSize(NULL, cbAllocated); 1768 if (v == NULL) 1769 return NULL; 1770 1771 start = out = PyByteArray_AS_STRING(v); 1772 for (;i < size; ++i) { 1773 Py_UNICODE ch = s[i]; 1774 1775 if (!inShift) { 1776 if (ch == '+') { 1777 *out++ = '+'; 1778 *out++ = '-'; 1779 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1780 charsleft = ch; 1781 bitsleft = 16; 1782 *out++ = '+'; 1783 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1784 inShift = bitsleft > 0; 1785 } else { 1786 *out++ = (char) ch; 1787 } 1788 } else { 1789 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1790 *out++ = B64(charsleft << (6-bitsleft)); 1791 charsleft = 0; 1792 bitsleft = 0; 1793 /* Characters not in the BASE64 set implicitly unshift the sequence 1794 so no 
'-' is required, except if the character is itself a '-' */ 1795 if (B64CHAR(ch) || ch == '-') { 1796 *out++ = '-'; 1797 } 1798 inShift = 0; 1799 *out++ = (char) ch; 1800 } else { 1801 bitsleft += 16; 1802 charsleft = (charsleft << 16) | ch; 1803 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1804 1805 /* If the next character is special then we dont' need to terminate 1806 the shift sequence. If the next character is not a BASE64 character 1807 or '-' then the shift sequence will be terminated implicitly and we 1808 don't have to insert a '-'. */ 1809 1810 if (bitsleft == 0) { 1811 if (i + 1 < size) { 1812 Py_UNICODE ch2 = s[i+1]; 1813 1814 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1815 1816 } else if (B64CHAR(ch2) || ch2 == '-') { 1817 *out++ = '-'; 1818 inShift = 0; 1819 } else { 1820 inShift = 0; 1821 } 1822 1823 } 1824 else { 1825 *out++ = '-'; 1826 inShift = 0; 1827 } 1828 } 1829 } 1830 } 1831 } 1832 if (bitsleft) { 1833 *out++= B64(charsleft << (6-bitsleft) ); 1834 *out++ = '-'; 1835 } 1836 1837 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start); 1838 Py_DECREF(v); 1839 return result; 1840} 1841 1842#undef SPECIAL 1843#undef B64 1844#undef B64CHAR 1845#undef UB64 1846#undef ENCODE 1847#undef DECODE 1848 1849/* --- UTF-8 Codec -------------------------------------------------------- */ 1850 1851static 1852char utf8_code_length[256] = { 1853 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1854 illegal prefix. 
see RFC 2279 for details */ 1855 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1856 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1857 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1858 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1859 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1860 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1861 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1862 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1863 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1864 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1865 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1866 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1867 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1868 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1869 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1870 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1871}; 1872 1873PyObject *PyUnicode_DecodeUTF8(const char *s, 1874 Py_ssize_t size, 1875 const char *errors) 1876{ 1877 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1878} 1879 1880PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1881 Py_ssize_t size, 1882 const char *errors, 1883 Py_ssize_t *consumed) 1884{ 1885 const char *starts = s; 1886 int n; 1887 Py_ssize_t startinpos; 1888 Py_ssize_t endinpos; 1889 Py_ssize_t outpos; 1890 const char *e; 1891 PyUnicodeObject *unicode; 1892 Py_UNICODE *p; 1893 const char *errmsg = ""; 1894 PyObject *errorHandler = NULL; 1895 PyObject *exc = NULL; 1896 1897 /* Note: size will always be longer than the resulting Unicode 1898 character count */ 1899 unicode = _PyUnicode_New(size); 1900 if (!unicode) 1901 return NULL; 1902 if (size == 0) { 1903 if (consumed) 1904 *consumed = 0; 1905 return (PyObject *)unicode; 1906 } 1907 1908 /* Unpack UTF-8 encoded data */ 1909 p = unicode->str; 1910 e = s + size; 1911 1912 while (s < e) { 1913 Py_UCS4 ch = (unsigned char)*s; 1914 1915 if (ch < 0x80) { 1916 *p++ = (Py_UNICODE)ch; 1917 s++; 1918 continue; 1919 } 1920 
1921 n = utf8_code_length[ch]; 1922 1923 if (s + n > e) { 1924 if (consumed) 1925 break; 1926 else { 1927 errmsg = "unexpected end of data"; 1928 startinpos = s-starts; 1929 endinpos = size; 1930 goto utf8Error; 1931 } 1932 } 1933 1934 switch (n) { 1935 1936 case 0: 1937 errmsg = "unexpected code byte"; 1938 startinpos = s-starts; 1939 endinpos = startinpos+1; 1940 goto utf8Error; 1941 1942 case 1: 1943 errmsg = "internal error"; 1944 startinpos = s-starts; 1945 endinpos = startinpos+1; 1946 goto utf8Error; 1947 1948 case 2: 1949 if ((s[1] & 0xc0) != 0x80) { 1950 errmsg = "invalid data"; 1951 startinpos = s-starts; 1952 endinpos = startinpos+2; 1953 goto utf8Error; 1954 } 1955 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1956 if (ch < 0x80) { 1957 startinpos = s-starts; 1958 endinpos = startinpos+2; 1959 errmsg = "illegal encoding"; 1960 goto utf8Error; 1961 } 1962 else 1963 *p++ = (Py_UNICODE)ch; 1964 break; 1965 1966 case 3: 1967 if ((s[1] & 0xc0) != 0x80 || 1968 (s[2] & 0xc0) != 0x80) { 1969 errmsg = "invalid data"; 1970 startinpos = s-starts; 1971 endinpos = startinpos+3; 1972 goto utf8Error; 1973 } 1974 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1975 if (ch < 0x0800) { 1976 /* Note: UTF-8 encodings of surrogates are considered 1977 legal UTF-8 sequences; 1978 1979 XXX For wide builds (UCS-4) we should probably try 1980 to recombine the surrogates into a single code 1981 unit. 
1982 */ 1983 errmsg = "illegal encoding"; 1984 startinpos = s-starts; 1985 endinpos = startinpos+3; 1986 goto utf8Error; 1987 } 1988 else 1989 *p++ = (Py_UNICODE)ch; 1990 break; 1991 1992 case 4: 1993 if ((s[1] & 0xc0) != 0x80 || 1994 (s[2] & 0xc0) != 0x80 || 1995 (s[3] & 0xc0) != 0x80) { 1996 errmsg = "invalid data"; 1997 startinpos = s-starts; 1998 endinpos = startinpos+4; 1999 goto utf8Error; 2000 } 2001 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2002 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2003 /* validate and convert to UTF-16 */ 2004 if ((ch < 0x10000) /* minimum value allowed for 4 2005 byte encoding */ 2006 || (ch > 0x10ffff)) /* maximum value allowed for 2007 UTF-16 */ 2008 { 2009 errmsg = "illegal encoding"; 2010 startinpos = s-starts; 2011 endinpos = startinpos+4; 2012 goto utf8Error; 2013 } 2014#ifdef Py_UNICODE_WIDE 2015 *p++ = (Py_UNICODE)ch; 2016#else 2017 /* compute and append the two surrogates: */ 2018 2019 /* translate from 10000..10FFFF to 0..FFFF */ 2020 ch -= 0x10000; 2021 2022 /* high surrogate = top 10 bits added to D800 */ 2023 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2024 2025 /* low surrogate = bottom 10 bits added to DC00 */ 2026 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2027#endif 2028 break; 2029 2030 default: 2031 /* Other sizes are only needed for UCS-4 */ 2032 errmsg = "unsupported Unicode code range"; 2033 startinpos = s-starts; 2034 endinpos = startinpos+n; 2035 goto utf8Error; 2036 } 2037 s += n; 2038 continue; 2039 2040 utf8Error: 2041 outpos = p-PyUnicode_AS_UNICODE(unicode); 2042 if (unicode_decode_call_errorhandler( 2043 errors, &errorHandler, 2044 "utf8", errmsg, 2045 &starts, &e, &startinpos, &endinpos, &exc, &s, 2046 (PyObject **)&unicode, &outpos, &p)) 2047 goto onError; 2048 } 2049 if (consumed) 2050 *consumed = s-starts; 2051 2052 /* Adjust length */ 2053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2054 goto onError; 2055 2056 Py_XDECREF(errorHandler); 2057 Py_XDECREF(exc); 2058 return (PyObject 
*)unicode; 2059 2060onError: 2061 Py_XDECREF(errorHandler); 2062 Py_XDECREF(exc); 2063 Py_DECREF(unicode); 2064 return NULL; 2065} 2066 2067/* Allocation strategy: if the string is short, convert into a stack buffer 2068 and allocate exactly as much space needed at the end. Else allocate the 2069 maximum possible needed (4 result bytes per Unicode character), and return 2070 the excess memory at the end. 2071*/ 2072PyObject * 2073PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2074 Py_ssize_t size, 2075 const char *errors) 2076{ 2077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2078 2079 Py_ssize_t i; /* index into s of next input byte */ 2080 PyObject *result; /* result string object */ 2081 char *p; /* next free byte in output buffer */ 2082 Py_ssize_t nallocated; /* number of result bytes allocated */ 2083 Py_ssize_t nneeded; /* number of result bytes needed */ 2084 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2085 2086 assert(s != NULL); 2087 assert(size >= 0); 2088 2089 if (size <= MAX_SHORT_UNICHARS) { 2090 /* Write into the stack buffer; nallocated can't overflow. 2091 * At the end, we'll allocate exactly as much heap space as it 2092 * turns out we need. 2093 */ 2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2095 result = NULL; /* will allocate after we're done */ 2096 p = stackbuf; 2097 } 2098 else { 2099 /* Overallocate on the heap, and give the excess back at the end. */ 2100 nallocated = size * 4; 2101 if (nallocated / 4 != size) /* overflow! 
*/ 2102 return PyErr_NoMemory(); 2103 result = PyBytes_FromStringAndSize(NULL, nallocated); 2104 if (result == NULL) 2105 return NULL; 2106 p = PyBytes_AS_STRING(result); 2107 } 2108 2109 for (i = 0; i < size;) { 2110 Py_UCS4 ch = s[i++]; 2111 2112 if (ch < 0x80) 2113 /* Encode ASCII */ 2114 *p++ = (char) ch; 2115 2116 else if (ch < 0x0800) { 2117 /* Encode Latin-1 */ 2118 *p++ = (char)(0xc0 | (ch >> 6)); 2119 *p++ = (char)(0x80 | (ch & 0x3f)); 2120 } 2121 else { 2122 /* Encode UCS2 Unicode ordinals */ 2123 if (ch < 0x10000) { 2124 /* Special case: check for high surrogate */ 2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2126 Py_UCS4 ch2 = s[i]; 2127 /* Check for low surrogate and combine the two to 2128 form a UCS4 value */ 2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2131 i++; 2132 goto encodeUCS4; 2133 } 2134 /* Fall through: handles isolated high surrogates */ 2135 } 2136 *p++ = (char)(0xe0 | (ch >> 12)); 2137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2138 *p++ = (char)(0x80 | (ch & 0x3f)); 2139 continue; 2140 } 2141encodeUCS4: 2142 /* Encode UCS4 Unicode ordinals */ 2143 *p++ = (char)(0xf0 | (ch >> 18)); 2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2146 *p++ = (char)(0x80 | (ch & 0x3f)); 2147 } 2148 } 2149 2150 if (result == NULL) { 2151 /* This was stack allocated. */ 2152 nneeded = p - stackbuf; 2153 assert(nneeded <= nallocated); 2154 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2155 } 2156 else { 2157 /* Cut back to size actually needed. 
*/ 2158 nneeded = p - PyBytes_AS_STRING(result); 2159 assert(nneeded <= nallocated); 2160 _PyBytes_Resize(&result, nneeded); 2161 } 2162 return result; 2163 2164#undef MAX_SHORT_UNICHARS 2165} 2166 2167PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2168{ 2169 if (!PyUnicode_Check(unicode)) { 2170 PyErr_BadArgument(); 2171 return NULL; 2172 } 2173 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2174 PyUnicode_GET_SIZE(unicode), 2175 NULL); 2176} 2177 2178/* --- UTF-32 Codec ------------------------------------------------------- */ 2179 2180PyObject * 2181PyUnicode_DecodeUTF32(const char *s, 2182 Py_ssize_t size, 2183 const char *errors, 2184 int *byteorder) 2185{ 2186 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2187} 2188 2189PyObject * 2190PyUnicode_DecodeUTF32Stateful(const char *s, 2191 Py_ssize_t size, 2192 const char *errors, 2193 int *byteorder, 2194 Py_ssize_t *consumed) 2195{ 2196 const char *starts = s; 2197 Py_ssize_t startinpos; 2198 Py_ssize_t endinpos; 2199 Py_ssize_t outpos; 2200 PyUnicodeObject *unicode; 2201 Py_UNICODE *p; 2202#ifndef Py_UNICODE_WIDE 2203 int i, pairs; 2204#else 2205 const int pairs = 0; 2206#endif 2207 const unsigned char *q, *e; 2208 int bo = 0; /* assume native ordering by default */ 2209 const char *errmsg = ""; 2210 /* Offsets from q for retrieving bytes in the right order. */ 2211#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2212 int iorder[] = {0, 1, 2, 3}; 2213#else 2214 int iorder[] = {3, 2, 1, 0}; 2215#endif 2216 PyObject *errorHandler = NULL; 2217 PyObject *exc = NULL; 2218 /* On narrow builds we split characters outside the BMP into two 2219 codepoints => count how much extra space we need. 
*/ 2220#ifndef Py_UNICODE_WIDE 2221 for (i = pairs = 0; i < size/4; i++) 2222 if (((Py_UCS4 *)s)[i] >= 0x10000) 2223 pairs++; 2224#endif 2225 2226 /* This might be one to much, because of a BOM */ 2227 unicode = _PyUnicode_New((size+3)/4+pairs); 2228 if (!unicode) 2229 return NULL; 2230 if (size == 0) 2231 return (PyObject *)unicode; 2232 2233 /* Unpack UTF-32 encoded data */ 2234 p = unicode->str; 2235 q = (unsigned char *)s; 2236 e = q + size; 2237 2238 if (byteorder) 2239 bo = *byteorder; 2240 2241 /* Check for BOM marks (U+FEFF) in the input and adjust current 2242 byte order setting accordingly. In native mode, the leading BOM 2243 mark is skipped, in all other modes, it is copied to the output 2244 stream as-is (giving a ZWNBSP character). */ 2245 if (bo == 0) { 2246 if (size >= 4) { 2247 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2248 (q[iorder[1]] << 8) | q[iorder[0]]; 2249#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2250 if (bom == 0x0000FEFF) { 2251 q += 4; 2252 bo = -1; 2253 } 2254 else if (bom == 0xFFFE0000) { 2255 q += 4; 2256 bo = 1; 2257 } 2258#else 2259 if (bom == 0x0000FEFF) { 2260 q += 4; 2261 bo = 1; 2262 } 2263 else if (bom == 0xFFFE0000) { 2264 q += 4; 2265 bo = -1; 2266 } 2267#endif 2268 } 2269 } 2270 2271 if (bo == -1) { 2272 /* force LE */ 2273 iorder[0] = 0; 2274 iorder[1] = 1; 2275 iorder[2] = 2; 2276 iorder[3] = 3; 2277 } 2278 else if (bo == 1) { 2279 /* force BE */ 2280 iorder[0] = 3; 2281 iorder[1] = 2; 2282 iorder[2] = 1; 2283 iorder[3] = 0; 2284 } 2285 2286 while (q < e) { 2287 Py_UCS4 ch; 2288 /* remaining bytes at the end? 
(size should be divisible by 4) */ 2289 if (e-q<4) { 2290 if (consumed) 2291 break; 2292 errmsg = "truncated data"; 2293 startinpos = ((const char *)q)-starts; 2294 endinpos = ((const char *)e)-starts; 2295 goto utf32Error; 2296 /* The remaining input chars are ignored if the callback 2297 chooses to skip the input */ 2298 } 2299 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2300 (q[iorder[1]] << 8) | q[iorder[0]]; 2301 2302 if (ch >= 0x110000) 2303 { 2304 errmsg = "codepoint not in range(0x110000)"; 2305 startinpos = ((const char *)q)-starts; 2306 endinpos = startinpos+4; 2307 goto utf32Error; 2308 } 2309#ifndef Py_UNICODE_WIDE 2310 if (ch >= 0x10000) 2311 { 2312 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2313 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2314 } 2315 else 2316#endif 2317 *p++ = ch; 2318 q += 4; 2319 continue; 2320 utf32Error: 2321 outpos = p-PyUnicode_AS_UNICODE(unicode); 2322 if (unicode_decode_call_errorhandler( 2323 errors, &errorHandler, 2324 "utf32", errmsg, 2325 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2326 (PyObject **)&unicode, &outpos, &p)) 2327 goto onError; 2328 } 2329 2330 if (byteorder) 2331 *byteorder = bo; 2332 2333 if (consumed) 2334 *consumed = (const char *)q-starts; 2335 2336 /* Adjust length */ 2337 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2338 goto onError; 2339 2340 Py_XDECREF(errorHandler); 2341 Py_XDECREF(exc); 2342 return (PyObject *)unicode; 2343 2344onError: 2345 Py_DECREF(unicode); 2346 Py_XDECREF(errorHandler); 2347 Py_XDECREF(exc); 2348 return NULL; 2349} 2350 2351PyObject * 2352PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2353 Py_ssize_t size, 2354 const char *errors, 2355 int byteorder) 2356{ 2357 PyObject *v, *result; 2358 unsigned char *p; 2359#ifndef Py_UNICODE_WIDE 2360 int i, pairs; 2361#else 2362 const int pairs = 0; 2363#endif 2364 /* Offsets from p for storing byte pairs in the right order. 
*/ 2365#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2366 int iorder[] = {0, 1, 2, 3}; 2367#else 2368 int iorder[] = {3, 2, 1, 0}; 2369#endif 2370 2371#define STORECHAR(CH) \ 2372 do { \ 2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2376 p[iorder[0]] = (CH) & 0xff; \ 2377 p += 4; \ 2378 } while(0) 2379 2380 /* In narrow builds we can output surrogate pairs as one codepoint, 2381 so we need less space. */ 2382#ifndef Py_UNICODE_WIDE 2383 for (i = pairs = 0; i < size-1; i++) 2384 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2386 pairs++; 2387#endif 2388 v = PyByteArray_FromStringAndSize(NULL, 2389 4 * (size - pairs + (byteorder == 0))); 2390 if (v == NULL) 2391 return NULL; 2392 2393 p = (unsigned char *)PyByteArray_AS_STRING(v); 2394 if (byteorder == 0) 2395 STORECHAR(0xFEFF); 2396 if (size == 0) 2397 goto done; 2398 2399 if (byteorder == -1) { 2400 /* force LE */ 2401 iorder[0] = 0; 2402 iorder[1] = 1; 2403 iorder[2] = 2; 2404 iorder[3] = 3; 2405 } 2406 else if (byteorder == 1) { 2407 /* force BE */ 2408 iorder[0] = 3; 2409 iorder[1] = 2; 2410 iorder[2] = 1; 2411 iorder[3] = 0; 2412 } 2413 2414 while (size-- > 0) { 2415 Py_UCS4 ch = *s++; 2416#ifndef Py_UNICODE_WIDE 2417 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2418 Py_UCS4 ch2 = *s; 2419 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2420 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2421 s++; 2422 size--; 2423 } 2424 } 2425#endif 2426 STORECHAR(ch); 2427 } 2428 2429 done: 2430 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2431 Py_DECREF(v); 2432 return result; 2433#undef STORECHAR 2434} 2435 2436PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2437{ 2438 if (!PyUnicode_Check(unicode)) { 2439 PyErr_BadArgument(); 2440 return NULL; 2441 } 2442 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2443 PyUnicode_GET_SIZE(unicode), 2444 NULL, 2445 0); 2446} 2447 2448/* 
--- UTF-16 Codec ------------------------------------------------------- */ 2449 2450PyObject * 2451PyUnicode_DecodeUTF16(const char *s, 2452 Py_ssize_t size, 2453 const char *errors, 2454 int *byteorder) 2455{ 2456 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2457} 2458 2459PyObject * 2460PyUnicode_DecodeUTF16Stateful(const char *s, 2461 Py_ssize_t size, 2462 const char *errors, 2463 int *byteorder, 2464 Py_ssize_t *consumed) 2465{ 2466 const char *starts = s; 2467 Py_ssize_t startinpos; 2468 Py_ssize_t endinpos; 2469 Py_ssize_t outpos; 2470 PyUnicodeObject *unicode; 2471 Py_UNICODE *p; 2472 const unsigned char *q, *e; 2473 int bo = 0; /* assume native ordering by default */ 2474 const char *errmsg = ""; 2475 /* Offsets from q for retrieving byte pairs in the right order. */ 2476#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2477 int ihi = 1, ilo = 0; 2478#else 2479 int ihi = 0, ilo = 1; 2480#endif 2481 PyObject *errorHandler = NULL; 2482 PyObject *exc = NULL; 2483 2484 /* Note: size will always be longer than the resulting Unicode 2485 character count */ 2486 unicode = _PyUnicode_New(size); 2487 if (!unicode) 2488 return NULL; 2489 if (size == 0) 2490 return (PyObject *)unicode; 2491 2492 /* Unpack UTF-16 encoded data */ 2493 p = unicode->str; 2494 q = (unsigned char *)s; 2495 e = q + size; 2496 2497 if (byteorder) 2498 bo = *byteorder; 2499 2500 /* Check for BOM marks (U+FEFF) in the input and adjust current 2501 byte order setting accordingly. In native mode, the leading BOM 2502 mark is skipped, in all other modes, it is copied to the output 2503 stream as-is (giving a ZWNBSP character). 
*/ 2504 if (bo == 0) { 2505 if (size >= 2) { 2506 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2507#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2508 if (bom == 0xFEFF) { 2509 q += 2; 2510 bo = -1; 2511 } 2512 else if (bom == 0xFFFE) { 2513 q += 2; 2514 bo = 1; 2515 } 2516#else 2517 if (bom == 0xFEFF) { 2518 q += 2; 2519 bo = 1; 2520 } 2521 else if (bom == 0xFFFE) { 2522 q += 2; 2523 bo = -1; 2524 } 2525#endif 2526 } 2527 } 2528 2529 if (bo == -1) { 2530 /* force LE */ 2531 ihi = 1; 2532 ilo = 0; 2533 } 2534 else if (bo == 1) { 2535 /* force BE */ 2536 ihi = 0; 2537 ilo = 1; 2538 } 2539 2540 while (q < e) { 2541 Py_UNICODE ch; 2542 /* remaining bytes at the end? (size should be even) */ 2543 if (e-q<2) { 2544 if (consumed) 2545 break; 2546 errmsg = "truncated data"; 2547 startinpos = ((const char *)q)-starts; 2548 endinpos = ((const char *)e)-starts; 2549 goto utf16Error; 2550 /* The remaining input chars are ignored if the callback 2551 chooses to skip the input */ 2552 } 2553 ch = (q[ihi] << 8) | q[ilo]; 2554 2555 q += 2; 2556 2557 if (ch < 0xD800 || ch > 0xDFFF) { 2558 *p++ = ch; 2559 continue; 2560 } 2561 2562 /* UTF-16 code pair: */ 2563 if (q >= e) { 2564 errmsg = "unexpected end of data"; 2565 startinpos = (((const char *)q)-2)-starts; 2566 endinpos = ((const char *)e)-starts; 2567 goto utf16Error; 2568 } 2569 if (0xD800 <= ch && ch <= 0xDBFF) { 2570 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2571 q += 2; 2572 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2573#ifndef Py_UNICODE_WIDE 2574 *p++ = ch; 2575 *p++ = ch2; 2576#else 2577 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2578#endif 2579 continue; 2580 } 2581 else { 2582 errmsg = "illegal UTF-16 surrogate"; 2583 startinpos = (((const char *)q)-4)-starts; 2584 endinpos = startinpos+2; 2585 goto utf16Error; 2586 } 2587 2588 } 2589 errmsg = "illegal encoding"; 2590 startinpos = (((const char *)q)-2)-starts; 2591 endinpos = startinpos+2; 2592 /* Fall through to report the error */ 2593 2594 utf16Error: 2595 outpos = 
p-PyUnicode_AS_UNICODE(unicode); 2596 if (unicode_decode_call_errorhandler( 2597 errors, &errorHandler, 2598 "utf16", errmsg, 2599 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2600 (PyObject **)&unicode, &outpos, &p)) 2601 goto onError; 2602 } 2603 2604 if (byteorder) 2605 *byteorder = bo; 2606 2607 if (consumed) 2608 *consumed = (const char *)q-starts; 2609 2610 /* Adjust length */ 2611 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2612 goto onError; 2613 2614 Py_XDECREF(errorHandler); 2615 Py_XDECREF(exc); 2616 return (PyObject *)unicode; 2617 2618onError: 2619 Py_DECREF(unicode); 2620 Py_XDECREF(errorHandler); 2621 Py_XDECREF(exc); 2622 return NULL; 2623} 2624 2625PyObject * 2626PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2627 Py_ssize_t size, 2628 const char *errors, 2629 int byteorder) 2630{ 2631 PyObject *v, *result; 2632 unsigned char *p; 2633#ifdef Py_UNICODE_WIDE 2634 int i, pairs; 2635#else 2636 const int pairs = 0; 2637#endif 2638 /* Offsets from p for storing byte pairs in the right order. 
*/ 2639#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2640 int ihi = 1, ilo = 0; 2641#else 2642 int ihi = 0, ilo = 1; 2643#endif 2644 2645#define STORECHAR(CH) \ 2646 do { \ 2647 p[ihi] = ((CH) >> 8) & 0xff; \ 2648 p[ilo] = (CH) & 0xff; \ 2649 p += 2; \ 2650 } while(0) 2651 2652#ifdef Py_UNICODE_WIDE 2653 for (i = pairs = 0; i < size; i++) 2654 if (s[i] >= 0x10000) 2655 pairs++; 2656#endif 2657 v = PyByteArray_FromStringAndSize(NULL, 2658 2 * (size + pairs + (byteorder == 0))); 2659 if (v == NULL) 2660 return NULL; 2661 2662 p = (unsigned char *)PyByteArray_AS_STRING(v); 2663 if (byteorder == 0) 2664 STORECHAR(0xFEFF); 2665 if (size == 0) 2666 goto done; 2667 2668 if (byteorder == -1) { 2669 /* force LE */ 2670 ihi = 1; 2671 ilo = 0; 2672 } 2673 else if (byteorder == 1) { 2674 /* force BE */ 2675 ihi = 0; 2676 ilo = 1; 2677 } 2678 2679 while (size-- > 0) { 2680 Py_UNICODE ch = *s++; 2681 Py_UNICODE ch2 = 0; 2682#ifdef Py_UNICODE_WIDE 2683 if (ch >= 0x10000) { 2684 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2685 ch = 0xD800 | ((ch-0x10000) >> 10); 2686 } 2687#endif 2688 STORECHAR(ch); 2689 if (ch2) 2690 STORECHAR(ch2); 2691 } 2692 2693 done: 2694 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2695 Py_DECREF(v); 2696 return result; 2697#undef STORECHAR 2698} 2699 2700PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2701{ 2702 if (!PyUnicode_Check(unicode)) { 2703 PyErr_BadArgument(); 2704 return NULL; 2705 } 2706 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2707 PyUnicode_GET_SIZE(unicode), 2708 NULL, 2709 0); 2710} 2711 2712/* --- Unicode Escape Codec ----------------------------------------------- */ 2713 2714static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2715 2716PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2717 Py_ssize_t size, 2718 const char *errors) 2719{ 2720 const char *starts = s; 2721 Py_ssize_t startinpos; 2722 Py_ssize_t endinpos; 2723 Py_ssize_t outpos; 2724 int i; 2725 PyUnicodeObject *v; 2726 Py_UNICODE *p; 
2727 const char *end; 2728 char* message; 2729 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2730 PyObject *errorHandler = NULL; 2731 PyObject *exc = NULL; 2732 2733 /* Escaped strings will always be longer than the resulting 2734 Unicode string, so we start with size here and then reduce the 2735 length after conversion to the true value. 2736 (but if the error callback returns a long replacement string 2737 we'll have to allocate more space) */ 2738 v = _PyUnicode_New(size); 2739 if (v == NULL) 2740 goto onError; 2741 if (size == 0) 2742 return (PyObject *)v; 2743 2744 p = PyUnicode_AS_UNICODE(v); 2745 end = s + size; 2746 2747 while (s < end) { 2748 unsigned char c; 2749 Py_UNICODE x; 2750 int digits; 2751 2752 /* Non-escape characters are interpreted as Unicode ordinals */ 2753 if (*s != '\\') { 2754 *p++ = (unsigned char) *s++; 2755 continue; 2756 } 2757 2758 startinpos = s-starts; 2759 /* \ - Escapes */ 2760 s++; 2761 c = *s++; 2762 if (s > end) 2763 c = '\0'; /* Invalid after \ */ 2764 switch (c) { 2765 2766 /* \x escapes */ 2767 case '\n': break; 2768 case '\\': *p++ = '\\'; break; 2769 case '\'': *p++ = '\''; break; 2770 case '\"': *p++ = '\"'; break; 2771 case 'b': *p++ = '\b'; break; 2772 case 'f': *p++ = '\014'; break; /* FF */ 2773 case 't': *p++ = '\t'; break; 2774 case 'n': *p++ = '\n'; break; 2775 case 'r': *p++ = '\r'; break; 2776 case 'v': *p++ = '\013'; break; /* VT */ 2777 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2778 2779 /* \OOO (octal) escapes */ 2780 case '0': case '1': case '2': case '3': 2781 case '4': case '5': case '6': case '7': 2782 x = s[-1] - '0'; 2783 if (s < end && '0' <= *s && *s <= '7') { 2784 x = (x<<3) + *s++ - '0'; 2785 if (s < end && '0' <= *s && *s <= '7') 2786 x = (x<<3) + *s++ - '0'; 2787 } 2788 *p++ = x; 2789 break; 2790 2791 /* hex escapes */ 2792 /* \xXX */ 2793 case 'x': 2794 digits = 2; 2795 message = "truncated \\xXX escape"; 2796 goto hexescape; 2797 2798 /* \uXXXX */ 2799 case 'u': 
2800 digits = 4; 2801 message = "truncated \\uXXXX escape"; 2802 goto hexescape; 2803 2804 /* \UXXXXXXXX */ 2805 case 'U': 2806 digits = 8; 2807 message = "truncated \\UXXXXXXXX escape"; 2808 hexescape: 2809 chr = 0; 2810 outpos = p-PyUnicode_AS_UNICODE(v); 2811 if (s+digits>end) { 2812 endinpos = size; 2813 if (unicode_decode_call_errorhandler( 2814 errors, &errorHandler, 2815 "unicodeescape", "end of string in escape sequence", 2816 &starts, &end, &startinpos, &endinpos, &exc, &s, 2817 (PyObject **)&v, &outpos, &p)) 2818 goto onError; 2819 goto nextByte; 2820 } 2821 for (i = 0; i < digits; ++i) { 2822 c = (unsigned char) s[i]; 2823 if (!ISXDIGIT(c)) { 2824 endinpos = (s+i+1)-starts; 2825 if (unicode_decode_call_errorhandler( 2826 errors, &errorHandler, 2827 "unicodeescape", message, 2828 &starts, &end, &startinpos, &endinpos, &exc, &s, 2829 (PyObject **)&v, &outpos, &p)) 2830 goto onError; 2831 goto nextByte; 2832 } 2833 chr = (chr<<4) & ~0xF; 2834 if (c >= '0' && c <= '9') 2835 chr += c - '0'; 2836 else if (c >= 'a' && c <= 'f') 2837 chr += 10 + c - 'a'; 2838 else 2839 chr += 10 + c - 'A'; 2840 } 2841 s += i; 2842 if (chr == 0xffffffff && PyErr_Occurred()) 2843 /* _decoding_error will have already written into the 2844 target buffer. */ 2845 break; 2846 store: 2847 /* when we get here, chr is a 32-bit unicode character */ 2848 if (chr <= 0xffff) 2849 /* UCS-2 character */ 2850 *p++ = (Py_UNICODE) chr; 2851 else if (chr <= 0x10ffff) { 2852 /* UCS-4 character. Either store directly, or as 2853 surrogate pair. 
*/ 2854#ifdef Py_UNICODE_WIDE 2855 *p++ = chr; 2856#else 2857 chr -= 0x10000L; 2858 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2859 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2860#endif 2861 } else { 2862 endinpos = s-starts; 2863 outpos = p-PyUnicode_AS_UNICODE(v); 2864 if (unicode_decode_call_errorhandler( 2865 errors, &errorHandler, 2866 "unicodeescape", "illegal Unicode character", 2867 &starts, &end, &startinpos, &endinpos, &exc, &s, 2868 (PyObject **)&v, &outpos, &p)) 2869 goto onError; 2870 } 2871 break; 2872 2873 /* \N{name} */ 2874 case 'N': 2875 message = "malformed \\N character escape"; 2876 if (ucnhash_CAPI == NULL) { 2877 /* load the unicode data module */ 2878 PyObject *m, *api; 2879 m = PyImport_ImportModuleNoBlock("unicodedata"); 2880 if (m == NULL) 2881 goto ucnhashError; 2882 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2883 Py_DECREF(m); 2884 if (api == NULL) 2885 goto ucnhashError; 2886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2887 Py_DECREF(api); 2888 if (ucnhash_CAPI == NULL) 2889 goto ucnhashError; 2890 } 2891 if (*s == '{') { 2892 const char *start = s+1; 2893 /* look for the closing brace */ 2894 while (*s != '}' && s < end) 2895 s++; 2896 if (s > start && s < end && *s == '}') { 2897 /* found a name. 
look it up in the unicode database */ 2898 message = "unknown Unicode character name"; 2899 s++; 2900 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2901 goto store; 2902 } 2903 } 2904 endinpos = s-starts; 2905 outpos = p-PyUnicode_AS_UNICODE(v); 2906 if (unicode_decode_call_errorhandler( 2907 errors, &errorHandler, 2908 "unicodeescape", message, 2909 &starts, &end, &startinpos, &endinpos, &exc, &s, 2910 (PyObject **)&v, &outpos, &p)) 2911 goto onError; 2912 break; 2913 2914 default: 2915 if (s > end) { 2916 message = "\\ at end of string"; 2917 s--; 2918 endinpos = s-starts; 2919 outpos = p-PyUnicode_AS_UNICODE(v); 2920 if (unicode_decode_call_errorhandler( 2921 errors, &errorHandler, 2922 "unicodeescape", message, 2923 &starts, &end, &startinpos, &endinpos, &exc, &s, 2924 (PyObject **)&v, &outpos, &p)) 2925 goto onError; 2926 } 2927 else { 2928 *p++ = '\\'; 2929 *p++ = (unsigned char)s[-1]; 2930 } 2931 break; 2932 } 2933 nextByte: 2934 ; 2935 } 2936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2937 goto onError; 2938 Py_XDECREF(errorHandler); 2939 Py_XDECREF(exc); 2940 return (PyObject *)v; 2941 2942ucnhashError: 2943 PyErr_SetString( 2944 PyExc_UnicodeError, 2945 "\\N escapes not supported (can't load unicodedata module)" 2946 ); 2947 Py_XDECREF(v); 2948 Py_XDECREF(errorHandler); 2949 Py_XDECREF(exc); 2950 return NULL; 2951 2952onError: 2953 Py_XDECREF(v); 2954 Py_XDECREF(errorHandler); 2955 Py_XDECREF(exc); 2956 return NULL; 2957} 2958 2959/* Return a Unicode-Escape string version of the Unicode object. 2960 2961 If quotes is true, the string is enclosed in u"" or u'' quotes as 2962 appropriate. 
2963 2964*/ 2965 2966Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2967 Py_ssize_t size, 2968 Py_UNICODE ch) 2969{ 2970 /* like wcschr, but doesn't stop at NULL characters */ 2971 2972 while (size-- > 0) { 2973 if (*s == ch) 2974 return s; 2975 s++; 2976 } 2977 2978 return NULL; 2979} 2980 2981static const char *hexdigits = "0123456789abcdef"; 2982 2983PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2984 Py_ssize_t size) 2985{ 2986 PyObject *repr, *result; 2987 char *p; 2988 2989 /* XXX(nnorwitz): rather than over-allocating, it would be 2990 better to choose a different scheme. Perhaps scan the 2991 first N-chars of the string and allocate based on that size. 2992 */ 2993 /* Initial allocation is based on the longest-possible unichr 2994 escape. 2995 2996 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 2997 unichr, so in this case it's the longest unichr escape. In 2998 narrow (UTF-16) builds this is five chars per source unichr 2999 since there are two unichrs in the surrogate pair, so in narrow 3000 (UTF-16) builds it's not the longest unichr escape. 3001 3002 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3003 so in the narrow (UTF-16) build case it's the longest unichr 3004 escape. 
3005 */ 3006 3007 repr = PyByteArray_FromStringAndSize(NULL, 3008#ifdef Py_UNICODE_WIDE 3009 + 10*size 3010#else 3011 + 6*size 3012#endif 3013 + 1); 3014 if (repr == NULL) 3015 return NULL; 3016 3017 p = PyByteArray_AS_STRING(repr); 3018 3019 while (size-- > 0) { 3020 Py_UNICODE ch = *s++; 3021 3022 /* Escape backslashes */ 3023 if (ch == '\\') { 3024 *p++ = '\\'; 3025 *p++ = (char) ch; 3026 continue; 3027 } 3028 3029#ifdef Py_UNICODE_WIDE 3030 /* Map 21-bit characters to '\U00xxxxxx' */ 3031 else if (ch >= 0x10000) { 3032 *p++ = '\\'; 3033 *p++ = 'U'; 3034 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3035 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3036 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3037 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3038 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3039 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3040 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3041 *p++ = hexdigits[ch & 0x0000000F]; 3042 continue; 3043 } 3044#else 3045 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3046 else if (ch >= 0xD800 && ch < 0xDC00) { 3047 Py_UNICODE ch2; 3048 Py_UCS4 ucs; 3049 3050 ch2 = *s++; 3051 size--; 3052 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3053 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3054 *p++ = '\\'; 3055 *p++ = 'U'; 3056 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3057 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3058 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3059 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3060 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3061 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3062 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3063 *p++ = hexdigits[ucs & 0x0000000F]; 3064 continue; 3065 } 3066 /* Fall through: isolated surrogates are copied as-is */ 3067 s--; 3068 size++; 3069 } 3070#endif 3071 3072 /* Map 16-bit characters to '\uxxxx' */ 3073 if (ch >= 256) { 3074 *p++ = '\\'; 3075 *p++ = 'u'; 3076 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3077 *p++ = hexdigits[(ch >> 8) & 0x000F]; 3078 *p++ = 
hexdigits[(ch >> 4) & 0x000F]; 3079 *p++ = hexdigits[ch & 0x000F]; 3080 } 3081 3082 /* Map special whitespace to '\t', \n', '\r' */ 3083 else if (ch == '\t') { 3084 *p++ = '\\'; 3085 *p++ = 't'; 3086 } 3087 else if (ch == '\n') { 3088 *p++ = '\\'; 3089 *p++ = 'n'; 3090 } 3091 else if (ch == '\r') { 3092 *p++ = '\\'; 3093 *p++ = 'r'; 3094 } 3095 3096 /* Map non-printable US ASCII to '\xhh' */ 3097 else if (ch < ' ' || ch >= 0x7F) { 3098 *p++ = '\\'; 3099 *p++ = 'x'; 3100 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3101 *p++ = hexdigits[ch & 0x000F]; 3102 } 3103 3104 /* Copy everything else as-is */ 3105 else 3106 *p++ = (char) ch; 3107 } 3108 3109 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), 3110 p - PyByteArray_AS_STRING(repr)); 3111 Py_DECREF(repr); 3112 return result; 3113} 3114 3115PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3116{ 3117 PyObject *s, *result; 3118 if (!PyUnicode_Check(unicode)) { 3119 PyErr_BadArgument(); 3120 return NULL; 3121 } 3122 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3123 PyUnicode_GET_SIZE(unicode)); 3124 3125 if (!s) 3126 return NULL; 3127 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3128 PyByteArray_GET_SIZE(s)); 3129 Py_DECREF(s); 3130 return result; 3131} 3132 3133/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3134 3135PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3136 Py_ssize_t size, 3137 const char *errors) 3138{ 3139 const char *starts = s; 3140 Py_ssize_t startinpos; 3141 Py_ssize_t endinpos; 3142 Py_ssize_t outpos; 3143 PyUnicodeObject *v; 3144 Py_UNICODE *p; 3145 const char *end; 3146 const char *bs; 3147 PyObject *errorHandler = NULL; 3148 PyObject *exc = NULL; 3149 3150 /* Escaped strings will always be longer than the resulting 3151 Unicode string, so we start with size here and then reduce the 3152 length after conversion to the true value. 
(But decoding error 3153 handler might have to resize the string) */ 3154 v = _PyUnicode_New(size); 3155 if (v == NULL) 3156 goto onError; 3157 if (size == 0) 3158 return (PyObject *)v; 3159 p = PyUnicode_AS_UNICODE(v); 3160 end = s + size; 3161 while (s < end) { 3162 unsigned char c; 3163 Py_UCS4 x; 3164 int i; 3165 int count; 3166 3167 /* Non-escape characters are interpreted as Unicode ordinals */ 3168 if (*s != '\\') { 3169 *p++ = (unsigned char)*s++; 3170 continue; 3171 } 3172 startinpos = s-starts; 3173 3174 /* \u-escapes are only interpreted iff the number of leading 3175 backslashes if odd */ 3176 bs = s; 3177 for (;s < end;) { 3178 if (*s != '\\') 3179 break; 3180 *p++ = (unsigned char)*s++; 3181 } 3182 if (((s - bs) & 1) == 0 || 3183 s >= end || 3184 (*s != 'u' && *s != 'U')) { 3185 continue; 3186 } 3187 p--; 3188 count = *s=='u' ? 4 : 8; 3189 s++; 3190 3191 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3192 outpos = p-PyUnicode_AS_UNICODE(v); 3193 for (x = 0, i = 0; i < count; ++i, ++s) { 3194 c = (unsigned char)*s; 3195 if (!ISXDIGIT(c)) { 3196 endinpos = s-starts; 3197 if (unicode_decode_call_errorhandler( 3198 errors, &errorHandler, 3199 "rawunicodeescape", "truncated \\uXXXX", 3200 &starts, &end, &startinpos, &endinpos, &exc, &s, 3201 (PyObject **)&v, &outpos, &p)) 3202 goto onError; 3203 goto nextByte; 3204 } 3205 x = (x<<4) & ~0xF; 3206 if (c >= '0' && c <= '9') 3207 x += c - '0'; 3208 else if (c >= 'a' && c <= 'f') 3209 x += 10 + c - 'a'; 3210 else 3211 x += 10 + c - 'A'; 3212 } 3213 if (x <= 0xffff) 3214 /* UCS-2 character */ 3215 *p++ = (Py_UNICODE) x; 3216 else if (x <= 0x10ffff) { 3217 /* UCS-4 character. Either store directly, or as 3218 surrogate pair. 
*/ 3219#ifdef Py_UNICODE_WIDE 3220 *p++ = (Py_UNICODE) x; 3221#else 3222 x -= 0x10000L; 3223 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3224 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3225#endif 3226 } else { 3227 endinpos = s-starts; 3228 outpos = p-PyUnicode_AS_UNICODE(v); 3229 if (unicode_decode_call_errorhandler( 3230 errors, &errorHandler, 3231 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3232 &starts, &end, &startinpos, &endinpos, &exc, &s, 3233 (PyObject **)&v, &outpos, &p)) 3234 goto onError; 3235 } 3236 nextByte: 3237 ; 3238 } 3239 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3240 goto onError; 3241 Py_XDECREF(errorHandler); 3242 Py_XDECREF(exc); 3243 return (PyObject *)v; 3244 3245 onError: 3246 Py_XDECREF(v); 3247 Py_XDECREF(errorHandler); 3248 Py_XDECREF(exc); 3249 return NULL; 3250} 3251 3252PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3253 Py_ssize_t size) 3254{ 3255 PyObject *repr, *result; 3256 char *p; 3257 char *q; 3258 3259#ifdef Py_UNICODE_WIDE 3260 repr = PyByteArray_FromStringAndSize(NULL, 10 * size); 3261#else 3262 repr = PyByteArray_FromStringAndSize(NULL, 6 * size); 3263#endif 3264 if (repr == NULL) 3265 return NULL; 3266 if (size == 0) 3267 goto done; 3268 3269 p = q = PyByteArray_AS_STRING(repr); 3270 while (size-- > 0) { 3271 Py_UNICODE ch = *s++; 3272#ifdef Py_UNICODE_WIDE 3273 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3274 if (ch >= 0x10000) { 3275 *p++ = '\\'; 3276 *p++ = 'U'; 3277 *p++ = hexdigits[(ch >> 28) & 0xf]; 3278 *p++ = hexdigits[(ch >> 24) & 0xf]; 3279 *p++ = hexdigits[(ch >> 20) & 0xf]; 3280 *p++ = hexdigits[(ch >> 16) & 0xf]; 3281 *p++ = hexdigits[(ch >> 12) & 0xf]; 3282 *p++ = hexdigits[(ch >> 8) & 0xf]; 3283 *p++ = hexdigits[(ch >> 4) & 0xf]; 3284 *p++ = hexdigits[ch & 15]; 3285 } 3286 else 3287#else 3288 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3289 if (ch >= 0xD800 && ch < 0xDC00) { 3290 Py_UNICODE ch2; 3291 Py_UCS4 ucs; 3292 3293 ch2 = *s++; 3294 size--; 3295 if (ch2 >= 
0xDC00 && ch2 <= 0xDFFF) { 3296 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3297 *p++ = '\\'; 3298 *p++ = 'U'; 3299 *p++ = hexdigits[(ucs >> 28) & 0xf]; 3300 *p++ = hexdigits[(ucs >> 24) & 0xf]; 3301 *p++ = hexdigits[(ucs >> 20) & 0xf]; 3302 *p++ = hexdigits[(ucs >> 16) & 0xf]; 3303 *p++ = hexdigits[(ucs >> 12) & 0xf]; 3304 *p++ = hexdigits[(ucs >> 8) & 0xf]; 3305 *p++ = hexdigits[(ucs >> 4) & 0xf]; 3306 *p++ = hexdigits[ucs & 0xf]; 3307 continue; 3308 } 3309 /* Fall through: isolated surrogates are copied as-is */ 3310 s--; 3311 size++; 3312 } 3313#endif 3314 /* Map 16-bit characters to '\uxxxx' */ 3315 if (ch >= 256) { 3316 *p++ = '\\'; 3317 *p++ = 'u'; 3318 *p++ = hexdigits[(ch >> 12) & 0xf]; 3319 *p++ = hexdigits[(ch >> 8) & 0xf]; 3320 *p++ = hexdigits[(ch >> 4) & 0xf]; 3321 *p++ = hexdigits[ch & 15]; 3322 } 3323 /* Copy everything else as-is */ 3324 else 3325 *p++ = (char) ch; 3326 } 3327 size = p - q; 3328 3329 done: 3330 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); 3331 Py_DECREF(repr); 3332 return result; 3333} 3334 3335PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3336{ 3337 PyObject *s, *result; 3338 if (!PyUnicode_Check(unicode)) { 3339 PyErr_BadArgument(); 3340 return NULL; 3341 } 3342 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3343 PyUnicode_GET_SIZE(unicode)); 3344 3345 if (!s) 3346 return NULL; 3347 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3348 PyByteArray_GET_SIZE(s)); 3349 Py_DECREF(s); 3350 return result; 3351} 3352 3353/* --- Unicode Internal Codec ------------------------------------------- */ 3354 3355PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3356 Py_ssize_t size, 3357 const char *errors) 3358{ 3359 const char *starts = s; 3360 Py_ssize_t startinpos; 3361 Py_ssize_t endinpos; 3362 Py_ssize_t outpos; 3363 PyUnicodeObject *v; 3364 Py_UNICODE *p; 3365 const char *end; 3366 const char *reason; 3367 PyObject *errorHandler = NULL; 
3368 PyObject *exc = NULL; 3369 3370#ifdef Py_UNICODE_WIDE 3371 Py_UNICODE unimax = PyUnicode_GetMax(); 3372#endif 3373 3374 /* XXX overflow detection missing */ 3375 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3376 if (v == NULL) 3377 goto onError; 3378 if (PyUnicode_GetSize((PyObject *)v) == 0) 3379 return (PyObject *)v; 3380 p = PyUnicode_AS_UNICODE(v); 3381 end = s + size; 3382 3383 while (s < end) { 3384 memcpy(p, s, sizeof(Py_UNICODE)); 3385 /* We have to sanity check the raw data, otherwise doom looms for 3386 some malformed UCS-4 data. */ 3387 if ( 3388 #ifdef Py_UNICODE_WIDE 3389 *p > unimax || *p < 0 || 3390 #endif 3391 end-s < Py_UNICODE_SIZE 3392 ) 3393 { 3394 startinpos = s - starts; 3395 if (end-s < Py_UNICODE_SIZE) { 3396 endinpos = end-starts; 3397 reason = "truncated input"; 3398 } 3399 else { 3400 endinpos = s - starts + Py_UNICODE_SIZE; 3401 reason = "illegal code point (> 0x10FFFF)"; 3402 } 3403 outpos = p - PyUnicode_AS_UNICODE(v); 3404 if (unicode_decode_call_errorhandler( 3405 errors, &errorHandler, 3406 "unicode_internal", reason, 3407 &starts, &end, &startinpos, &endinpos, &exc, &s, 3408 (PyObject **)&v, &outpos, &p)) { 3409 goto onError; 3410 } 3411 } 3412 else { 3413 p++; 3414 s += Py_UNICODE_SIZE; 3415 } 3416 } 3417 3418 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3419 goto onError; 3420 Py_XDECREF(errorHandler); 3421 Py_XDECREF(exc); 3422 return (PyObject *)v; 3423 3424 onError: 3425 Py_XDECREF(v); 3426 Py_XDECREF(errorHandler); 3427 Py_XDECREF(exc); 3428 return NULL; 3429} 3430 3431/* --- Latin-1 Codec ------------------------------------------------------ */ 3432 3433PyObject *PyUnicode_DecodeLatin1(const char *s, 3434 Py_ssize_t size, 3435 const char *errors) 3436{ 3437 PyUnicodeObject *v; 3438 Py_UNICODE *p; 3439 3440 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 3441 if (size == 1) { 3442 Py_UNICODE r = *(unsigned char*)s; 3443 return PyUnicode_FromUnicode(&r, 1); 3444 } 3445 3446 v = _PyUnicode_New(size); 3447 if (v == NULL) 3448 goto onError; 3449 if (size == 0) 3450 return (PyObject *)v; 3451 p = PyUnicode_AS_UNICODE(v); 3452 while (size-- > 0) 3453 *p++ = (unsigned char)*s++; 3454 return (PyObject *)v; 3455 3456 onError: 3457 Py_XDECREF(v); 3458 return NULL; 3459} 3460 3461/* create or adjust a UnicodeEncodeError */ 3462static void make_encode_exception(PyObject **exceptionObject, 3463 const char *encoding, 3464 const Py_UNICODE *unicode, Py_ssize_t size, 3465 Py_ssize_t startpos, Py_ssize_t endpos, 3466 const char *reason) 3467{ 3468 if (*exceptionObject == NULL) { 3469 *exceptionObject = PyUnicodeEncodeError_Create( 3470 encoding, unicode, size, startpos, endpos, reason); 3471 } 3472 else { 3473 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3474 goto onError; 3475 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3476 goto onError; 3477 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3478 goto onError; 3479 return; 3480 onError: 3481 Py_DECREF(*exceptionObject); 3482 *exceptionObject = NULL; 3483 } 3484} 3485 3486/* raises a UnicodeEncodeError */ 3487static void raise_encode_exception(PyObject **exceptionObject, 3488 const char *encoding, 3489 const Py_UNICODE *unicode, Py_ssize_t size, 3490 Py_ssize_t startpos, Py_ssize_t endpos, 3491 const char *reason) 3492{ 3493 make_encode_exception(exceptionObject, 3494 encoding, unicode, size, startpos, endpos, reason); 3495 if (*exceptionObject != NULL) 3496 PyCodec_StrictErrors(*exceptionObject); 3497} 3498 3499/* error handling callback helper: 3500 build arguments, call the callback and check the arguments, 3501 put the result into newpos and return the replacement string, which 3502 has to be freed by the caller */ 3503static PyObject *unicode_encode_call_errorhandler(const char *errors, 3504 PyObject **errorHandler, 3505 const 
char *encoding, const char *reason, 3506 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3507 Py_ssize_t startpos, Py_ssize_t endpos, 3508 Py_ssize_t *newpos) 3509{ 3510 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3511 3512 PyObject *restuple; 3513 PyObject *resunicode; 3514 3515 if (*errorHandler == NULL) { 3516 *errorHandler = PyCodec_LookupError(errors); 3517 if (*errorHandler == NULL) 3518 return NULL; 3519 } 3520 3521 make_encode_exception(exceptionObject, 3522 encoding, unicode, size, startpos, endpos, reason); 3523 if (*exceptionObject == NULL) 3524 return NULL; 3525 3526 restuple = PyObject_CallFunctionObjArgs( 3527 *errorHandler, *exceptionObject, NULL); 3528 if (restuple == NULL) 3529 return NULL; 3530 if (!PyTuple_Check(restuple)) { 3531 PyErr_Format(PyExc_TypeError, &argparse[4]); 3532 Py_DECREF(restuple); 3533 return NULL; 3534 } 3535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3536 &resunicode, newpos)) { 3537 Py_DECREF(restuple); 3538 return NULL; 3539 } 3540 if (*newpos<0) 3541 *newpos = size+*newpos; 3542 if (*newpos<0 || *newpos>size) { 3543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3544 Py_DECREF(restuple); 3545 return NULL; 3546 } 3547 Py_INCREF(resunicode); 3548 Py_DECREF(restuple); 3549 return resunicode; 3550} 3551 3552static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3553 Py_ssize_t size, 3554 const char *errors, 3555 int limit) 3556{ 3557 /* output object */ 3558 PyObject *res; 3559 /* pointers to the beginning and end+1 of input */ 3560 const Py_UNICODE *startp = p; 3561 const Py_UNICODE *endp = p + size; 3562 /* pointer to the beginning of the unencodable characters */ 3563 /* const Py_UNICODE *badp = NULL; */ 3564 /* pointer into the output */ 3565 char *str; 3566 /* current output position */ 3567 Py_ssize_t ressize; 3568 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 3569 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3570 PyObject *errorHandler = NULL; 3571 PyObject *exc = NULL; 3572 PyObject *result = NULL; 3573 /* the following variable is used for caching string comparisons 3574 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3575 int known_errorHandler = -1; 3576 3577 /* allocate enough for a simple encoding without 3578 replacements, if we need more, we'll resize */ 3579 if (size == 0) 3580 return PyBytes_FromStringAndSize(NULL, 0); 3581 res = PyByteArray_FromStringAndSize(NULL, size); 3582 if (res == NULL) 3583 return NULL; 3584 str = PyByteArray_AS_STRING(res); 3585 ressize = size; 3586 3587 while (p<endp) { 3588 Py_UNICODE c = *p; 3589 3590 /* can we encode this? */ 3591 if (c<limit) { 3592 /* no overflow check, because we know that the space is enough */ 3593 *str++ = (char)c; 3594 ++p; 3595 } 3596 else { 3597 Py_ssize_t unicodepos = p-startp; 3598 Py_ssize_t requiredsize; 3599 PyObject *repunicode; 3600 Py_ssize_t repsize; 3601 Py_ssize_t newpos; 3602 Py_ssize_t respos; 3603 Py_UNICODE *uni2; 3604 /* startpos for collecting unencodable chars */ 3605 const Py_UNICODE *collstart = p; 3606 const Py_UNICODE *collend = p; 3607 /* find all unecodable characters */ 3608 while ((collend < endp) && ((*collend)>=limit)) 3609 ++collend; 3610 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3611 if (known_errorHandler==-1) { 3612 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3613 known_errorHandler = 1; 3614 else if (!strcmp(errors, "replace")) 3615 known_errorHandler = 2; 3616 else if (!strcmp(errors, "ignore")) 3617 known_errorHandler = 3; 3618 else if (!strcmp(errors, "xmlcharrefreplace")) 3619 known_errorHandler = 4; 3620 else 3621 known_errorHandler = 0; 3622 } 3623 switch (known_errorHandler) { 3624 case 1: /* strict */ 3625 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3626 goto onError; 3627 case 2: /* replace */ 3628 while (collstart++<collend) 3629 *str++ = '?'; /* fall through */ 3630 case 3: /* ignore */ 3631 p = collend; 3632 break; 3633 case 4: /* xmlcharrefreplace */ 3634 respos = str - PyByteArray_AS_STRING(res); 3635 /* determine replacement size (temporarily (mis)uses p) */ 3636 for (p = collstart, repsize = 0; p < collend; ++p) { 3637 if (*p<10) 3638 repsize += 2+1+1; 3639 else if (*p<100) 3640 repsize += 2+2+1; 3641 else if (*p<1000) 3642 repsize += 2+3+1; 3643 else if (*p<10000) 3644 repsize += 2+4+1; 3645#ifndef Py_UNICODE_WIDE 3646 else 3647 repsize += 2+5+1; 3648#else 3649 else if (*p<100000) 3650 repsize += 2+5+1; 3651 else if (*p<1000000) 3652 repsize += 2+6+1; 3653 else 3654 repsize += 2+7+1; 3655#endif 3656 } 3657 requiredsize = respos+repsize+(endp-collend); 3658 if (requiredsize > ressize) { 3659 if (requiredsize<2*ressize) 3660 requiredsize = 2*ressize; 3661 if (PyByteArray_Resize(res, requiredsize)) 3662 goto onError; 3663 str = PyByteArray_AS_STRING(res) + respos; 3664 ressize = requiredsize; 3665 } 3666 /* generate replacement (temporarily (mis)uses p) */ 3667 for (p = collstart; p < collend; ++p) { 3668 str += sprintf(str, "&#%d;", (int)*p); 3669 } 3670 p = collend; 3671 break; 3672 default: 3673 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3674 encoding, reason, startp, size, &exc, 3675 collstart-startp, 
collend-startp, &newpos); 3676 if (repunicode == NULL) 3677 goto onError; 3678 /* need more space? (at least enough for what we 3679 have+the replacement+the rest of the string, so 3680 we won't have to check space for encodable characters) */ 3681 respos = str - PyByteArray_AS_STRING(res); 3682 repsize = PyUnicode_GET_SIZE(repunicode); 3683 requiredsize = respos+repsize+(endp-collend); 3684 if (requiredsize > ressize) { 3685 if (requiredsize<2*ressize) 3686 requiredsize = 2*ressize; 3687 if (PyByteArray_Resize(res, requiredsize)) { 3688 Py_DECREF(repunicode); 3689 goto onError; 3690 } 3691 str = PyByteArray_AS_STRING(res) + respos; 3692 ressize = requiredsize; 3693 } 3694 /* check if there is anything unencodable in the replacement 3695 and copy it to the output */ 3696 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3697 c = *uni2; 3698 if (c >= limit) { 3699 raise_encode_exception(&exc, encoding, startp, size, 3700 unicodepos, unicodepos+1, reason); 3701 Py_DECREF(repunicode); 3702 goto onError; 3703 } 3704 *str = (char)c; 3705 } 3706 p = startp + newpos; 3707 Py_DECREF(repunicode); 3708 } 3709 } 3710 } 3711 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res), 3712 str - PyByteArray_AS_STRING(res)); 3713 onError: 3714 Py_DECREF(res); 3715 Py_XDECREF(errorHandler); 3716 Py_XDECREF(exc); 3717 return result; 3718} 3719 3720PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3721 Py_ssize_t size, 3722 const char *errors) 3723{ 3724 return unicode_encode_ucs1(p, size, errors, 256); 3725} 3726 3727PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3728{ 3729 if (!PyUnicode_Check(unicode)) { 3730 PyErr_BadArgument(); 3731 return NULL; 3732 } 3733 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3734 PyUnicode_GET_SIZE(unicode), 3735 NULL); 3736} 3737 3738/* --- 7-bit ASCII Codec -------------------------------------------------- */ 3739 3740PyObject *PyUnicode_DecodeASCII(const char *s, 3741 Py_ssize_t size, 3742 
const char *errors) 3743{ 3744 const char *starts = s; 3745 PyUnicodeObject *v; 3746 Py_UNICODE *p; 3747 Py_ssize_t startinpos; 3748 Py_ssize_t endinpos; 3749 Py_ssize_t outpos; 3750 const char *e; 3751 PyObject *errorHandler = NULL; 3752 PyObject *exc = NULL; 3753 3754 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3755 if (size == 1 && *(unsigned char*)s < 128) { 3756 Py_UNICODE r = *(unsigned char*)s; 3757 return PyUnicode_FromUnicode(&r, 1); 3758 } 3759 3760 v = _PyUnicode_New(size); 3761 if (v == NULL) 3762 goto onError; 3763 if (size == 0) 3764 return (PyObject *)v; 3765 p = PyUnicode_AS_UNICODE(v); 3766 e = s + size; 3767 while (s < e) { 3768 register unsigned char c = (unsigned char)*s; 3769 if (c < 128) { 3770 *p++ = c; 3771 ++s; 3772 } 3773 else { 3774 startinpos = s-starts; 3775 endinpos = startinpos + 1; 3776 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3777 if (unicode_decode_call_errorhandler( 3778 errors, &errorHandler, 3779 "ascii", "ordinal not in range(128)", 3780 &starts, &e, &startinpos, &endinpos, &exc, &s, 3781 (PyObject **)&v, &outpos, &p)) 3782 goto onError; 3783 } 3784 } 3785 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3786 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3787 goto onError; 3788 Py_XDECREF(errorHandler); 3789 Py_XDECREF(exc); 3790 return (PyObject *)v; 3791 3792 onError: 3793 Py_XDECREF(v); 3794 Py_XDECREF(errorHandler); 3795 Py_XDECREF(exc); 3796 return NULL; 3797} 3798 3799PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3800 Py_ssize_t size, 3801 const char *errors) 3802{ 3803 return unicode_encode_ucs1(p, size, errors, 128); 3804} 3805 3806PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3807{ 3808 if (!PyUnicode_Check(unicode)) { 3809 PyErr_BadArgument(); 3810 return NULL; 3811 } 3812 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3813 PyUnicode_GET_SIZE(unicode), 3814 NULL); 3815} 3816 3817#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3818 
3819/* --- MBCS codecs for Windows -------------------------------------------- */ 3820 3821#if SIZEOF_INT < SIZEOF_SSIZE_T 3822#define NEED_RETRY 3823#endif 3824 3825/* XXX This code is limited to "true" double-byte encodings, as 3826 a) it assumes an incomplete character consists of a single byte, and 3827 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 3828 encodings, see IsDBCSLeadByteEx documentation. */ 3829 3830static int is_dbcs_lead_byte(const char *s, int offset) 3831{ 3832 const char *curr = s + offset; 3833 3834 if (IsDBCSLeadByte(*curr)) { 3835 const char *prev = CharPrev(s, curr); 3836 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3837 } 3838 return 0; 3839} 3840 3841/* 3842 * Decode MBCS string into unicode object. If 'final' is set, converts 3843 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 3844 */ 3845static int decode_mbcs(PyUnicodeObject **v, 3846 const char *s, /* MBCS string */ 3847 int size, /* sizeof MBCS string */ 3848 int final) 3849{ 3850 Py_UNICODE *p; 3851 Py_ssize_t n = 0; 3852 int usize = 0; 3853 3854 assert(size >= 0); 3855 3856 /* Skip trailing lead-byte unless 'final' is set */ 3857 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3858 --size; 3859 3860 /* First get the size of the result */ 3861 if (size > 0) { 3862 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3863 if (usize == 0) { 3864 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3865 return -1; 3866 } 3867 } 3868 3869 if (*v == NULL) { 3870 /* Create unicode object */ 3871 *v = _PyUnicode_New(usize); 3872 if (*v == NULL) 3873 return -1; 3874 } 3875 else { 3876 /* Extend unicode object */ 3877 n = PyUnicode_GET_SIZE(*v); 3878 if (_PyUnicode_Resize(v, n + usize) < 0) 3879 return -1; 3880 } 3881 3882 /* Do the conversion */ 3883 if (size > 0) { 3884 p = PyUnicode_AS_UNICODE(*v) + n; 3885 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3886 
PyErr_SetFromWindowsErrWithFilename(0, NULL); 3887 return -1; 3888 } 3889 } 3890 3891 return size; 3892} 3893 3894PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3895 Py_ssize_t size, 3896 const char *errors, 3897 Py_ssize_t *consumed) 3898{ 3899 PyUnicodeObject *v = NULL; 3900 int done; 3901 3902 if (consumed) 3903 *consumed = 0; 3904 3905#ifdef NEED_RETRY 3906 retry: 3907 if (size > INT_MAX) 3908 done = decode_mbcs(&v, s, INT_MAX, 0); 3909 else 3910#endif 3911 done = decode_mbcs(&v, s, (int)size, !consumed); 3912 3913 if (done < 0) { 3914 Py_XDECREF(v); 3915 return NULL; 3916 } 3917 3918 if (consumed) 3919 *consumed += done; 3920 3921#ifdef NEED_RETRY 3922 if (size > INT_MAX) { 3923 s += done; 3924 size -= done; 3925 goto retry; 3926 } 3927#endif 3928 3929 return (PyObject *)v; 3930} 3931 3932PyObject *PyUnicode_DecodeMBCS(const char *s, 3933 Py_ssize_t size, 3934 const char *errors) 3935{ 3936 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 3937} 3938 3939/* 3940 * Convert unicode into string object (MBCS). 3941 * Returns 0 if succeed, -1 otherwise. 
3942 */ 3943static int encode_mbcs(PyObject **repr, 3944 const Py_UNICODE *p, /* unicode */ 3945 int size) /* size of unicode */ 3946{ 3947 int mbcssize = 0; 3948 Py_ssize_t n = 0; 3949 3950 assert(size >= 0); 3951 3952 /* First get the size of the result */ 3953 if (size > 0) { 3954 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 3955 if (mbcssize == 0) { 3956 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3957 return -1; 3958 } 3959 } 3960 3961 if (*repr == NULL) { 3962 /* Create string object */ 3963 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 3964 if (*repr == NULL) 3965 return -1; 3966 } 3967 else { 3968 /* Extend string object */ 3969 n = PyBytes_Size(*repr); 3970 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 3971 return -1; 3972 } 3973 3974 /* Do the conversion */ 3975 if (size > 0) { 3976 char *s = PyBytes_AS_STRING(*repr) + n; 3977 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 3978 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3979 return -1; 3980 } 3981 } 3982 3983 return 0; 3984} 3985 3986PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 3987 Py_ssize_t size, 3988 const char *errors) 3989{ 3990 PyObject *repr = NULL; 3991 int ret; 3992 3993#ifdef NEED_RETRY 3994 retry: 3995 if (size > INT_MAX) 3996 ret = encode_mbcs(&repr, p, INT_MAX); 3997 else 3998#endif 3999 ret = encode_mbcs(&repr, p, (int)size); 4000 4001 if (ret < 0) { 4002 Py_XDECREF(repr); 4003 return NULL; 4004 } 4005 4006#ifdef NEED_RETRY 4007 if (size > INT_MAX) { 4008 p += INT_MAX; 4009 size -= INT_MAX; 4010 goto retry; 4011 } 4012#endif 4013 4014 return repr; 4015} 4016 4017PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4018{ 4019 if (!PyUnicode_Check(unicode)) { 4020 PyErr_BadArgument(); 4021 return NULL; 4022 } 4023 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4024 PyUnicode_GET_SIZE(unicode), 4025 NULL); 4026} 4027 4028#undef NEED_RETRY 4029 4030#endif /* MS_WINDOWS */ 4031 4032/* --- Character Mapping Codec 
/* --- Character Mapping Codec -------------------------------------------- */

/* Decode size bytes starting at s through a character mapping and return
   a new unicode object, or NULL on error.

   mapping selects how each input byte is translated:
     - NULL: the input is decoded as Latin-1.
     - an exact unicode object: the byte indexes into the string; a byte
       beyond the end of the string, or an entry equal to 0xfffe, counts
       as an undefined mapping.
     - any other object: it is subscripted with the byte value as an int.
       The item may be an int (must be in range(65536)), None (undefined
       mapping), or a unicode string of any length (empty drops the
       byte).  A LookupError from the subscript is treated like None.

   Undefined mappings are passed to unicode_decode_call_errorhandler()
   together with errors. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots left over from the last 1-n resize */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a plain unicode string used as a
           lookup table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* the helper receives the addresses of s, p and v and
                   so may update them; hence the continue without ++s */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* generic path: subscript the mapping for every byte */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s is passed by address above; do not advance it here */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first; remember the output offset
                           because the buffer may move */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus 4*targetsize */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result to the number of characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
/*tp_init*/ 4262 0, /*tp_alloc*/ 4263 0, /*tp_new*/ 4264 0, /*tp_free*/ 4265 0, /*tp_is_gc*/ 4266}; 4267 4268PyObject* 4269PyUnicode_BuildEncodingMap(PyObject* string) 4270{ 4271 Py_UNICODE *decode; 4272 PyObject *result; 4273 struct encoding_map *mresult; 4274 int i; 4275 int need_dict = 0; 4276 unsigned char level1[32]; 4277 unsigned char level2[512]; 4278 unsigned char *mlevel1, *mlevel2, *mlevel3; 4279 int count2 = 0, count3 = 0; 4280 4281 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 4282 PyErr_BadArgument(); 4283 return NULL; 4284 } 4285 decode = PyUnicode_AS_UNICODE(string); 4286 memset(level1, 0xFF, sizeof level1); 4287 memset(level2, 0xFF, sizeof level2); 4288 4289 /* If there isn't a one-to-one mapping of NULL to \0, 4290 or if there are non-BMP characters, we need to use 4291 a mapping dictionary. */ 4292 if (decode[0] != 0) 4293 need_dict = 1; 4294 for (i = 1; i < 256; i++) { 4295 int l1, l2; 4296 if (decode[i] == 0 4297 #ifdef Py_UNICODE_WIDE 4298 || decode[i] > 0xFFFF 4299 #endif 4300 ) { 4301 need_dict = 1; 4302 break; 4303 } 4304 if (decode[i] == 0xFFFE) 4305 /* unmapped character */ 4306 continue; 4307 l1 = decode[i] >> 11; 4308 l2 = decode[i] >> 7; 4309 if (level1[l1] == 0xFF) 4310 level1[l1] = count2++; 4311 if (level2[l2] == 0xFF) 4312 level2[l2] = count3++; 4313 } 4314 4315 if (count2 >= 0xFF || count3 >= 0xFF) 4316 need_dict = 1; 4317 4318 if (need_dict) { 4319 PyObject *result = PyDict_New(); 4320 PyObject *key, *value; 4321 if (!result) 4322 return NULL; 4323 for (i = 0; i < 256; i++) { 4324 key = value = NULL; 4325 key = PyLong_FromLong(decode[i]); 4326 value = PyLong_FromLong(i); 4327 if (!key || !value) 4328 goto failed1; 4329 if (PyDict_SetItem(result, key, value) == -1) 4330 goto failed1; 4331 Py_DECREF(key); 4332 Py_DECREF(value); 4333 } 4334 return result; 4335 failed1: 4336 Py_XDECREF(key); 4337 Py_XDECREF(value); 4338 Py_DECREF(result); 4339 return NULL; 4340 } 4341 4342 /* Create a three-level trie */ 4343 
result = PyObject_MALLOC(sizeof(struct encoding_map) + 4344 16*count2 + 128*count3 - 1); 4345 if (!result) 4346 return PyErr_NoMemory(); 4347 PyObject_Init(result, &EncodingMapType); 4348 mresult = (struct encoding_map*)result; 4349 mresult->count2 = count2; 4350 mresult->count3 = count3; 4351 mlevel1 = mresult->level1; 4352 mlevel2 = mresult->level23; 4353 mlevel3 = mresult->level23 + 16*count2; 4354 memcpy(mlevel1, level1, 32); 4355 memset(mlevel2, 0xFF, 16*count2); 4356 memset(mlevel3, 0, 128*count3); 4357 count3 = 0; 4358 for (i = 1; i < 256; i++) { 4359 int o1, o2, o3, i2, i3; 4360 if (decode[i] == 0xFFFE) 4361 /* unmapped character */ 4362 continue; 4363 o1 = decode[i]>>11; 4364 o2 = (decode[i]>>7) & 0xF; 4365 i2 = 16*mlevel1[o1] + o2; 4366 if (mlevel2[i2] == 0xFF) 4367 mlevel2[i2] = count3++; 4368 o3 = decode[i] & 0x7F; 4369 i3 = 128*mlevel2[i2] + o3; 4370 mlevel3[i3] = i; 4371 } 4372 return result; 4373} 4374 4375static int 4376encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4377{ 4378 struct encoding_map *map = (struct encoding_map*)mapping; 4379 int l1 = c>>11; 4380 int l2 = (c>>7) & 0xF; 4381 int l3 = c & 0x7F; 4382 int i; 4383 4384#ifdef Py_UNICODE_WIDE 4385 if (c > 0xFFFF) { 4386 return -1; 4387 } 4388#endif 4389 if (c == 0) 4390 return 0; 4391 /* level 1*/ 4392 i = map->level1[l1]; 4393 if (i == 0xFF) { 4394 return -1; 4395 } 4396 /* level 2*/ 4397 i = map->level23[16*i+l2]; 4398 if (i == 0xFF) { 4399 return -1; 4400 } 4401 /* level 3 */ 4402 i = map->level23[16*map->count2 + 128*i + l3]; 4403 if (i == 0) { 4404 return -1; 4405 } 4406 return i; 4407} 4408 4409/* Lookup the character ch in the mapping. If the character 4410 can't be found, Py_None is returned (or NULL, if another 4411 error occurred). 
*/ 4412static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4413{ 4414 PyObject *w = PyLong_FromLong((long)c); 4415 PyObject *x; 4416 4417 if (w == NULL) 4418 return NULL; 4419 x = PyObject_GetItem(mapping, w); 4420 Py_DECREF(w); 4421 if (x == NULL) { 4422 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4423 /* No mapping found means: mapping is undefined. */ 4424 PyErr_Clear(); 4425 x = Py_None; 4426 Py_INCREF(x); 4427 return x; 4428 } else 4429 return NULL; 4430 } 4431 else if (x == Py_None) 4432 return x; 4433 else if (PyLong_Check(x)) { 4434 long value = PyLong_AS_LONG(x); 4435 if (value < 0 || value > 255) { 4436 PyErr_SetString(PyExc_TypeError, 4437 "character mapping must be in range(256)"); 4438 Py_DECREF(x); 4439 return NULL; 4440 } 4441 return x; 4442 } 4443 else if (PyBytes_Check(x)) 4444 return x; 4445 else { 4446 /* wrong return value */ 4447 PyErr_Format(PyExc_TypeError, 4448 "character mapping must return integer, bytes or None, not %.400s", 4449 x->ob_type->tp_name); 4450 Py_DECREF(x); 4451 return NULL; 4452 } 4453} 4454 4455static int 4456charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4457{ 4458 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4459 /* exponentially overallocate to minimize reallocations */ 4460 if (requiredsize < 2*outsize) 4461 requiredsize = 2*outsize; 4462 if (_PyBytes_Resize(outobj, requiredsize)) 4463 return -1; 4464 return 0; 4465} 4466 4467typedef enum charmapencode_result { 4468 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4469}charmapencode_result; 4470/* lookup the character, put the result in the output string and adjust 4471 various state variables. Resize the output bytes object if not enough 4472 space is available. Return a new reference to the object that 4473 was put in the output buffer, or Py_None, if the mapping was undefined 4474 (in which case no character was written) or NULL, if a 4475 reallocation error occurred. 
The caller must decref the result */ 4476static 4477charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4478 PyObject **outobj, Py_ssize_t *outpos) 4479{ 4480 PyObject *rep; 4481 char *outstart; 4482 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4483 4484 if (Py_TYPE(mapping) == &EncodingMapType) { 4485 int res = encoding_map_lookup(c, mapping); 4486 Py_ssize_t requiredsize = *outpos+1; 4487 if (res == -1) 4488 return enc_FAILED; 4489 if (outsize<requiredsize) 4490 if (charmapencode_resize(outobj, outpos, requiredsize)) 4491 return enc_EXCEPTION; 4492 outstart = PyBytes_AS_STRING(*outobj); 4493 outstart[(*outpos)++] = (char)res; 4494 return enc_SUCCESS; 4495 } 4496 4497 rep = charmapencode_lookup(c, mapping); 4498 if (rep==NULL) 4499 return enc_EXCEPTION; 4500 else if (rep==Py_None) { 4501 Py_DECREF(rep); 4502 return enc_FAILED; 4503 } else { 4504 if (PyLong_Check(rep)) { 4505 Py_ssize_t requiredsize = *outpos+1; 4506 if (outsize<requiredsize) 4507 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4508 Py_DECREF(rep); 4509 return enc_EXCEPTION; 4510 } 4511 outstart = PyBytes_AS_STRING(*outobj); 4512 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 4513 } 4514 else { 4515 const char *repchars = PyBytes_AS_STRING(rep); 4516 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 4517 Py_ssize_t requiredsize = *outpos+repsize; 4518 if (outsize<requiredsize) 4519 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4520 Py_DECREF(rep); 4521 return enc_EXCEPTION; 4522 } 4523 outstart = PyBytes_AS_STRING(*outobj); 4524 memcpy(outstart + *outpos, repchars, repsize); 4525 *outpos += repsize; 4526 } 4527 } 4528 Py_DECREF(rep); 4529 return enc_SUCCESS; 4530} 4531 4532/* handle an error in PyUnicode_EncodeCharmap 4533 Return 0 on success, -1 on error */ 4534static 4535int charmap_encoding_error( 4536 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4537 PyObject **exceptionObject, 4538 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 4539 PyObject **res, Py_ssize_t *respos) 4540{ 4541 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4542 Py_ssize_t repsize; 4543 Py_ssize_t newpos; 4544 Py_UNICODE *uni2; 4545 /* startpos for collecting unencodable chars */ 4546 Py_ssize_t collstartpos = *inpos; 4547 Py_ssize_t collendpos = *inpos+1; 4548 Py_ssize_t collpos; 4549 char *encoding = "charmap"; 4550 char *reason = "character maps to <undefined>"; 4551 charmapencode_result x; 4552 4553 /* find all unencodable characters */ 4554 while (collendpos < size) { 4555 PyObject *rep; 4556 if (Py_TYPE(mapping) == &EncodingMapType) { 4557 int res = encoding_map_lookup(p[collendpos], mapping); 4558 if (res != -1) 4559 break; 4560 ++collendpos; 4561 continue; 4562 } 4563 4564 rep = charmapencode_lookup(p[collendpos], mapping); 4565 if (rep==NULL) 4566 return -1; 4567 else if (rep!=Py_None) { 4568 Py_DECREF(rep); 4569 break; 4570 } 4571 Py_DECREF(rep); 4572 ++collendpos; 4573 } 4574 /* cache callback name lookup 4575 * (if not done yet, i.e. 
it's the first error) */ 4576 if (*known_errorHandler==-1) { 4577 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4578 *known_errorHandler = 1; 4579 else if (!strcmp(errors, "replace")) 4580 *known_errorHandler = 2; 4581 else if (!strcmp(errors, "ignore")) 4582 *known_errorHandler = 3; 4583 else if (!strcmp(errors, "xmlcharrefreplace")) 4584 *known_errorHandler = 4; 4585 else 4586 *known_errorHandler = 0; 4587 } 4588 switch (*known_errorHandler) { 4589 case 1: /* strict */ 4590 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4591 return -1; 4592 case 2: /* replace */ 4593 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4594 x = charmapencode_output('?', mapping, res, respos); 4595 if (x==enc_EXCEPTION) { 4596 return -1; 4597 } 4598 else if (x==enc_FAILED) { 4599 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4600 return -1; 4601 } 4602 } 4603 /* fall through */ 4604 case 3: /* ignore */ 4605 *inpos = collendpos; 4606 break; 4607 case 4: /* xmlcharrefreplace */ 4608 /* generate replacement (temporarily (mis)uses p) */ 4609 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4610 char buffer[2+29+1+1]; 4611 char *cp; 4612 sprintf(buffer, "&#%d;", (int)p[collpos]); 4613 for (cp = buffer; *cp; ++cp) { 4614 x = charmapencode_output(*cp, mapping, res, respos); 4615 if (x==enc_EXCEPTION) 4616 return -1; 4617 else if (x==enc_FAILED) { 4618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4619 return -1; 4620 } 4621 } 4622 } 4623 *inpos = collendpos; 4624 break; 4625 default: 4626 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4627 encoding, reason, p, size, exceptionObject, 4628 collstartpos, collendpos, &newpos); 4629 if (repunicode == NULL) 4630 return -1; 4631 /* generate replacement */ 4632 repsize = PyUnicode_GET_SIZE(repunicode); 4633 for (uni2 = PyUnicode_AS_UNICODE(repunicode); 
repsize-->0; ++uni2) { 4634 x = charmapencode_output(*uni2, mapping, res, respos); 4635 if (x==enc_EXCEPTION) { 4636 return -1; 4637 } 4638 else if (x==enc_FAILED) { 4639 Py_DECREF(repunicode); 4640 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4641 return -1; 4642 } 4643 } 4644 *inpos = newpos; 4645 Py_DECREF(repunicode); 4646 } 4647 return 0; 4648} 4649 4650PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4651 Py_ssize_t size, 4652 PyObject *mapping, 4653 const char *errors) 4654{ 4655 /* output object */ 4656 PyObject *res = NULL; 4657 /* current input position */ 4658 Py_ssize_t inpos = 0; 4659 /* current output position */ 4660 Py_ssize_t respos = 0; 4661 PyObject *errorHandler = NULL; 4662 PyObject *exc = NULL; 4663 /* the following variable is used for caching string comparisons 4664 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4665 * 3=ignore, 4=xmlcharrefreplace */ 4666 int known_errorHandler = -1; 4667 4668 /* Default to Latin-1 */ 4669 if (mapping == NULL) 4670 return PyUnicode_EncodeLatin1(p, size, errors); 4671 4672 /* allocate enough for a simple encoding without 4673 replacements, if we need more, we'll resize */ 4674 res = PyBytes_FromStringAndSize(NULL, size); 4675 if (res == NULL) 4676 goto onError; 4677 if (size == 0) 4678 return res; 4679 4680 while (inpos<size) { 4681 /* try to encode it */ 4682 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 4683 if (x==enc_EXCEPTION) /* error */ 4684 goto onError; 4685 if (x==enc_FAILED) { /* unencodable character */ 4686 if (charmap_encoding_error(p, size, &inpos, mapping, 4687 &exc, 4688 &known_errorHandler, &errorHandler, errors, 4689 &res, &respos)) { 4690 goto onError; 4691 } 4692 } 4693 else 4694 /* done with this character => adjust input position */ 4695 ++inpos; 4696 } 4697 4698 /* Resize if we allocated to much */ 4699 if (respos<PyBytes_GET_SIZE(res)) 4700 _PyBytes_Resize(&res, respos); 4701 4702 
Py_XDECREF(exc); 4703 Py_XDECREF(errorHandler); 4704 return res; 4705 4706 onError: 4707 Py_XDECREF(res); 4708 Py_XDECREF(exc); 4709 Py_XDECREF(errorHandler); 4710 return NULL; 4711} 4712 4713PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4714 PyObject *mapping) 4715{ 4716 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4717 PyErr_BadArgument(); 4718 return NULL; 4719 } 4720 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4721 PyUnicode_GET_SIZE(unicode), 4722 mapping, 4723 NULL); 4724} 4725 4726/* create or adjust a UnicodeTranslateError */ 4727static void make_translate_exception(PyObject **exceptionObject, 4728 const Py_UNICODE *unicode, Py_ssize_t size, 4729 Py_ssize_t startpos, Py_ssize_t endpos, 4730 const char *reason) 4731{ 4732 if (*exceptionObject == NULL) { 4733 *exceptionObject = PyUnicodeTranslateError_Create( 4734 unicode, size, startpos, endpos, reason); 4735 } 4736 else { 4737 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4738 goto onError; 4739 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4740 goto onError; 4741 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4742 goto onError; 4743 return; 4744 onError: 4745 Py_DECREF(*exceptionObject); 4746 *exceptionObject = NULL; 4747 } 4748} 4749 4750/* raises a UnicodeTranslateError */ 4751static void raise_translate_exception(PyObject **exceptionObject, 4752 const Py_UNICODE *unicode, Py_ssize_t size, 4753 Py_ssize_t startpos, Py_ssize_t endpos, 4754 const char *reason) 4755{ 4756 make_translate_exception(exceptionObject, 4757 unicode, size, startpos, endpos, reason); 4758 if (*exceptionObject != NULL) 4759 PyCodec_StrictErrors(*exceptionObject); 4760} 4761 4762/* error handling callback helper: 4763 build arguments, call the callback and check the arguments, 4764 put the result into newpos and return the replacement string, which 4765 has to be freed by the caller */ 4766static PyObject *unicode_translate_call_errorhandler(const 
char *errors, 4767 PyObject **errorHandler, 4768 const char *reason, 4769 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4770 Py_ssize_t startpos, Py_ssize_t endpos, 4771 Py_ssize_t *newpos) 4772{ 4773 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4774 4775 Py_ssize_t i_newpos; 4776 PyObject *restuple; 4777 PyObject *resunicode; 4778 4779 if (*errorHandler == NULL) { 4780 *errorHandler = PyCodec_LookupError(errors); 4781 if (*errorHandler == NULL) 4782 return NULL; 4783 } 4784 4785 make_translate_exception(exceptionObject, 4786 unicode, size, startpos, endpos, reason); 4787 if (*exceptionObject == NULL) 4788 return NULL; 4789 4790 restuple = PyObject_CallFunctionObjArgs( 4791 *errorHandler, *exceptionObject, NULL); 4792 if (restuple == NULL) 4793 return NULL; 4794 if (!PyTuple_Check(restuple)) { 4795 PyErr_Format(PyExc_TypeError, &argparse[4]); 4796 Py_DECREF(restuple); 4797 return NULL; 4798 } 4799 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4800 &resunicode, &i_newpos)) { 4801 Py_DECREF(restuple); 4802 return NULL; 4803 } 4804 if (i_newpos<0) 4805 *newpos = size+i_newpos; 4806 else 4807 *newpos = i_newpos; 4808 if (*newpos<0 || *newpos>size) { 4809 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4810 Py_DECREF(restuple); 4811 return NULL; 4812 } 4813 Py_INCREF(resunicode); 4814 Py_DECREF(restuple); 4815 return resunicode; 4816} 4817 4818/* Lookup the character ch in the mapping and put the result in result, 4819 which must be decrefed by the caller. 
4820 Return 0 on success, -1 on error */ 4821static 4822int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4823{ 4824 PyObject *w = PyLong_FromLong((long)c); 4825 PyObject *x; 4826 4827 if (w == NULL) 4828 return -1; 4829 x = PyObject_GetItem(mapping, w); 4830 Py_DECREF(w); 4831 if (x == NULL) { 4832 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4833 /* No mapping found means: use 1:1 mapping. */ 4834 PyErr_Clear(); 4835 *result = NULL; 4836 return 0; 4837 } else 4838 return -1; 4839 } 4840 else if (x == Py_None) { 4841 *result = x; 4842 return 0; 4843 } 4844 else if (PyLong_Check(x)) { 4845 long value = PyLong_AS_LONG(x); 4846 long max = PyUnicode_GetMax(); 4847 if (value < 0 || value > max) { 4848 PyErr_Format(PyExc_TypeError, 4849 "character mapping must be in range(0x%x)", max+1); 4850 Py_DECREF(x); 4851 return -1; 4852 } 4853 *result = x; 4854 return 0; 4855 } 4856 else if (PyUnicode_Check(x)) { 4857 *result = x; 4858 return 0; 4859 } 4860 else { 4861 /* wrong return value */ 4862 PyErr_SetString(PyExc_TypeError, 4863 "character mapping must return integer, None or unicode"); 4864 Py_DECREF(x); 4865 return -1; 4866 } 4867} 4868/* ensure that *outobj is at least requiredsize characters long, 4869if not reallocate and adjust various state variables. 
4870Return 0 on success, -1 on error */ 4871static 4872int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4873 Py_ssize_t requiredsize) 4874{ 4875 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4876 if (requiredsize > oldsize) { 4877 /* remember old output position */ 4878 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4879 /* exponentially overallocate to minimize reallocations */ 4880 if (requiredsize < 2 * oldsize) 4881 requiredsize = 2 * oldsize; 4882 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 4883 return -1; 4884 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4885 } 4886 return 0; 4887} 4888/* lookup the character, put the result in the output string and adjust 4889 various state variables. Return a new reference to the object that 4890 was put in the output buffer in *result, or Py_None, if the mapping was 4891 undefined (in which case no character was written). 4892 The called must decref result. 4893 Return 0 on success, -1 on error. */ 4894static 4895int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 4896 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 4897 PyObject **res) 4898{ 4899 if (charmaptranslate_lookup(*curinp, mapping, res)) 4900 return -1; 4901 if (*res==NULL) { 4902 /* not found => default to 1:1 mapping */ 4903 *(*outp)++ = *curinp; 4904 } 4905 else if (*res==Py_None) 4906 ; 4907 else if (PyLong_Check(*res)) { 4908 /* no overflow check, because we know that the space is enough */ 4909 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 4910 } 4911 else if (PyUnicode_Check(*res)) { 4912 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 4913 if (repsize==1) { 4914 /* no overflow check, because we know that the space is enough */ 4915 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 4916 } 4917 else if (repsize!=0) { 4918 /* more than one character */ 4919 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 4920 (insize - (curinp-startinp)) + 4921 
repsize - 1; 4922 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 4923 return -1; 4924 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 4925 *outp += repsize; 4926 } 4927 } 4928 else 4929 return -1; 4930 return 0; 4931} 4932 4933PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 4934 Py_ssize_t size, 4935 PyObject *mapping, 4936 const char *errors) 4937{ 4938 /* output object */ 4939 PyObject *res = NULL; 4940 /* pointers to the beginning and end+1 of input */ 4941 const Py_UNICODE *startp = p; 4942 const Py_UNICODE *endp = p + size; 4943 /* pointer into the output */ 4944 Py_UNICODE *str; 4945 /* current output position */ 4946 Py_ssize_t respos = 0; 4947 char *reason = "character maps to <undefined>"; 4948 PyObject *errorHandler = NULL; 4949 PyObject *exc = NULL; 4950 /* the following variable is used for caching string comparisons 4951 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4952 * 3=ignore, 4=xmlcharrefreplace */ 4953 int known_errorHandler = -1; 4954 4955 if (mapping == NULL) { 4956 PyErr_BadArgument(); 4957 return NULL; 4958 } 4959 4960 /* allocate enough for a simple 1:1 translation without 4961 replacements, if we need more, we'll resize */ 4962 res = PyUnicode_FromUnicode(NULL, size); 4963 if (res == NULL) 4964 goto onError; 4965 if (size == 0) 4966 return res; 4967 str = PyUnicode_AS_UNICODE(res); 4968 4969 while (p<endp) { 4970 /* try to encode it */ 4971 PyObject *x = NULL; 4972 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 4973 Py_XDECREF(x); 4974 goto onError; 4975 } 4976 Py_XDECREF(x); 4977 if (x!=Py_None) /* it worked => adjust input pointer */ 4978 ++p; 4979 else { /* untranslatable character */ 4980 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4981 Py_ssize_t repsize; 4982 Py_ssize_t newpos; 4983 Py_UNICODE *uni2; 4984 /* startpos for collecting untranslatable chars */ 4985 const Py_UNICODE *collstart = p; 4986 const Py_UNICODE *collend = p+1; 
4987 const Py_UNICODE *coll; 4988 4989 /* find all untranslatable characters */ 4990 while (collend < endp) { 4991 if (charmaptranslate_lookup(*collend, mapping, &x)) 4992 goto onError; 4993 Py_XDECREF(x); 4994 if (x!=Py_None) 4995 break; 4996 ++collend; 4997 } 4998 /* cache callback name lookup 4999 * (if not done yet, i.e. it's the first error) */ 5000 if (known_errorHandler==-1) { 5001 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5002 known_errorHandler = 1; 5003 else if (!strcmp(errors, "replace")) 5004 known_errorHandler = 2; 5005 else if (!strcmp(errors, "ignore")) 5006 known_errorHandler = 3; 5007 else if (!strcmp(errors, "xmlcharrefreplace")) 5008 known_errorHandler = 4; 5009 else 5010 known_errorHandler = 0; 5011 } 5012 switch (known_errorHandler) { 5013 case 1: /* strict */ 5014 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5015 goto onError; 5016 case 2: /* replace */ 5017 /* No need to check for space, this is a 1:1 replacement */ 5018 for (coll = collstart; coll<collend; ++coll) 5019 *str++ = '?'; 5020 /* fall through */ 5021 case 3: /* ignore */ 5022 p = collend; 5023 break; 5024 case 4: /* xmlcharrefreplace */ 5025 /* generate replacement (temporarily (mis)uses p) */ 5026 for (p = collstart; p < collend; ++p) { 5027 char buffer[2+29+1+1]; 5028 char *cp; 5029 sprintf(buffer, "&#%d;", (int)*p); 5030 if (charmaptranslate_makespace(&res, &str, 5031 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5032 goto onError; 5033 for (cp = buffer; *cp; ++cp) 5034 *str++ = *cp; 5035 } 5036 p = collend; 5037 break; 5038 default: 5039 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5040 reason, startp, size, &exc, 5041 collstart-startp, collend-startp, &newpos); 5042 if (repunicode == NULL) 5043 goto onError; 5044 /* generate replacement */ 5045 repsize = PyUnicode_GET_SIZE(repunicode); 5046 if (charmaptranslate_makespace(&res, &str, 5047 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5048 Py_DECREF(repunicode); 5049 goto onError; 5050 } 5051 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5052 *str++ = *uni2; 5053 p = startp + newpos; 5054 Py_DECREF(repunicode); 5055 } 5056 } 5057 } 5058 /* Resize if we allocated to much */ 5059 respos = str-PyUnicode_AS_UNICODE(res); 5060 if (respos<PyUnicode_GET_SIZE(res)) { 5061 if (_PyUnicode_Resize(&res, respos) < 0) 5062 goto onError; 5063 } 5064 Py_XDECREF(exc); 5065 Py_XDECREF(errorHandler); 5066 return res; 5067 5068 onError: 5069 Py_XDECREF(res); 5070 Py_XDECREF(exc); 5071 Py_XDECREF(errorHandler); 5072 return NULL; 5073} 5074 5075PyObject *PyUnicode_Translate(PyObject *str, 5076 PyObject *mapping, 5077 const char *errors) 5078{ 5079 PyObject *result; 5080 5081 str = PyUnicode_FromObject(str); 5082 if (str == NULL) 5083 goto onError; 5084 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5085 PyUnicode_GET_SIZE(str), 5086 mapping, 5087 errors); 5088 Py_DECREF(str); 5089 return result; 5090 5091 onError: 5092 Py_XDECREF(str); 5093 return NULL; 5094} 5095 5096/* --- Decimal Encoder ---------------------------------------------------- */ 5097 5098int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5099 Py_ssize_t length, 5100 char *output, 5101 const char *errors) 5102{ 5103 Py_UNICODE *p, *end; 5104 PyObject *errorHandler = NULL; 5105 PyObject *exc = NULL; 5106 const char *encoding = "decimal"; 5107 const char *reason = "invalid decimal Unicode string"; 5108 /* the following variable is used for caching string comparisons 5109 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5110 int known_errorHandler = -1; 5111 5112 if (output == NULL) { 5113 PyErr_BadArgument(); 5114 return -1; 5115 } 5116 5117 p = s; 5118 end = s + length; 5119 while (p < end) { 5120 register Py_UNICODE ch = *p; 5121 int decimal; 5122 PyObject *repunicode; 5123 Py_ssize_t repsize; 5124 Py_ssize_t newpos; 5125 
Py_UNICODE *uni2; 5126 Py_UNICODE *collstart; 5127 Py_UNICODE *collend; 5128 5129 if (Py_UNICODE_ISSPACE(ch)) { 5130 *output++ = ' '; 5131 ++p; 5132 continue; 5133 } 5134 decimal = Py_UNICODE_TODECIMAL(ch); 5135 if (decimal >= 0) { 5136 *output++ = '0' + decimal; 5137 ++p; 5138 continue; 5139 } 5140 if (0 < ch && ch < 256) { 5141 *output++ = (char)ch; 5142 ++p; 5143 continue; 5144 } 5145 /* All other characters are considered unencodable */ 5146 collstart = p; 5147 collend = p+1; 5148 while (collend < end) { 5149 if ((0 < *collend && *collend < 256) || 5150 !Py_UNICODE_ISSPACE(*collend) || 5151 Py_UNICODE_TODECIMAL(*collend)) 5152 break; 5153 } 5154 /* cache callback name lookup 5155 * (if not done yet, i.e. it's the first error) */ 5156 if (known_errorHandler==-1) { 5157 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5158 known_errorHandler = 1; 5159 else if (!strcmp(errors, "replace")) 5160 known_errorHandler = 2; 5161 else if (!strcmp(errors, "ignore")) 5162 known_errorHandler = 3; 5163 else if (!strcmp(errors, "xmlcharrefreplace")) 5164 known_errorHandler = 4; 5165 else 5166 known_errorHandler = 0; 5167 } 5168 switch (known_errorHandler) { 5169 case 1: /* strict */ 5170 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5171 goto onError; 5172 case 2: /* replace */ 5173 for (p = collstart; p < collend; ++p) 5174 *output++ = '?'; 5175 /* fall through */ 5176 case 3: /* ignore */ 5177 p = collend; 5178 break; 5179 case 4: /* xmlcharrefreplace */ 5180 /* generate replacement (temporarily (mis)uses p) */ 5181 for (p = collstart; p < collend; ++p) 5182 output += sprintf(output, "&#%d;", (int)*p); 5183 p = collend; 5184 break; 5185 default: 5186 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5187 encoding, reason, s, length, &exc, 5188 collstart-s, collend-s, &newpos); 5189 if (repunicode == NULL) 5190 goto onError; 5191 /* generate replacement */ 5192 repsize = PyUnicode_GET_SIZE(repunicode); 5193 for (uni2 = 
PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5194 Py_UNICODE ch = *uni2; 5195 if (Py_UNICODE_ISSPACE(ch)) 5196 *output++ = ' '; 5197 else { 5198 decimal = Py_UNICODE_TODECIMAL(ch); 5199 if (decimal >= 0) 5200 *output++ = '0' + decimal; 5201 else if (0 < ch && ch < 256) 5202 *output++ = (char)ch; 5203 else { 5204 Py_DECREF(repunicode); 5205 raise_encode_exception(&exc, encoding, 5206 s, length, collstart-s, collend-s, reason); 5207 goto onError; 5208 } 5209 } 5210 } 5211 p = s + newpos; 5212 Py_DECREF(repunicode); 5213 } 5214 } 5215 /* 0-terminate the output string */ 5216 *output++ = '\0'; 5217 Py_XDECREF(exc); 5218 Py_XDECREF(errorHandler); 5219 return 0; 5220 5221 onError: 5222 Py_XDECREF(exc); 5223 Py_XDECREF(errorHandler); 5224 return -1; 5225} 5226 5227/* --- Helpers ------------------------------------------------------------ */ 5228 5229#include "stringlib/unicodedefs.h" 5230#include "stringlib/fastsearch.h" 5231#include "stringlib/count.h" 5232/* Include _ParseTupleFinds from find.h */ 5233#define FROM_UNICODE 5234#include "stringlib/find.h" 5235#include "stringlib/partition.h" 5236 5237#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 5238#include "stringlib/localeutil.h" 5239 5240/* helper macro to fixup start/end slice values */ 5241#define FIX_START_END(obj) \ 5242 if (start < 0) \ 5243 start += (obj)->length; \ 5244 if (start < 0) \ 5245 start = 0; \ 5246 if (end > (obj)->length) \ 5247 end = (obj)->length; \ 5248 if (end < 0) \ 5249 end += (obj)->length; \ 5250 if (end < 0) \ 5251 end = 0; 5252 5253Py_ssize_t PyUnicode_Count(PyObject *str, 5254 PyObject *substr, 5255 Py_ssize_t start, 5256 Py_ssize_t end) 5257{ 5258 Py_ssize_t result; 5259 PyUnicodeObject* str_obj; 5260 PyUnicodeObject* sub_obj; 5261 5262 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5263 if (!str_obj) 5264 return -1; 5265 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5266 if (!sub_obj) { 5267 Py_DECREF(str_obj); 5268 return -1; 
5269 } 5270 5271 FIX_START_END(str_obj); 5272 5273 result = stringlib_count( 5274 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5275 ); 5276 5277 Py_DECREF(sub_obj); 5278 Py_DECREF(str_obj); 5279 5280 return result; 5281} 5282 5283Py_ssize_t PyUnicode_Find(PyObject *str, 5284 PyObject *sub, 5285 Py_ssize_t start, 5286 Py_ssize_t end, 5287 int direction) 5288{ 5289 Py_ssize_t result; 5290 5291 str = PyUnicode_FromObject(str); 5292 if (!str) 5293 return -2; 5294 sub = PyUnicode_FromObject(sub); 5295 if (!sub) { 5296 Py_DECREF(str); 5297 return -2; 5298 } 5299 5300 if (direction > 0) 5301 result = stringlib_find_slice( 5302 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5303 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5304 start, end 5305 ); 5306 else 5307 result = stringlib_rfind_slice( 5308 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5309 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5310 start, end 5311 ); 5312 5313 Py_DECREF(str); 5314 Py_DECREF(sub); 5315 5316 return result; 5317} 5318 5319static 5320int tailmatch(PyUnicodeObject *self, 5321 PyUnicodeObject *substring, 5322 Py_ssize_t start, 5323 Py_ssize_t end, 5324 int direction) 5325{ 5326 if (substring->length == 0) 5327 return 1; 5328 5329 FIX_START_END(self); 5330 5331 end -= substring->length; 5332 if (end < start) 5333 return 0; 5334 5335 if (direction > 0) { 5336 if (Py_UNICODE_MATCH(self, end, substring)) 5337 return 1; 5338 } else { 5339 if (Py_UNICODE_MATCH(self, start, substring)) 5340 return 1; 5341 } 5342 5343 return 0; 5344} 5345 5346Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5347 PyObject *substr, 5348 Py_ssize_t start, 5349 Py_ssize_t end, 5350 int direction) 5351{ 5352 Py_ssize_t result; 5353 5354 str = PyUnicode_FromObject(str); 5355 if (str == NULL) 5356 return -1; 5357 substr = PyUnicode_FromObject(substr); 5358 if (substr == NULL) { 5359 Py_DECREF(str); 5360 return -1; 5361 } 5362 5363 result = tailmatch((PyUnicodeObject *)str, 5364 
(PyUnicodeObject *)substr, 5365 start, end, direction); 5366 Py_DECREF(str); 5367 Py_DECREF(substr); 5368 return result; 5369} 5370 5371/* Apply fixfct filter to the Unicode object self and return a 5372 reference to the modified object */ 5373 5374static 5375PyObject *fixup(PyUnicodeObject *self, 5376 int (*fixfct)(PyUnicodeObject *s)) 5377{ 5378 5379 PyUnicodeObject *u; 5380 5381 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5382 if (u == NULL) 5383 return NULL; 5384 5385 Py_UNICODE_COPY(u->str, self->str, self->length); 5386 5387 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5388 /* fixfct should return TRUE if it modified the buffer. If 5389 FALSE, return a reference to the original buffer instead 5390 (to save space, not time) */ 5391 Py_INCREF(self); 5392 Py_DECREF(u); 5393 return (PyObject*) self; 5394 } 5395 return (PyObject*) u; 5396} 5397 5398static 5399int fixupper(PyUnicodeObject *self) 5400{ 5401 Py_ssize_t len = self->length; 5402 Py_UNICODE *s = self->str; 5403 int status = 0; 5404 5405 while (len-- > 0) { 5406 register Py_UNICODE ch; 5407 5408 ch = Py_UNICODE_TOUPPER(*s); 5409 if (ch != *s) { 5410 status = 1; 5411 *s = ch; 5412 } 5413 s++; 5414 } 5415 5416 return status; 5417} 5418 5419static 5420int fixlower(PyUnicodeObject *self) 5421{ 5422 Py_ssize_t len = self->length; 5423 Py_UNICODE *s = self->str; 5424 int status = 0; 5425 5426 while (len-- > 0) { 5427 register Py_UNICODE ch; 5428 5429 ch = Py_UNICODE_TOLOWER(*s); 5430 if (ch != *s) { 5431 status = 1; 5432 *s = ch; 5433 } 5434 s++; 5435 } 5436 5437 return status; 5438} 5439 5440static 5441int fixswapcase(PyUnicodeObject *self) 5442{ 5443 Py_ssize_t len = self->length; 5444 Py_UNICODE *s = self->str; 5445 int status = 0; 5446 5447 while (len-- > 0) { 5448 if (Py_UNICODE_ISUPPER(*s)) { 5449 *s = Py_UNICODE_TOLOWER(*s); 5450 status = 1; 5451 } else if (Py_UNICODE_ISLOWER(*s)) { 5452 *s = Py_UNICODE_TOUPPER(*s); 5453 status = 1; 5454 } 5455 s++; 5456 } 5457 5458 return 
status; 5459} 5460 5461static 5462int fixcapitalize(PyUnicodeObject *self) 5463{ 5464 Py_ssize_t len = self->length; 5465 Py_UNICODE *s = self->str; 5466 int status = 0; 5467 5468 if (len == 0) 5469 return 0; 5470 if (Py_UNICODE_ISLOWER(*s)) { 5471 *s = Py_UNICODE_TOUPPER(*s); 5472 status = 1; 5473 } 5474 s++; 5475 while (--len > 0) { 5476 if (Py_UNICODE_ISUPPER(*s)) { 5477 *s = Py_UNICODE_TOLOWER(*s); 5478 status = 1; 5479 } 5480 s++; 5481 } 5482 return status; 5483} 5484 5485static 5486int fixtitle(PyUnicodeObject *self) 5487{ 5488 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5489 register Py_UNICODE *e; 5490 int previous_is_cased; 5491 5492 /* Shortcut for single character strings */ 5493 if (PyUnicode_GET_SIZE(self) == 1) { 5494 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5495 if (*p != ch) { 5496 *p = ch; 5497 return 1; 5498 } 5499 else 5500 return 0; 5501 } 5502 5503 e = p + PyUnicode_GET_SIZE(self); 5504 previous_is_cased = 0; 5505 for (; p < e; p++) { 5506 register const Py_UNICODE ch = *p; 5507 5508 if (previous_is_cased) 5509 *p = Py_UNICODE_TOLOWER(ch); 5510 else 5511 *p = Py_UNICODE_TOTITLE(ch); 5512 5513 if (Py_UNICODE_ISLOWER(ch) || 5514 Py_UNICODE_ISUPPER(ch) || 5515 Py_UNICODE_ISTITLE(ch)) 5516 previous_is_cased = 1; 5517 else 5518 previous_is_cased = 0; 5519 } 5520 return 1; 5521} 5522 5523PyObject * 5524PyUnicode_Join(PyObject *separator, PyObject *seq) 5525{ 5526 PyObject *internal_separator = NULL; 5527 const Py_UNICODE blank = ' '; 5528 const Py_UNICODE *sep = ␣ 5529 Py_ssize_t seplen = 1; 5530 PyUnicodeObject *res = NULL; /* the result */ 5531 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5532 Py_ssize_t res_used; /* # used bytes */ 5533 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5534 PyObject *fseq; /* PySequence_Fast(seq) */ 5535 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5536 PyObject *item; 5537 Py_ssize_t i; 5538 5539 fseq = PySequence_Fast(seq, ""); 5540 if (fseq 
== NULL) { 5541 return NULL; 5542 } 5543 5544 /* Grrrr. A codec may be invoked to convert str objects to 5545 * Unicode, and so it's possible to call back into Python code 5546 * during PyUnicode_FromObject(), and so it's possible for a sick 5547 * codec to change the size of fseq (if seq is a list). Therefore 5548 * we have to keep refetching the size -- can't assume seqlen 5549 * is invariant. 5550 */ 5551 seqlen = PySequence_Fast_GET_SIZE(fseq); 5552 /* If empty sequence, return u"". */ 5553 if (seqlen == 0) { 5554 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5555 goto Done; 5556 } 5557 /* If singleton sequence with an exact Unicode, return that. */ 5558 if (seqlen == 1) { 5559 item = PySequence_Fast_GET_ITEM(fseq, 0); 5560 if (PyUnicode_CheckExact(item)) { 5561 Py_INCREF(item); 5562 res = (PyUnicodeObject *)item; 5563 goto Done; 5564 } 5565 } 5566 5567 /* At least two items to join, or one that isn't exact Unicode. */ 5568 if (seqlen > 1) { 5569 /* Set up sep and seplen -- they're needed. */ 5570 if (separator == NULL) { 5571 sep = ␣ 5572 seplen = 1; 5573 } 5574 else { 5575 internal_separator = PyUnicode_FromObject(separator); 5576 if (internal_separator == NULL) 5577 goto onError; 5578 sep = PyUnicode_AS_UNICODE(internal_separator); 5579 seplen = PyUnicode_GET_SIZE(internal_separator); 5580 /* In case PyUnicode_FromObject() mutated seq. */ 5581 seqlen = PySequence_Fast_GET_SIZE(fseq); 5582 } 5583 } 5584 5585 /* Get space. */ 5586 res = _PyUnicode_New(res_alloc); 5587 if (res == NULL) 5588 goto onError; 5589 res_p = PyUnicode_AS_UNICODE(res); 5590 res_used = 0; 5591 5592 for (i = 0; i < seqlen; ++i) { 5593 Py_ssize_t itemlen; 5594 Py_ssize_t new_res_used; 5595 5596 item = PySequence_Fast_GET_ITEM(fseq, i); 5597 /* Convert item to Unicode. 
*/ 5598 if (!PyUnicode_Check(item)) { 5599 PyErr_Format(PyExc_TypeError, 5600 "sequence item %zd: expected str instance," 5601 " %.80s found", 5602 i, Py_TYPE(item)->tp_name); 5603 goto onError; 5604 } 5605 item = PyUnicode_FromObject(item); 5606 if (item == NULL) 5607 goto onError; 5608 /* We own a reference to item from here on. */ 5609 5610 /* In case PyUnicode_FromObject() mutated seq. */ 5611 seqlen = PySequence_Fast_GET_SIZE(fseq); 5612 5613 /* Make sure we have enough space for the separator and the item. */ 5614 itemlen = PyUnicode_GET_SIZE(item); 5615 new_res_used = res_used + itemlen; 5616 if (new_res_used < 0) 5617 goto Overflow; 5618 if (i < seqlen - 1) { 5619 new_res_used += seplen; 5620 if (new_res_used < 0) 5621 goto Overflow; 5622 } 5623 if (new_res_used > res_alloc) { 5624 /* double allocated size until it's big enough */ 5625 do { 5626 res_alloc += res_alloc; 5627 if (res_alloc <= 0) 5628 goto Overflow; 5629 } while (new_res_used > res_alloc); 5630 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5631 Py_DECREF(item); 5632 goto onError; 5633 } 5634 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5635 } 5636 5637 /* Copy item, and maybe the separator. */ 5638 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5639 res_p += itemlen; 5640 if (i < seqlen - 1) { 5641 Py_UNICODE_COPY(res_p, sep, seplen); 5642 res_p += seplen; 5643 } 5644 Py_DECREF(item); 5645 res_used = new_res_used; 5646 } 5647 5648 /* Shrink res to match the used area; this probably can't fail, 5649 * but it's cheap to check. 
5650 */ 5651 if (_PyUnicode_Resize(&res, res_used) < 0) 5652 goto onError; 5653 5654 Done: 5655 Py_XDECREF(internal_separator); 5656 Py_DECREF(fseq); 5657 return (PyObject *)res; 5658 5659 Overflow: 5660 PyErr_SetString(PyExc_OverflowError, 5661 "join() result is too long for a Python string"); 5662 Py_DECREF(item); 5663 /* fall through */ 5664 5665 onError: 5666 Py_XDECREF(internal_separator); 5667 Py_DECREF(fseq); 5668 Py_XDECREF(res); 5669 return NULL; 5670} 5671 5672static 5673PyUnicodeObject *pad(PyUnicodeObject *self, 5674 Py_ssize_t left, 5675 Py_ssize_t right, 5676 Py_UNICODE fill) 5677{ 5678 PyUnicodeObject *u; 5679 5680 if (left < 0) 5681 left = 0; 5682 if (right < 0) 5683 right = 0; 5684 5685 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5686 Py_INCREF(self); 5687 return self; 5688 } 5689 5690 u = _PyUnicode_New(left + self->length + right); 5691 if (u) { 5692 if (left) 5693 Py_UNICODE_FILL(u->str, fill, left); 5694 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5695 if (right) 5696 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5697 } 5698 5699 return u; 5700} 5701 5702#define SPLIT_APPEND(data, left, right) \ 5703 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5704 if (!str) \ 5705 goto onError; \ 5706 if (PyList_Append(list, str)) { \ 5707 Py_DECREF(str); \ 5708 goto onError; \ 5709 } \ 5710 else \ 5711 Py_DECREF(str); 5712 5713static 5714PyObject *split_whitespace(PyUnicodeObject *self, 5715 PyObject *list, 5716 Py_ssize_t maxcount) 5717{ 5718 register Py_ssize_t i; 5719 register Py_ssize_t j; 5720 Py_ssize_t len = self->length; 5721 PyObject *str; 5722 register const Py_UNICODE *buf = self->str; 5723 5724 for (i = j = 0; i < len; ) { 5725 /* find a token */ 5726 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5727 i++; 5728 j = i; 5729 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) 5730 i++; 5731 if (j < i) { 5732 if (maxcount-- <= 0) 5733 break; 5734 SPLIT_APPEND(buf, j, i); 5735 while (i < len 
&& Py_UNICODE_ISSPACE(buf[i])) 5736 i++; 5737 j = i; 5738 } 5739 } 5740 if (j < len) { 5741 SPLIT_APPEND(buf, j, len); 5742 } 5743 return list; 5744 5745 onError: 5746 Py_DECREF(list); 5747 return NULL; 5748} 5749 5750PyObject *PyUnicode_Splitlines(PyObject *string, 5751 int keepends) 5752{ 5753 register Py_ssize_t i; 5754 register Py_ssize_t j; 5755 Py_ssize_t len; 5756 PyObject *list; 5757 PyObject *str; 5758 Py_UNICODE *data; 5759 5760 string = PyUnicode_FromObject(string); 5761 if (string == NULL) 5762 return NULL; 5763 data = PyUnicode_AS_UNICODE(string); 5764 len = PyUnicode_GET_SIZE(string); 5765 5766 list = PyList_New(0); 5767 if (!list) 5768 goto onError; 5769 5770 for (i = j = 0; i < len; ) { 5771 Py_ssize_t eol; 5772 5773 /* Find a line and append it */ 5774 while (i < len && !BLOOM_LINEBREAK(data[i])) 5775 i++; 5776 5777 /* Skip the line break reading CRLF as one line break */ 5778 eol = i; 5779 if (i < len) { 5780 if (data[i] == '\r' && i + 1 < len && 5781 data[i+1] == '\n') 5782 i += 2; 5783 else 5784 i++; 5785 if (keepends) 5786 eol = i; 5787 } 5788 SPLIT_APPEND(data, j, eol); 5789 j = i; 5790 } 5791 if (j < len) { 5792 SPLIT_APPEND(data, j, len); 5793 } 5794 5795 Py_DECREF(string); 5796 return list; 5797 5798 onError: 5799 Py_XDECREF(list); 5800 Py_DECREF(string); 5801 return NULL; 5802} 5803 5804static 5805PyObject *split_char(PyUnicodeObject *self, 5806 PyObject *list, 5807 Py_UNICODE ch, 5808 Py_ssize_t maxcount) 5809{ 5810 register Py_ssize_t i; 5811 register Py_ssize_t j; 5812 Py_ssize_t len = self->length; 5813 PyObject *str; 5814 register const Py_UNICODE *buf = self->str; 5815 5816 for (i = j = 0; i < len; ) { 5817 if (buf[i] == ch) { 5818 if (maxcount-- <= 0) 5819 break; 5820 SPLIT_APPEND(buf, j, i); 5821 i = j = i + 1; 5822 } else 5823 i++; 5824 } 5825 if (j <= len) { 5826 SPLIT_APPEND(buf, j, len); 5827 } 5828 return list; 5829 5830 onError: 5831 Py_DECREF(list); 5832 return NULL; 5833} 5834 5835static 5836PyObject 
*split_substring(PyUnicodeObject *self, 5837 PyObject *list, 5838 PyUnicodeObject *substring, 5839 Py_ssize_t maxcount) 5840{ 5841 register Py_ssize_t i; 5842 register Py_ssize_t j; 5843 Py_ssize_t len = self->length; 5844 Py_ssize_t sublen = substring->length; 5845 PyObject *str; 5846 5847 for (i = j = 0; i <= len - sublen; ) { 5848 if (Py_UNICODE_MATCH(self, i, substring)) { 5849 if (maxcount-- <= 0) 5850 break; 5851 SPLIT_APPEND(self->str, j, i); 5852 i = j = i + sublen; 5853 } else 5854 i++; 5855 } 5856 if (j <= len) { 5857 SPLIT_APPEND(self->str, j, len); 5858 } 5859 return list; 5860 5861 onError: 5862 Py_DECREF(list); 5863 return NULL; 5864} 5865 5866static 5867PyObject *rsplit_whitespace(PyUnicodeObject *self, 5868 PyObject *list, 5869 Py_ssize_t maxcount) 5870{ 5871 register Py_ssize_t i; 5872 register Py_ssize_t j; 5873 Py_ssize_t len = self->length; 5874 PyObject *str; 5875 register const Py_UNICODE *buf = self->str; 5876 5877 for (i = j = len - 1; i >= 0; ) { 5878 /* find a token */ 5879 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5880 i--; 5881 j = i; 5882 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) 5883 i--; 5884 if (j > i) { 5885 if (maxcount-- <= 0) 5886 break; 5887 SPLIT_APPEND(buf, i + 1, j + 1); 5888 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5889 i--; 5890 j = i; 5891 } 5892 } 5893 if (j >= 0) { 5894 SPLIT_APPEND(buf, 0, j + 1); 5895 } 5896 if (PyList_Reverse(list) < 0) 5897 goto onError; 5898 return list; 5899 5900 onError: 5901 Py_DECREF(list); 5902 return NULL; 5903} 5904 5905static 5906PyObject *rsplit_char(PyUnicodeObject *self, 5907 PyObject *list, 5908 Py_UNICODE ch, 5909 Py_ssize_t maxcount) 5910{ 5911 register Py_ssize_t i; 5912 register Py_ssize_t j; 5913 Py_ssize_t len = self->length; 5914 PyObject *str; 5915 register const Py_UNICODE *buf = self->str; 5916 5917 for (i = j = len - 1; i >= 0; ) { 5918 if (buf[i] == ch) { 5919 if (maxcount-- <= 0) 5920 break; 5921 SPLIT_APPEND(buf, i + 1, j + 1); 5922 j = i = i - 1; 5923 } else 
5924 i--; 5925 } 5926 if (j >= -1) { 5927 SPLIT_APPEND(buf, 0, j + 1); 5928 } 5929 if (PyList_Reverse(list) < 0) 5930 goto onError; 5931 return list; 5932 5933 onError: 5934 Py_DECREF(list); 5935 return NULL; 5936} 5937 5938static 5939PyObject *rsplit_substring(PyUnicodeObject *self, 5940 PyObject *list, 5941 PyUnicodeObject *substring, 5942 Py_ssize_t maxcount) 5943{ 5944 register Py_ssize_t i; 5945 register Py_ssize_t j; 5946 Py_ssize_t len = self->length; 5947 Py_ssize_t sublen = substring->length; 5948 PyObject *str; 5949 5950 for (i = len - sublen, j = len; i >= 0; ) { 5951 if (Py_UNICODE_MATCH(self, i, substring)) { 5952 if (maxcount-- <= 0) 5953 break; 5954 SPLIT_APPEND(self->str, i + sublen, j); 5955 j = i; 5956 i -= sublen; 5957 } else 5958 i--; 5959 } 5960 if (j >= 0) { 5961 SPLIT_APPEND(self->str, 0, j); 5962 } 5963 if (PyList_Reverse(list) < 0) 5964 goto onError; 5965 return list; 5966 5967 onError: 5968 Py_DECREF(list); 5969 return NULL; 5970} 5971 5972#undef SPLIT_APPEND 5973 5974static 5975PyObject *split(PyUnicodeObject *self, 5976 PyUnicodeObject *substring, 5977 Py_ssize_t maxcount) 5978{ 5979 PyObject *list; 5980 5981 if (maxcount < 0) 5982 maxcount = PY_SSIZE_T_MAX; 5983 5984 list = PyList_New(0); 5985 if (!list) 5986 return NULL; 5987 5988 if (substring == NULL) 5989 return split_whitespace(self,list,maxcount); 5990 5991 else if (substring->length == 1) 5992 return split_char(self,list,substring->str[0],maxcount); 5993 5994 else if (substring->length == 0) { 5995 Py_DECREF(list); 5996 PyErr_SetString(PyExc_ValueError, "empty separator"); 5997 return NULL; 5998 } 5999 else 6000 return split_substring(self,list,substring,maxcount); 6001} 6002 6003static 6004PyObject *rsplit(PyUnicodeObject *self, 6005 PyUnicodeObject *substring, 6006 Py_ssize_t maxcount) 6007{ 6008 PyObject *list; 6009 6010 if (maxcount < 0) 6011 maxcount = PY_SSIZE_T_MAX; 6012 6013 list = PyList_New(0); 6014 if (!list) 6015 return NULL; 6016 6017 if (substring == NULL) 6018 
return rsplit_whitespace(self,list,maxcount); 6019 6020 else if (substring->length == 1) 6021 return rsplit_char(self,list,substring->str[0],maxcount); 6022 6023 else if (substring->length == 0) { 6024 Py_DECREF(list); 6025 PyErr_SetString(PyExc_ValueError, "empty separator"); 6026 return NULL; 6027 } 6028 else 6029 return rsplit_substring(self,list,substring,maxcount); 6030} 6031 6032static 6033PyObject *replace(PyUnicodeObject *self, 6034 PyUnicodeObject *str1, 6035 PyUnicodeObject *str2, 6036 Py_ssize_t maxcount) 6037{ 6038 PyUnicodeObject *u; 6039 6040 if (maxcount < 0) 6041 maxcount = PY_SSIZE_T_MAX; 6042 6043 if (str1->length == str2->length) { 6044 /* same length */ 6045 Py_ssize_t i; 6046 if (str1->length == 1) { 6047 /* replace characters */ 6048 Py_UNICODE u1, u2; 6049 if (!findchar(self->str, self->length, str1->str[0])) 6050 goto nothing; 6051 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6052 if (!u) 6053 return NULL; 6054 Py_UNICODE_COPY(u->str, self->str, self->length); 6055 u1 = str1->str[0]; 6056 u2 = str2->str[0]; 6057 for (i = 0; i < u->length; i++) 6058 if (u->str[i] == u1) { 6059 if (--maxcount < 0) 6060 break; 6061 u->str[i] = u2; 6062 } 6063 } else { 6064 i = fastsearch( 6065 self->str, self->length, str1->str, str1->length, FAST_SEARCH 6066 ); 6067 if (i < 0) 6068 goto nothing; 6069 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6070 if (!u) 6071 return NULL; 6072 Py_UNICODE_COPY(u->str, self->str, self->length); 6073 while (i <= self->length - str1->length) 6074 if (Py_UNICODE_MATCH(self, i, str1)) { 6075 if (--maxcount < 0) 6076 break; 6077 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6078 i += str1->length; 6079 } else 6080 i++; 6081 } 6082 } else { 6083 6084 Py_ssize_t n, i, j, e; 6085 Py_ssize_t product, new_size, delta; 6086 Py_UNICODE *p; 6087 6088 /* replace strings */ 6089 n = stringlib_count(self->str, self->length, str1->str, str1->length); 6090 if (n > maxcount) 6091 n = maxcount; 
6092 if (n == 0) 6093 goto nothing; 6094 /* new_size = self->length + n * (str2->length - str1->length)); */ 6095 delta = (str2->length - str1->length); 6096 if (delta == 0) { 6097 new_size = self->length; 6098 } else { 6099 product = n * (str2->length - str1->length); 6100 if ((product / (str2->length - str1->length)) != n) { 6101 PyErr_SetString(PyExc_OverflowError, 6102 "replace string is too long"); 6103 return NULL; 6104 } 6105 new_size = self->length + product; 6106 if (new_size < 0) { 6107 PyErr_SetString(PyExc_OverflowError, 6108 "replace string is too long"); 6109 return NULL; 6110 } 6111 } 6112 u = _PyUnicode_New(new_size); 6113 if (!u) 6114 return NULL; 6115 i = 0; 6116 p = u->str; 6117 e = self->length - str1->length; 6118 if (str1->length > 0) { 6119 while (n-- > 0) { 6120 /* look for next match */ 6121 j = i; 6122 while (j <= e) { 6123 if (Py_UNICODE_MATCH(self, j, str1)) 6124 break; 6125 j++; 6126 } 6127 if (j > i) { 6128 if (j > e) 6129 break; 6130 /* copy unchanged part [i:j] */ 6131 Py_UNICODE_COPY(p, self->str+i, j-i); 6132 p += j - i; 6133 } 6134 /* copy substitution string */ 6135 if (str2->length > 0) { 6136 Py_UNICODE_COPY(p, str2->str, str2->length); 6137 p += str2->length; 6138 } 6139 i = j + str1->length; 6140 } 6141 if (i < self->length) 6142 /* copy tail [i:] */ 6143 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6144 } else { 6145 /* interleave */ 6146 while (n > 0) { 6147 Py_UNICODE_COPY(p, str2->str, str2->length); 6148 p += str2->length; 6149 if (--n <= 0) 6150 break; 6151 *p++ = self->str[i++]; 6152 } 6153 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6154 } 6155 } 6156 return (PyObject *) u; 6157 6158nothing: 6159 /* nothing to replace; return original string (when possible) */ 6160 if (PyUnicode_CheckExact(self)) { 6161 Py_INCREF(self); 6162 return (PyObject *) self; 6163 } 6164 return PyUnicode_FromUnicode(self->str, self->length); 6165} 6166 6167/* --- Unicode Object Methods --------------------------------------------- */ 
6168 6169PyDoc_STRVAR(title__doc__, 6170"S.title() -> unicode\n\ 6171\n\ 6172Return a titlecased version of S, i.e. words start with title case\n\ 6173characters, all remaining cased characters have lower case."); 6174 6175static PyObject* 6176unicode_title(PyUnicodeObject *self) 6177{ 6178 return fixup(self, fixtitle); 6179} 6180 6181PyDoc_STRVAR(capitalize__doc__, 6182"S.capitalize() -> unicode\n\ 6183\n\ 6184Return a capitalized version of S, i.e. make the first character\n\ 6185have upper case."); 6186 6187static PyObject* 6188unicode_capitalize(PyUnicodeObject *self) 6189{ 6190 return fixup(self, fixcapitalize); 6191} 6192 6193#if 0 6194PyDoc_STRVAR(capwords__doc__, 6195"S.capwords() -> unicode\n\ 6196\n\ 6197Apply .capitalize() to all words in S and return the result with\n\ 6198normalized whitespace (all whitespace strings are replaced by ' ')."); 6199 6200static PyObject* 6201unicode_capwords(PyUnicodeObject *self) 6202{ 6203 PyObject *list; 6204 PyObject *item; 6205 Py_ssize_t i; 6206 6207 /* Split into words */ 6208 list = split(self, NULL, -1); 6209 if (!list) 6210 return NULL; 6211 6212 /* Capitalize each word */ 6213 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6214 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6215 fixcapitalize); 6216 if (item == NULL) 6217 goto onError; 6218 Py_DECREF(PyList_GET_ITEM(list, i)); 6219 PyList_SET_ITEM(list, i, item); 6220 } 6221 6222 /* Join the words to form a new string */ 6223 item = PyUnicode_Join(NULL, list); 6224 6225onError: 6226 Py_DECREF(list); 6227 return (PyObject *)item; 6228} 6229#endif 6230 6231/* Argument converter. 
Coerces to a single unicode character */ 6232 6233static int 6234convert_uc(PyObject *obj, void *addr) 6235{ 6236 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6237 PyObject *uniobj; 6238 Py_UNICODE *unistr; 6239 6240 uniobj = PyUnicode_FromObject(obj); 6241 if (uniobj == NULL) { 6242 PyErr_SetString(PyExc_TypeError, 6243 "The fill character cannot be converted to Unicode"); 6244 return 0; 6245 } 6246 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6247 PyErr_SetString(PyExc_TypeError, 6248 "The fill character must be exactly one character long"); 6249 Py_DECREF(uniobj); 6250 return 0; 6251 } 6252 unistr = PyUnicode_AS_UNICODE(uniobj); 6253 *fillcharloc = unistr[0]; 6254 Py_DECREF(uniobj); 6255 return 1; 6256} 6257 6258PyDoc_STRVAR(center__doc__, 6259"S.center(width[, fillchar]) -> unicode\n\ 6260\n\ 6261Return S centered in a Unicode string of length width. Padding is\n\ 6262done using the specified fill character (default is a space)"); 6263 6264static PyObject * 6265unicode_center(PyUnicodeObject *self, PyObject *args) 6266{ 6267 Py_ssize_t marg, left; 6268 Py_ssize_t width; 6269 Py_UNICODE fillchar = ' '; 6270 6271 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6272 return NULL; 6273 6274 if (self->length >= width && PyUnicode_CheckExact(self)) { 6275 Py_INCREF(self); 6276 return (PyObject*) self; 6277 } 6278 6279 marg = width - self->length; 6280 left = marg / 2 + (marg & width & 1); 6281 6282 return (PyObject*) pad(self, left, marg - left, fillchar); 6283} 6284 6285#if 0 6286 6287/* This code should go into some future Unicode collation support 6288 module. The basic comparison should compare ordinals on a naive 6289 basis (this is what Java does and thus JPython too). 
*/ 6290 6291/* speedy UTF-16 code point order comparison */ 6292/* gleaned from: */ 6293/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6294 6295static short utf16Fixup[32] = 6296{ 6297 0, 0, 0, 0, 0, 0, 0, 0, 6298 0, 0, 0, 0, 0, 0, 0, 0, 6299 0, 0, 0, 0, 0, 0, 0, 0, 6300 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6301}; 6302 6303static int 6304unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6305{ 6306 Py_ssize_t len1, len2; 6307 6308 Py_UNICODE *s1 = str1->str; 6309 Py_UNICODE *s2 = str2->str; 6310 6311 len1 = str1->length; 6312 len2 = str2->length; 6313 6314 while (len1 > 0 && len2 > 0) { 6315 Py_UNICODE c1, c2; 6316 6317 c1 = *s1++; 6318 c2 = *s2++; 6319 6320 if (c1 > (1<<11) * 26) 6321 c1 += utf16Fixup[c1>>11]; 6322 if (c2 > (1<<11) * 26) 6323 c2 += utf16Fixup[c2>>11]; 6324 /* now c1 and c2 are in UTF-32-compatible order */ 6325 6326 if (c1 != c2) 6327 return (c1 < c2) ? -1 : 1; 6328 6329 len1--; len2--; 6330 } 6331 6332 return (len1 < len2) ? -1 : (len1 != len2); 6333} 6334 6335#else 6336 6337static int 6338unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6339{ 6340 register Py_ssize_t len1, len2; 6341 6342 Py_UNICODE *s1 = str1->str; 6343 Py_UNICODE *s2 = str2->str; 6344 6345 len1 = str1->length; 6346 len2 = str2->length; 6347 6348 while (len1 > 0 && len2 > 0) { 6349 Py_UNICODE c1, c2; 6350 6351 c1 = *s1++; 6352 c2 = *s2++; 6353 6354 if (c1 != c2) 6355 return (c1 < c2) ? -1 : 1; 6356 6357 len1--; len2--; 6358 } 6359 6360 return (len1 < len2) ? 
-1 : (len1 != len2); 6361} 6362 6363#endif 6364 6365int PyUnicode_Compare(PyObject *left, 6366 PyObject *right) 6367{ 6368 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6369 return unicode_compare((PyUnicodeObject *)left, 6370 (PyUnicodeObject *)right); 6371 PyErr_Format(PyExc_TypeError, 6372 "Can't compare %.100s and %.100s", 6373 left->ob_type->tp_name, 6374 right->ob_type->tp_name); 6375 return -1; 6376} 6377 6378int 6379PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6380{ 6381 int i; 6382 Py_UNICODE *id; 6383 assert(PyUnicode_Check(uni)); 6384 id = PyUnicode_AS_UNICODE(uni); 6385 /* Compare Unicode string and source character set string */ 6386 for (i = 0; id[i] && str[i]; i++) 6387 if (id[i] != str[i]) 6388 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6389 if (id[i]) 6390 return 1; /* uni is longer */ 6391 if (str[i]) 6392 return -1; /* str is longer */ 6393 return 0; 6394} 6395 6396PyObject *PyUnicode_RichCompare(PyObject *left, 6397 PyObject *right, 6398 int op) 6399{ 6400 int result; 6401 6402 result = PyUnicode_Compare(left, right); 6403 if (result == -1 && PyErr_Occurred()) 6404 goto onError; 6405 6406 /* Convert the return value to a Boolean */ 6407 switch (op) { 6408 case Py_EQ: 6409 result = (result == 0); 6410 break; 6411 case Py_NE: 6412 result = (result != 0); 6413 break; 6414 case Py_LE: 6415 result = (result <= 0); 6416 break; 6417 case Py_GE: 6418 result = (result >= 0); 6419 break; 6420 case Py_LT: 6421 result = (result == -1); 6422 break; 6423 case Py_GT: 6424 result = (result == 1); 6425 break; 6426 } 6427 return PyBool_FromLong(result); 6428 6429 onError: 6430 6431 /* Standard case 6432 6433 Type errors mean that PyUnicode_FromObject() could not convert 6434 one of the arguments (usually the right hand side) to Unicode, 6435 ie. we can't handle the comparison request. 
However, it is 6436 possible that the other object knows a comparison method, which 6437 is why we return Py_NotImplemented to give the other object a 6438 chance. 6439 6440 */ 6441 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6442 PyErr_Clear(); 6443 Py_INCREF(Py_NotImplemented); 6444 return Py_NotImplemented; 6445 } 6446 if (op != Py_EQ && op != Py_NE) 6447 return NULL; 6448 6449 /* Equality comparison. 6450 6451 This is a special case: we silence any PyExc_UnicodeDecodeError 6452 and instead turn it into a PyErr_UnicodeWarning. 6453 6454 */ 6455 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6456 return NULL; 6457 PyErr_Clear(); 6458 if (PyErr_WarnEx(PyExc_UnicodeWarning, 6459 (op == Py_EQ) ? 6460 "Unicode equal comparison " 6461 "failed to convert both arguments to Unicode - " 6462 "interpreting them as being unequal" 6463 : 6464 "Unicode unequal comparison " 6465 "failed to convert both arguments to Unicode - " 6466 "interpreting them as being unequal", 6467 1) < 0) 6468 return NULL; 6469 result = (op == Py_NE); 6470 return PyBool_FromLong(result); 6471} 6472 6473int PyUnicode_Contains(PyObject *container, 6474 PyObject *element) 6475{ 6476 PyObject *str, *sub; 6477 int result; 6478 6479 /* Coerce the two arguments */ 6480 sub = PyUnicode_FromObject(element); 6481 if (!sub) { 6482 PyErr_Format(PyExc_TypeError, 6483 "'in <string>' requires string as left operand, not %s", 6484 element->ob_type->tp_name); 6485 return -1; 6486 } 6487 6488 str = PyUnicode_FromObject(container); 6489 if (!str) { 6490 Py_DECREF(sub); 6491 return -1; 6492 } 6493 6494 result = stringlib_contains_obj(str, sub); 6495 6496 Py_DECREF(str); 6497 Py_DECREF(sub); 6498 6499 return result; 6500} 6501 6502/* Concat to string or Unicode object giving a new Unicode object. 
*/ 6503 6504PyObject *PyUnicode_Concat(PyObject *left, 6505 PyObject *right) 6506{ 6507 PyUnicodeObject *u = NULL, *v = NULL, *w; 6508 6509 /* Coerce the two arguments */ 6510 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6511 if (u == NULL) 6512 goto onError; 6513 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6514 if (v == NULL) 6515 goto onError; 6516 6517 /* Shortcuts */ 6518 if (v == unicode_empty) { 6519 Py_DECREF(v); 6520 return (PyObject *)u; 6521 } 6522 if (u == unicode_empty) { 6523 Py_DECREF(u); 6524 return (PyObject *)v; 6525 } 6526 6527 /* Concat the two Unicode strings */ 6528 w = _PyUnicode_New(u->length + v->length); 6529 if (w == NULL) 6530 goto onError; 6531 Py_UNICODE_COPY(w->str, u->str, u->length); 6532 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6533 6534 Py_DECREF(u); 6535 Py_DECREF(v); 6536 return (PyObject *)w; 6537 6538onError: 6539 Py_XDECREF(u); 6540 Py_XDECREF(v); 6541 return NULL; 6542} 6543 6544void 6545PyUnicode_Append(PyObject **pleft, PyObject *right) 6546{ 6547 PyObject *new; 6548 if (*pleft == NULL) 6549 return; 6550 if (right == NULL || !PyUnicode_Check(*pleft)) { 6551 Py_DECREF(*pleft); 6552 *pleft = NULL; 6553 return; 6554 } 6555 new = PyUnicode_Concat(*pleft, right); 6556 Py_DECREF(*pleft); 6557 *pleft = new; 6558} 6559 6560void 6561PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6562{ 6563 PyUnicode_Append(pleft, right); 6564 Py_XDECREF(right); 6565} 6566 6567PyDoc_STRVAR(count__doc__, 6568"S.count(sub[, start[, end]]) -> int\n\ 6569\n\ 6570Return the number of non-overlapping occurrences of substring sub in\n\ 6571Unicode string S[start:end]. 
Optional arguments start and end are\n\ 6572interpreted as in slice notation."); 6573 6574static PyObject * 6575unicode_count(PyUnicodeObject *self, PyObject *args) 6576{ 6577 PyUnicodeObject *substring; 6578 Py_ssize_t start = 0; 6579 Py_ssize_t end = PY_SSIZE_T_MAX; 6580 PyObject *result; 6581 6582 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6584 return NULL; 6585 6586 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6587 (PyObject *)substring); 6588 if (substring == NULL) 6589 return NULL; 6590 6591 FIX_START_END(self); 6592 6593 result = PyLong_FromSsize_t( 6594 stringlib_count(self->str + start, end - start, 6595 substring->str, substring->length) 6596 ); 6597 6598 Py_DECREF(substring); 6599 6600 return result; 6601} 6602 6603PyDoc_STRVAR(encode__doc__, 6604"S.encode([encoding[,errors]]) -> string or unicode\n\ 6605\n\ 6606Encodes S using the codec registered for encoding. encoding defaults\n\ 6607to the default encoding. errors may be given to set a different error\n\ 6608handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6609a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 6610'xmlcharrefreplace' as well as any other name registered with\n\ 6611codecs.register_error that can handle UnicodeEncodeErrors."); 6612 6613static PyObject * 6614unicode_encode(PyUnicodeObject *self, PyObject *args) 6615{ 6616 char *encoding = NULL; 6617 char *errors = NULL; 6618 PyObject *v; 6619 6620 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6621 return NULL; 6622 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 6623 if (v == NULL) 6624 goto onError; 6625 if (!PyBytes_Check(v)) { 6626 PyErr_Format(PyExc_TypeError, 6627 "encoder did not return a bytes object " 6628 "(type=%.400s)", 6629 Py_TYPE(v)->tp_name); 6630 Py_DECREF(v); 6631 return NULL; 6632 } 6633 return v; 6634 6635 onError: 6636 return NULL; 6637} 6638 6639PyDoc_STRVAR(expandtabs__doc__, 6640"S.expandtabs([tabsize]) -> unicode\n\ 6641\n\ 6642Return a copy of S where all tab characters are expanded using spaces.\n\ 6643If tabsize is not given, a tab size of 8 characters is assumed."); 6644 6645static PyObject* 6646unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6647{ 6648 Py_UNICODE *e; 6649 Py_UNICODE *p; 6650 Py_UNICODE *q; 6651 Py_UNICODE *qe; 6652 Py_ssize_t i, j, incr; 6653 PyUnicodeObject *u; 6654 int tabsize = 8; 6655 6656 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6657 return NULL; 6658 6659 /* First pass: determine size of output string */ 6660 i = 0; /* chars up to and including most recent \n or \r */ 6661 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6662 e = self->str + self->length; /* end of input */ 6663 for (p = self->str; p < e; p++) 6664 if (*p == '\t') { 6665 if (tabsize > 0) { 6666 incr = tabsize - (j % tabsize); /* cannot overflow */ 6667 if (j > PY_SSIZE_T_MAX - incr) 6668 goto overflow1; 6669 j += incr; 6670 } 6671 } 6672 else { 6673 if (j > PY_SSIZE_T_MAX - 1) 6674 goto overflow1; 6675 j++; 6676 if (*p == '\n' || *p == '\r') { 6677 if (i 
> PY_SSIZE_T_MAX - j) 6678 goto overflow1; 6679 i += j; 6680 j = 0; 6681 } 6682 } 6683 6684 if (i > PY_SSIZE_T_MAX - j) 6685 goto overflow1; 6686 6687 /* Second pass: create output string and fill it */ 6688 u = _PyUnicode_New(i + j); 6689 if (!u) 6690 return NULL; 6691 6692 j = 0; /* same as in first pass */ 6693 q = u->str; /* next output char */ 6694 qe = u->str + u->length; /* end of output */ 6695 6696 for (p = self->str; p < e; p++) 6697 if (*p == '\t') { 6698 if (tabsize > 0) { 6699 i = tabsize - (j % tabsize); 6700 j += i; 6701 while (i--) { 6702 if (q >= qe) 6703 goto overflow2; 6704 *q++ = ' '; 6705 } 6706 } 6707 } 6708 else { 6709 if (q >= qe) 6710 goto overflow2; 6711 *q++ = *p; 6712 j++; 6713 if (*p == '\n' || *p == '\r') 6714 j = 0; 6715 } 6716 6717 return (PyObject*) u; 6718 6719 overflow2: 6720 Py_DECREF(u); 6721 overflow1: 6722 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6723 return NULL; 6724} 6725 6726PyDoc_STRVAR(find__doc__, 6727"S.find(sub [,start [,end]]) -> int\n\ 6728\n\ 6729Return the lowest index in S where substring sub is found,\n\ 6730such that sub is contained within s[start:end]. 
Optional\n\ 6731arguments start and end are interpreted as in slice notation.\n\ 6732\n\ 6733Return -1 on failure."); 6734 6735static PyObject * 6736unicode_find(PyUnicodeObject *self, PyObject *args) 6737{ 6738 PyObject *substring; 6739 Py_ssize_t start; 6740 Py_ssize_t end; 6741 Py_ssize_t result; 6742 6743 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6744 return NULL; 6745 6746 result = stringlib_find_slice( 6747 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6748 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6749 start, end 6750 ); 6751 6752 Py_DECREF(substring); 6753 6754 return PyLong_FromSsize_t(result); 6755} 6756 6757static PyObject * 6758unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6759{ 6760 if (index < 0 || index >= self->length) { 6761 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6762 return NULL; 6763 } 6764 6765 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6766} 6767 6768/* Believe it or not, this produces the same value for ASCII strings 6769 as string_hash(). 
*/ 6770static long 6771unicode_hash(PyUnicodeObject *self) 6772{ 6773 Py_ssize_t len; 6774 Py_UNICODE *p; 6775 long x; 6776 6777 if (self->hash != -1) 6778 return self->hash; 6779 len = Py_SIZE(self); 6780 p = self->str; 6781 x = *p << 7; 6782 while (--len >= 0) 6783 x = (1000003*x) ^ *p++; 6784 x ^= Py_SIZE(self); 6785 if (x == -1) 6786 x = -2; 6787 self->hash = x; 6788 return x; 6789} 6790 6791PyDoc_STRVAR(index__doc__, 6792"S.index(sub [,start [,end]]) -> int\n\ 6793\n\ 6794Like S.find() but raise ValueError when the substring is not found."); 6795 6796static PyObject * 6797unicode_index(PyUnicodeObject *self, PyObject *args) 6798{ 6799 Py_ssize_t result; 6800 PyObject *substring; 6801 Py_ssize_t start; 6802 Py_ssize_t end; 6803 6804 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6805 return NULL; 6806 6807 result = stringlib_find_slice( 6808 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6809 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6810 start, end 6811 ); 6812 6813 Py_DECREF(substring); 6814 6815 if (result < 0) { 6816 PyErr_SetString(PyExc_ValueError, "substring not found"); 6817 return NULL; 6818 } 6819 6820 return PyLong_FromSsize_t(result); 6821} 6822 6823PyDoc_STRVAR(islower__doc__, 6824"S.islower() -> bool\n\ 6825\n\ 6826Return True if all cased characters in S are lowercase and there is\n\ 6827at least one cased character in S, False otherwise."); 6828 6829static PyObject* 6830unicode_islower(PyUnicodeObject *self) 6831{ 6832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6833 register const Py_UNICODE *e; 6834 int cased; 6835 6836 /* Shortcut for single character strings */ 6837 if (PyUnicode_GET_SIZE(self) == 1) 6838 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6839 6840 /* Special case for empty strings */ 6841 if (PyUnicode_GET_SIZE(self) == 0) 6842 return PyBool_FromLong(0); 6843 6844 e = p + PyUnicode_GET_SIZE(self); 6845 cased = 0; 6846 for (; p < e; p++) { 6847 register const Py_UNICODE ch = 
*p; 6848 6849 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6850 return PyBool_FromLong(0); 6851 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6852 cased = 1; 6853 } 6854 return PyBool_FromLong(cased); 6855} 6856 6857PyDoc_STRVAR(isupper__doc__, 6858"S.isupper() -> bool\n\ 6859\n\ 6860Return True if all cased characters in S are uppercase and there is\n\ 6861at least one cased character in S, False otherwise."); 6862 6863static PyObject* 6864unicode_isupper(PyUnicodeObject *self) 6865{ 6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6867 register const Py_UNICODE *e; 6868 int cased; 6869 6870 /* Shortcut for single character strings */ 6871 if (PyUnicode_GET_SIZE(self) == 1) 6872 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6873 6874 /* Special case for empty strings */ 6875 if (PyUnicode_GET_SIZE(self) == 0) 6876 return PyBool_FromLong(0); 6877 6878 e = p + PyUnicode_GET_SIZE(self); 6879 cased = 0; 6880 for (; p < e; p++) { 6881 register const Py_UNICODE ch = *p; 6882 6883 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6884 return PyBool_FromLong(0); 6885 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6886 cased = 1; 6887 } 6888 return PyBool_FromLong(cased); 6889} 6890 6891PyDoc_STRVAR(istitle__doc__, 6892"S.istitle() -> bool\n\ 6893\n\ 6894Return True if S is a titlecased string and there is at least one\n\ 6895character in S, i.e. 
upper- and titlecase characters may only\n\ 6896follow uncased characters and lowercase characters only cased ones.\n\ 6897Return False otherwise."); 6898 6899static PyObject* 6900unicode_istitle(PyUnicodeObject *self) 6901{ 6902 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6903 register const Py_UNICODE *e; 6904 int cased, previous_is_cased; 6905 6906 /* Shortcut for single character strings */ 6907 if (PyUnicode_GET_SIZE(self) == 1) 6908 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6909 (Py_UNICODE_ISUPPER(*p) != 0)); 6910 6911 /* Special case for empty strings */ 6912 if (PyUnicode_GET_SIZE(self) == 0) 6913 return PyBool_FromLong(0); 6914 6915 e = p + PyUnicode_GET_SIZE(self); 6916 cased = 0; 6917 previous_is_cased = 0; 6918 for (; p < e; p++) { 6919 register const Py_UNICODE ch = *p; 6920 6921 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6922 if (previous_is_cased) 6923 return PyBool_FromLong(0); 6924 previous_is_cased = 1; 6925 cased = 1; 6926 } 6927 else if (Py_UNICODE_ISLOWER(ch)) { 6928 if (!previous_is_cased) 6929 return PyBool_FromLong(0); 6930 previous_is_cased = 1; 6931 cased = 1; 6932 } 6933 else 6934 previous_is_cased = 0; 6935 } 6936 return PyBool_FromLong(cased); 6937} 6938 6939PyDoc_STRVAR(isspace__doc__, 6940"S.isspace() -> bool\n\ 6941\n\ 6942Return True if all characters in S are whitespace\n\ 6943and there is at least one character in S, False otherwise."); 6944 6945static PyObject* 6946unicode_isspace(PyUnicodeObject *self) 6947{ 6948 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6949 register const Py_UNICODE *e; 6950 6951 /* Shortcut for single character strings */ 6952 if (PyUnicode_GET_SIZE(self) == 1 && 6953 Py_UNICODE_ISSPACE(*p)) 6954 return PyBool_FromLong(1); 6955 6956 /* Special case for empty strings */ 6957 if (PyUnicode_GET_SIZE(self) == 0) 6958 return PyBool_FromLong(0); 6959 6960 e = p + PyUnicode_GET_SIZE(self); 6961 for (; p < e; p++) { 6962 if (!Py_UNICODE_ISSPACE(*p)) 6963 
return PyBool_FromLong(0); 6964 } 6965 return PyBool_FromLong(1); 6966} 6967 6968PyDoc_STRVAR(isalpha__doc__, 6969"S.isalpha() -> bool\n\ 6970\n\ 6971Return True if all characters in S are alphabetic\n\ 6972and there is at least one character in S, False otherwise."); 6973 6974static PyObject* 6975unicode_isalpha(PyUnicodeObject *self) 6976{ 6977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6978 register const Py_UNICODE *e; 6979 6980 /* Shortcut for single character strings */ 6981 if (PyUnicode_GET_SIZE(self) == 1 && 6982 Py_UNICODE_ISALPHA(*p)) 6983 return PyBool_FromLong(1); 6984 6985 /* Special case for empty strings */ 6986 if (PyUnicode_GET_SIZE(self) == 0) 6987 return PyBool_FromLong(0); 6988 6989 e = p + PyUnicode_GET_SIZE(self); 6990 for (; p < e; p++) { 6991 if (!Py_UNICODE_ISALPHA(*p)) 6992 return PyBool_FromLong(0); 6993 } 6994 return PyBool_FromLong(1); 6995} 6996 6997PyDoc_STRVAR(isalnum__doc__, 6998"S.isalnum() -> bool\n\ 6999\n\ 7000Return True if all characters in S are alphanumeric\n\ 7001and there is at least one character in S, False otherwise."); 7002 7003static PyObject* 7004unicode_isalnum(PyUnicodeObject *self) 7005{ 7006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7007 register const Py_UNICODE *e; 7008 7009 /* Shortcut for single character strings */ 7010 if (PyUnicode_GET_SIZE(self) == 1 && 7011 Py_UNICODE_ISALNUM(*p)) 7012 return PyBool_FromLong(1); 7013 7014 /* Special case for empty strings */ 7015 if (PyUnicode_GET_SIZE(self) == 0) 7016 return PyBool_FromLong(0); 7017 7018 e = p + PyUnicode_GET_SIZE(self); 7019 for (; p < e; p++) { 7020 if (!Py_UNICODE_ISALNUM(*p)) 7021 return PyBool_FromLong(0); 7022 } 7023 return PyBool_FromLong(1); 7024} 7025 7026PyDoc_STRVAR(isdecimal__doc__, 7027"S.isdecimal() -> bool\n\ 7028\n\ 7029Return True if there are only decimal characters in S,\n\ 7030False otherwise."); 7031 7032static PyObject* 7033unicode_isdecimal(PyUnicodeObject *self) 7034{ 7035 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7036 register const Py_UNICODE *e; 7037 7038 /* Shortcut for single character strings */ 7039 if (PyUnicode_GET_SIZE(self) == 1 && 7040 Py_UNICODE_ISDECIMAL(*p)) 7041 return PyBool_FromLong(1); 7042 7043 /* Special case for empty strings */ 7044 if (PyUnicode_GET_SIZE(self) == 0) 7045 return PyBool_FromLong(0); 7046 7047 e = p + PyUnicode_GET_SIZE(self); 7048 for (; p < e; p++) { 7049 if (!Py_UNICODE_ISDECIMAL(*p)) 7050 return PyBool_FromLong(0); 7051 } 7052 return PyBool_FromLong(1); 7053} 7054 7055PyDoc_STRVAR(isdigit__doc__, 7056"S.isdigit() -> bool\n\ 7057\n\ 7058Return True if all characters in S are digits\n\ 7059and there is at least one character in S, False otherwise."); 7060 7061static PyObject* 7062unicode_isdigit(PyUnicodeObject *self) 7063{ 7064 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7065 register const Py_UNICODE *e; 7066 7067 /* Shortcut for single character strings */ 7068 if (PyUnicode_GET_SIZE(self) == 1 && 7069 Py_UNICODE_ISDIGIT(*p)) 7070 return PyBool_FromLong(1); 7071 7072 /* Special case for empty strings */ 7073 if (PyUnicode_GET_SIZE(self) == 0) 7074 return PyBool_FromLong(0); 7075 7076 e = p + PyUnicode_GET_SIZE(self); 7077 for (; p < e; p++) { 7078 if (!Py_UNICODE_ISDIGIT(*p)) 7079 return PyBool_FromLong(0); 7080 } 7081 return PyBool_FromLong(1); 7082} 7083 7084PyDoc_STRVAR(isnumeric__doc__, 7085"S.isnumeric() -> bool\n\ 7086\n\ 7087Return True if there are only numeric characters in S,\n\ 7088False otherwise."); 7089 7090static PyObject* 7091unicode_isnumeric(PyUnicodeObject *self) 7092{ 7093 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7094 register const Py_UNICODE *e; 7095 7096 /* Shortcut for single character strings */ 7097 if (PyUnicode_GET_SIZE(self) == 1 && 7098 Py_UNICODE_ISNUMERIC(*p)) 7099 return PyBool_FromLong(1); 7100 7101 /* Special case for empty strings */ 7102 if (PyUnicode_GET_SIZE(self) == 0) 7103 return PyBool_FromLong(0); 7104 7105 e = p + 
PyUnicode_GET_SIZE(self); 7106 for (; p < e; p++) { 7107 if (!Py_UNICODE_ISNUMERIC(*p)) 7108 return PyBool_FromLong(0); 7109 } 7110 return PyBool_FromLong(1); 7111} 7112 7113int 7114PyUnicode_IsIdentifier(PyObject *self) 7115{ 7116 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7117 register const Py_UNICODE *e; 7118 7119 /* Special case for empty strings */ 7120 if (PyUnicode_GET_SIZE(self) == 0) 7121 return 0; 7122 7123 /* PEP 3131 says that the first character must be in 7124 XID_Start and subsequent characters in XID_Continue, 7125 and for the ASCII range, the 2.x rules apply (i.e 7126 start with letters and underscore, continue with 7127 letters, digits, underscore). However, given the current 7128 definition of XID_Start and XID_Continue, it is sufficient 7129 to check just for these, except that _ must be allowed 7130 as starting an identifier. */ 7131 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7132 return 0; 7133 7134 e = p + PyUnicode_GET_SIZE(self); 7135 for (p++; p < e; p++) { 7136 if (!_PyUnicode_IsXidContinue(*p)) 7137 return 0; 7138 } 7139 return 1; 7140} 7141 7142PyDoc_STRVAR(isidentifier__doc__, 7143"S.isidentifier() -> bool\n\ 7144\n\ 7145Return True if S is a valid identifier according\n\ 7146to the language definition."); 7147 7148static PyObject* 7149unicode_isidentifier(PyObject *self) 7150{ 7151 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7152} 7153 7154PyDoc_STRVAR(join__doc__, 7155"S.join(sequence) -> unicode\n\ 7156\n\ 7157Return a string which is the concatenation of the strings in the\n\ 7158sequence. 
The separator between elements is S."); 7159 7160static PyObject* 7161unicode_join(PyObject *self, PyObject *data) 7162{ 7163 return PyUnicode_Join(self, data); 7164} 7165 7166static Py_ssize_t 7167unicode_length(PyUnicodeObject *self) 7168{ 7169 return self->length; 7170} 7171 7172PyDoc_STRVAR(ljust__doc__, 7173"S.ljust(width[, fillchar]) -> int\n\ 7174\n\ 7175Return S left justified in a Unicode string of length width. Padding is\n\ 7176done using the specified fill character (default is a space)."); 7177 7178static PyObject * 7179unicode_ljust(PyUnicodeObject *self, PyObject *args) 7180{ 7181 Py_ssize_t width; 7182 Py_UNICODE fillchar = ' '; 7183 7184 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7185 return NULL; 7186 7187 if (self->length >= width && PyUnicode_CheckExact(self)) { 7188 Py_INCREF(self); 7189 return (PyObject*) self; 7190 } 7191 7192 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7193} 7194 7195PyDoc_STRVAR(lower__doc__, 7196"S.lower() -> unicode\n\ 7197\n\ 7198Return a copy of the string S converted to lowercase."); 7199 7200static PyObject* 7201unicode_lower(PyUnicodeObject *self) 7202{ 7203 return fixup(self, fixlower); 7204} 7205 7206#define LEFTSTRIP 0 7207#define RIGHTSTRIP 1 7208#define BOTHSTRIP 2 7209 7210/* Arrays indexed by above */ 7211static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7212 7213#define STRIPNAME(i) (stripformat[i]+3) 7214 7215/* externally visible for str.strip(unicode) */ 7216PyObject * 7217_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7218{ 7219 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7220 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7221 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7222 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7223 Py_ssize_t i, j; 7224 7225 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7226 7227 i = 0; 7228 if (striptype != RIGHTSTRIP) { 7229 while (i < len && BLOOM_MEMBER(sepmask, 
s[i], sep, seplen)) { 7230 i++; 7231 } 7232 } 7233 7234 j = len; 7235 if (striptype != LEFTSTRIP) { 7236 do { 7237 j--; 7238 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7239 j++; 7240 } 7241 7242 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7243 Py_INCREF(self); 7244 return (PyObject*)self; 7245 } 7246 else 7247 return PyUnicode_FromUnicode(s+i, j-i); 7248} 7249 7250 7251static PyObject * 7252do_strip(PyUnicodeObject *self, int striptype) 7253{ 7254 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7255 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7256 7257 i = 0; 7258 if (striptype != RIGHTSTRIP) { 7259 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7260 i++; 7261 } 7262 } 7263 7264 j = len; 7265 if (striptype != LEFTSTRIP) { 7266 do { 7267 j--; 7268 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7269 j++; 7270 } 7271 7272 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7273 Py_INCREF(self); 7274 return (PyObject*)self; 7275 } 7276 else 7277 return PyUnicode_FromUnicode(s+i, j-i); 7278} 7279 7280 7281static PyObject * 7282do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7283{ 7284 PyObject *sep = NULL; 7285 7286 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7287 return NULL; 7288 7289 if (sep != NULL && sep != Py_None) { 7290 if (PyUnicode_Check(sep)) 7291 return _PyUnicode_XStrip(self, striptype, sep); 7292 else { 7293 PyErr_Format(PyExc_TypeError, 7294 "%s arg must be None, unicode or str", 7295 STRIPNAME(striptype)); 7296 return NULL; 7297 } 7298 } 7299 7300 return do_strip(self, striptype); 7301} 7302 7303 7304PyDoc_STRVAR(strip__doc__, 7305"S.strip([chars]) -> unicode\n\ 7306\n\ 7307Return a copy of the string S with leading and trailing\n\ 7308whitespace removed.\n\ 7309If chars is given and not None, remove characters in chars instead.\n\ 7310If chars is a str, it will be converted to unicode before stripping"); 7311 7312static PyObject * 7313unicode_strip(PyUnicodeObject *self, PyObject 
*args) 7314{ 7315 if (PyTuple_GET_SIZE(args) == 0) 7316 return do_strip(self, BOTHSTRIP); /* Common case */ 7317 else 7318 return do_argstrip(self, BOTHSTRIP, args); 7319} 7320 7321 7322PyDoc_STRVAR(lstrip__doc__, 7323"S.lstrip([chars]) -> unicode\n\ 7324\n\ 7325Return a copy of the string S with leading whitespace removed.\n\ 7326If chars is given and not None, remove characters in chars instead.\n\ 7327If chars is a str, it will be converted to unicode before stripping"); 7328 7329static PyObject * 7330unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7331{ 7332 if (PyTuple_GET_SIZE(args) == 0) 7333 return do_strip(self, LEFTSTRIP); /* Common case */ 7334 else 7335 return do_argstrip(self, LEFTSTRIP, args); 7336} 7337 7338 7339PyDoc_STRVAR(rstrip__doc__, 7340"S.rstrip([chars]) -> unicode\n\ 7341\n\ 7342Return a copy of the string S with trailing whitespace removed.\n\ 7343If chars is given and not None, remove characters in chars instead.\n\ 7344If chars is a str, it will be converted to unicode before stripping"); 7345 7346static PyObject * 7347unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7348{ 7349 if (PyTuple_GET_SIZE(args) == 0) 7350 return do_strip(self, RIGHTSTRIP); /* Common case */ 7351 else 7352 return do_argstrip(self, RIGHTSTRIP, args); 7353} 7354 7355 7356static PyObject* 7357unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7358{ 7359 PyUnicodeObject *u; 7360 Py_UNICODE *p; 7361 Py_ssize_t nchars; 7362 size_t nbytes; 7363 7364 if (len < 0) 7365 len = 0; 7366 7367 if (len == 1 && PyUnicode_CheckExact(str)) { 7368 /* no repeat, return original string */ 7369 Py_INCREF(str); 7370 return (PyObject*) str; 7371 } 7372 7373 /* ensure # of chars needed doesn't overflow int and # of bytes 7374 * needed doesn't overflow size_t 7375 */ 7376 nchars = len * str->length; 7377 if (len && nchars / len != str->length) { 7378 PyErr_SetString(PyExc_OverflowError, 7379 "repeated string is too long"); 7380 return NULL; 7381 } 7382 nbytes = (nchars + 1) * 
sizeof(Py_UNICODE); 7383 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7384 PyErr_SetString(PyExc_OverflowError, 7385 "repeated string is too long"); 7386 return NULL; 7387 } 7388 u = _PyUnicode_New(nchars); 7389 if (!u) 7390 return NULL; 7391 7392 p = u->str; 7393 7394 if (str->length == 1 && len > 0) { 7395 Py_UNICODE_FILL(p, str->str[0], len); 7396 } else { 7397 Py_ssize_t done = 0; /* number of characters copied this far */ 7398 if (done < nchars) { 7399 Py_UNICODE_COPY(p, str->str, str->length); 7400 done = str->length; 7401 } 7402 while (done < nchars) { 7403 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 7404 Py_UNICODE_COPY(p+done, p, n); 7405 done += n; 7406 } 7407 } 7408 7409 return (PyObject*) u; 7410} 7411 7412PyObject *PyUnicode_Replace(PyObject *obj, 7413 PyObject *subobj, 7414 PyObject *replobj, 7415 Py_ssize_t maxcount) 7416{ 7417 PyObject *self; 7418 PyObject *str1; 7419 PyObject *str2; 7420 PyObject *result; 7421 7422 self = PyUnicode_FromObject(obj); 7423 if (self == NULL) 7424 return NULL; 7425 str1 = PyUnicode_FromObject(subobj); 7426 if (str1 == NULL) { 7427 Py_DECREF(self); 7428 return NULL; 7429 } 7430 str2 = PyUnicode_FromObject(replobj); 7431 if (str2 == NULL) { 7432 Py_DECREF(self); 7433 Py_DECREF(str1); 7434 return NULL; 7435 } 7436 result = replace((PyUnicodeObject *)self, 7437 (PyUnicodeObject *)str1, 7438 (PyUnicodeObject *)str2, 7439 maxcount); 7440 Py_DECREF(self); 7441 Py_DECREF(str1); 7442 Py_DECREF(str2); 7443 return result; 7444} 7445 7446PyDoc_STRVAR(replace__doc__, 7447"S.replace (old, new[, maxsplit]) -> unicode\n\ 7448\n\ 7449Return a copy of S with all occurrences of substring\n\ 7450old replaced by new. 
If the optional argument maxsplit is\n\ 7451given, only the first maxsplit occurrences are replaced."); 7452 7453static PyObject* 7454unicode_replace(PyUnicodeObject *self, PyObject *args) 7455{ 7456 PyUnicodeObject *str1; 7457 PyUnicodeObject *str2; 7458 Py_ssize_t maxcount = -1; 7459 PyObject *result; 7460 7461 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7462 return NULL; 7463 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7464 if (str1 == NULL) 7465 return NULL; 7466 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7467 if (str2 == NULL) { 7468 Py_DECREF(str1); 7469 return NULL; 7470 } 7471 7472 result = replace(self, str1, str2, maxcount); 7473 7474 Py_DECREF(str1); 7475 Py_DECREF(str2); 7476 return result; 7477} 7478 7479static 7480PyObject *unicode_repr(PyObject *unicode) 7481{ 7482 PyObject *repr; 7483 Py_UNICODE *p; 7484 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7485 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7486 7487 /* XXX(nnorwitz): rather than over-allocating, it would be 7488 better to choose a different scheme. Perhaps scan the 7489 first N-chars of the string and allocate based on that size. 7490 */ 7491 /* Initial allocation is based on the longest-possible unichr 7492 escape. 7493 7494 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7495 unichr, so in this case it's the longest unichr escape. In 7496 narrow (UTF-16) builds this is five chars per source unichr 7497 since there are two unichrs in the surrogate pair, so in narrow 7498 (UTF-16) builds it's not the longest unichr escape. 7499 7500 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7501 so in the narrow (UTF-16) build case it's the longest unichr 7502 escape. 
7503 */ 7504 7505 repr = PyUnicode_FromUnicode(NULL, 7506 2 /* quotes */ 7507#ifdef Py_UNICODE_WIDE 7508 + 10*size 7509#else 7510 + 6*size 7511#endif 7512 + 1); 7513 if (repr == NULL) 7514 return NULL; 7515 7516 p = PyUnicode_AS_UNICODE(repr); 7517 7518 /* Add quote */ 7519 *p++ = (findchar(s, size, '\'') && 7520 !findchar(s, size, '"')) ? '"' : '\''; 7521 while (size-- > 0) { 7522 Py_UNICODE ch = *s++; 7523 7524 /* Escape quotes and backslashes */ 7525 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7526 *p++ = '\\'; 7527 *p++ = ch; 7528 continue; 7529 } 7530 7531#ifdef Py_UNICODE_WIDE 7532 /* Map 21-bit characters to '\U00xxxxxx' */ 7533 else if (ch >= 0x10000) { 7534 *p++ = '\\'; 7535 *p++ = 'U'; 7536 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 7537 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 7538 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 7539 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 7540 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 7541 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 7542 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 7543 *p++ = hexdigits[ch & 0x0000000F]; 7544 continue; 7545 } 7546#else 7547 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 7548 else if (ch >= 0xD800 && ch < 0xDC00) { 7549 Py_UNICODE ch2; 7550 Py_UCS4 ucs; 7551 7552 ch2 = *s++; 7553 size--; 7554 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 7555 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 7556 *p++ = '\\'; 7557 *p++ = 'U'; 7558 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7559 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7560 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7561 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7562 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7563 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7564 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7565 *p++ = hexdigits[ucs & 0x0000000F]; 7566 continue; 7567 } 7568 /* Fall through: isolated surrogates are copied as-is */ 7569 s--; 7570 size++; 7571 } 7572#endif 7573 7574 /* Map 16-bit characters to '\uxxxx' 
*/ 7575 if (ch >= 256) { 7576 *p++ = '\\'; 7577 *p++ = 'u'; 7578 *p++ = hexdigits[(ch >> 12) & 0x000F]; 7579 *p++ = hexdigits[(ch >> 8) & 0x000F]; 7580 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7581 *p++ = hexdigits[ch & 0x000F]; 7582 } 7583 7584 /* Map special whitespace to '\t', \n', '\r' */ 7585 else if (ch == '\t') { 7586 *p++ = '\\'; 7587 *p++ = 't'; 7588 } 7589 else if (ch == '\n') { 7590 *p++ = '\\'; 7591 *p++ = 'n'; 7592 } 7593 else if (ch == '\r') { 7594 *p++ = '\\'; 7595 *p++ = 'r'; 7596 } 7597 7598 /* Map non-printable US ASCII to '\xhh' */ 7599 else if (ch < ' ' || ch >= 0x7F) { 7600 *p++ = '\\'; 7601 *p++ = 'x'; 7602 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7603 *p++ = hexdigits[ch & 0x000F]; 7604 } 7605 7606 /* Copy everything else as-is */ 7607 else 7608 *p++ = (char) ch; 7609 } 7610 /* Add quote */ 7611 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7612 7613 *p = '\0'; 7614 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7615 return repr; 7616} 7617 7618PyDoc_STRVAR(rfind__doc__, 7619"S.rfind(sub [,start [,end]]) -> int\n\ 7620\n\ 7621Return the highest index in S where substring sub is found,\n\ 7622such that sub is contained within s[start:end]. 
Optional\n\ 7623arguments start and end are interpreted as in slice notation.\n\ 7624\n\ 7625Return -1 on failure."); 7626 7627static PyObject * 7628unicode_rfind(PyUnicodeObject *self, PyObject *args) 7629{ 7630 PyObject *substring; 7631 Py_ssize_t start; 7632 Py_ssize_t end; 7633 Py_ssize_t result; 7634 7635 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7636 return NULL; 7637 7638 result = stringlib_rfind_slice( 7639 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7640 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7641 start, end 7642 ); 7643 7644 Py_DECREF(substring); 7645 7646 return PyLong_FromSsize_t(result); 7647} 7648 7649PyDoc_STRVAR(rindex__doc__, 7650"S.rindex(sub [,start [,end]]) -> int\n\ 7651\n\ 7652Like S.rfind() but raise ValueError when the substring is not found."); 7653 7654static PyObject * 7655unicode_rindex(PyUnicodeObject *self, PyObject *args) 7656{ 7657 PyObject *substring; 7658 Py_ssize_t start; 7659 Py_ssize_t end; 7660 Py_ssize_t result; 7661 7662 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7663 return NULL; 7664 7665 result = stringlib_rfind_slice( 7666 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7667 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7668 start, end 7669 ); 7670 7671 Py_DECREF(substring); 7672 7673 if (result < 0) { 7674 PyErr_SetString(PyExc_ValueError, "substring not found"); 7675 return NULL; 7676 } 7677 return PyLong_FromSsize_t(result); 7678} 7679 7680PyDoc_STRVAR(rjust__doc__, 7681"S.rjust(width[, fillchar]) -> unicode\n\ 7682\n\ 7683Return S right justified in a Unicode string of length width. 
Padding is\n\ 7684done using the specified fill character (default is a space)."); 7685 7686static PyObject * 7687unicode_rjust(PyUnicodeObject *self, PyObject *args) 7688{ 7689 Py_ssize_t width; 7690 Py_UNICODE fillchar = ' '; 7691 7692 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7693 return NULL; 7694 7695 if (self->length >= width && PyUnicode_CheckExact(self)) { 7696 Py_INCREF(self); 7697 return (PyObject*) self; 7698 } 7699 7700 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7701} 7702 7703PyObject *PyUnicode_Split(PyObject *s, 7704 PyObject *sep, 7705 Py_ssize_t maxsplit) 7706{ 7707 PyObject *result; 7708 7709 s = PyUnicode_FromObject(s); 7710 if (s == NULL) 7711 return NULL; 7712 if (sep != NULL) { 7713 sep = PyUnicode_FromObject(sep); 7714 if (sep == NULL) { 7715 Py_DECREF(s); 7716 return NULL; 7717 } 7718 } 7719 7720 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7721 7722 Py_DECREF(s); 7723 Py_XDECREF(sep); 7724 return result; 7725} 7726 7727PyDoc_STRVAR(split__doc__, 7728"S.split([sep [,maxsplit]]) -> list of strings\n\ 7729\n\ 7730Return a list of the words in S, using sep as the\n\ 7731delimiter string. If maxsplit is given, at most maxsplit\n\ 7732splits are done. 
If sep is not specified or is None, any\n\ 7733whitespace string is a separator and empty strings are\n\ 7734removed from the result."); 7735 7736static PyObject* 7737unicode_split(PyUnicodeObject *self, PyObject *args) 7738{ 7739 PyObject *substring = Py_None; 7740 Py_ssize_t maxcount = -1; 7741 7742 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7743 return NULL; 7744 7745 if (substring == Py_None) 7746 return split(self, NULL, maxcount); 7747 else if (PyUnicode_Check(substring)) 7748 return split(self, (PyUnicodeObject *)substring, maxcount); 7749 else 7750 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7751} 7752 7753PyObject * 7754PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7755{ 7756 PyObject* str_obj; 7757 PyObject* sep_obj; 7758 PyObject* out; 7759 7760 str_obj = PyUnicode_FromObject(str_in); 7761 if (!str_obj) 7762 return NULL; 7763 sep_obj = PyUnicode_FromObject(sep_in); 7764 if (!sep_obj) { 7765 Py_DECREF(str_obj); 7766 return NULL; 7767 } 7768 7769 out = stringlib_partition( 7770 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7771 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7772 ); 7773 7774 Py_DECREF(sep_obj); 7775 Py_DECREF(str_obj); 7776 7777 return out; 7778} 7779 7780 7781PyObject * 7782PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7783{ 7784 PyObject* str_obj; 7785 PyObject* sep_obj; 7786 PyObject* out; 7787 7788 str_obj = PyUnicode_FromObject(str_in); 7789 if (!str_obj) 7790 return NULL; 7791 sep_obj = PyUnicode_FromObject(sep_in); 7792 if (!sep_obj) { 7793 Py_DECREF(str_obj); 7794 return NULL; 7795 } 7796 7797 out = stringlib_rpartition( 7798 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7799 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7800 ); 7801 7802 Py_DECREF(sep_obj); 7803 Py_DECREF(str_obj); 7804 7805 return out; 7806} 7807 7808PyDoc_STRVAR(partition__doc__, 7809"S.partition(sep) -> (head, 
sep, tail)\n\ 7810\n\ 7811Searches for the separator sep in S, and returns the part before it,\n\ 7812the separator itself, and the part after it. If the separator is not\n\ 7813found, returns S and two empty strings."); 7814 7815static PyObject* 7816unicode_partition(PyUnicodeObject *self, PyObject *separator) 7817{ 7818 return PyUnicode_Partition((PyObject *)self, separator); 7819} 7820 7821PyDoc_STRVAR(rpartition__doc__, 7822"S.rpartition(sep) -> (tail, sep, head)\n\ 7823\n\ 7824Searches for the separator sep in S, starting at the end of S, and returns\n\ 7825the part before it, the separator itself, and the part after it. If the\n\ 7826separator is not found, returns two empty strings and S."); 7827 7828static PyObject* 7829unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7830{ 7831 return PyUnicode_RPartition((PyObject *)self, separator); 7832} 7833 7834PyObject *PyUnicode_RSplit(PyObject *s, 7835 PyObject *sep, 7836 Py_ssize_t maxsplit) 7837{ 7838 PyObject *result; 7839 7840 s = PyUnicode_FromObject(s); 7841 if (s == NULL) 7842 return NULL; 7843 if (sep != NULL) { 7844 sep = PyUnicode_FromObject(sep); 7845 if (sep == NULL) { 7846 Py_DECREF(s); 7847 return NULL; 7848 } 7849 } 7850 7851 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7852 7853 Py_DECREF(s); 7854 Py_XDECREF(sep); 7855 return result; 7856} 7857 7858PyDoc_STRVAR(rsplit__doc__, 7859"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 7860\n\ 7861Return a list of the words in S, using sep as the\n\ 7862delimiter string, starting at the end of the string and\n\ 7863working to the front. If maxsplit is given, at most maxsplit\n\ 7864splits are done. 
If sep is not specified, any whitespace string\n\ 7865is a separator."); 7866 7867static PyObject* 7868unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7869{ 7870 PyObject *substring = Py_None; 7871 Py_ssize_t maxcount = -1; 7872 7873 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7874 return NULL; 7875 7876 if (substring == Py_None) 7877 return rsplit(self, NULL, maxcount); 7878 else if (PyUnicode_Check(substring)) 7879 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7880 else 7881 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7882} 7883 7884PyDoc_STRVAR(splitlines__doc__, 7885"S.splitlines([keepends]]) -> list of strings\n\ 7886\n\ 7887Return a list of the lines in S, breaking at line boundaries.\n\ 7888Line breaks are not included in the resulting list unless keepends\n\ 7889is given and true."); 7890 7891static PyObject* 7892unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7893{ 7894 int keepends = 0; 7895 7896 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7897 return NULL; 7898 7899 return PyUnicode_Splitlines((PyObject *)self, keepends); 7900} 7901 7902static 7903PyObject *unicode_str(PyObject *self) 7904{ 7905 if (PyUnicode_CheckExact(self)) { 7906 Py_INCREF(self); 7907 return self; 7908 } else 7909 /* Subtype -- return genuine unicode string with the same value. 
*/ 7910 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 7911 PyUnicode_GET_SIZE(self)); 7912} 7913 7914PyDoc_STRVAR(swapcase__doc__, 7915"S.swapcase() -> unicode\n\ 7916\n\ 7917Return a copy of S with uppercase characters converted to lowercase\n\ 7918and vice versa."); 7919 7920static PyObject* 7921unicode_swapcase(PyUnicodeObject *self) 7922{ 7923 return fixup(self, fixswapcase); 7924} 7925 7926PyDoc_STRVAR(maketrans__doc__, 7927"str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 7928\n\ 7929Return a translation table usable for str.translate().\n\ 7930If there is only one argument, it must be a dictionary mapping Unicode\n\ 7931ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 7932Character keys will then be converted to ordinals.\n\ 7933If there are two arguments, they must be strings of equal length, and\n\ 7934in the resulting dictionary, each character in x will be mapped to the\n\ 7935character at the same position in y. If there is a third argument, it\n\ 7936must be a string, whose characters will be mapped to None in the result."); 7937 7938static PyObject* 7939unicode_maketrans(PyUnicodeObject *null, PyObject *args) 7940{ 7941 PyObject *x, *y = NULL, *z = NULL; 7942 PyObject *new = NULL, *key, *value; 7943 Py_ssize_t i = 0; 7944 int res; 7945 7946 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 7947 return NULL; 7948 new = PyDict_New(); 7949 if (!new) 7950 return NULL; 7951 if (y != NULL) { 7952 /* x must be a string too, of equal length */ 7953 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 7954 if (!PyUnicode_Check(x)) { 7955 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 7956 "be a string if there is a second argument"); 7957 goto err; 7958 } 7959 if (PyUnicode_GET_SIZE(x) != ylen) { 7960 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 7961 "arguments must have equal length"); 7962 goto err; 7963 } 7964 /* create entries for translating chars in x to those in y */ 7965 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 7966 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 7967 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 7968 if (!key || !value) 7969 goto err; 7970 res = PyDict_SetItem(new, key, value); 7971 Py_DECREF(key); 7972 Py_DECREF(value); 7973 if (res < 0) 7974 goto err; 7975 } 7976 /* create entries for deleting chars in z */ 7977 if (z != NULL) { 7978 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 7979 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 7980 if (!key) 7981 goto err; 7982 res = PyDict_SetItem(new, key, Py_None); 7983 Py_DECREF(key); 7984 if (res < 0) 7985 goto err; 7986 } 7987 } 7988 } else { 7989 /* x must be a dict */ 7990 if (!PyDict_Check(x)) { 7991 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 7992 "to maketrans it must be a dict"); 7993 goto err; 7994 } 7995 /* copy entries into the new dict, converting string keys to int keys */ 7996 while (PyDict_Next(x, &i, &key, &value)) { 7997 if (PyUnicode_Check(key)) { 7998 /* convert string keys to integer keys */ 7999 PyObject *newkey; 8000 if (PyUnicode_GET_SIZE(key) != 1) { 8001 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8002 "table must be of length 1"); 8003 goto err; 8004 } 8005 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8006 if (!newkey) 8007 goto err; 8008 res = PyDict_SetItem(new, newkey, value); 8009 Py_DECREF(newkey); 8010 if (res < 0) 8011 goto err; 8012 } else if (PyLong_Check(key)) { 8013 /* just keep integer keys */ 8014 if (PyDict_SetItem(new, key, value) < 0) 8015 goto err; 8016 } else { 8017 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8018 "be strings or integers"); 8019 goto err; 8020 } 8021 } 8022 } 8023 return new; 8024 err: 8025 Py_DECREF(new); 8026 return NULL; 8027} 8028 8029PyDoc_STRVAR(translate__doc__, 8030"S.translate(table) -> unicode\n\ 8031\n\ 8032Return a copy of the string S, where all characters have been mapped\n\ 8033through the given translation table, 
which must be a mapping of\n\ 8034Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 8035Unmapped characters are left untouched. Characters mapped to None\n\ 8036are deleted."); 8037 8038static PyObject* 8039unicode_translate(PyUnicodeObject *self, PyObject *table) 8040{ 8041 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8042} 8043 8044PyDoc_STRVAR(upper__doc__, 8045"S.upper() -> unicode\n\ 8046\n\ 8047Return a copy of S converted to uppercase."); 8048 8049static PyObject* 8050unicode_upper(PyUnicodeObject *self) 8051{ 8052 return fixup(self, fixupper); 8053} 8054 8055PyDoc_STRVAR(zfill__doc__, 8056"S.zfill(width) -> unicode\n\ 8057\n\ 8058Pad a numeric string x with zeros on the left, to fill a field\n\ 8059of the specified width. The string x is never truncated."); 8060 8061static PyObject * 8062unicode_zfill(PyUnicodeObject *self, PyObject *args) 8063{ 8064 Py_ssize_t fill; 8065 PyUnicodeObject *u; 8066 8067 Py_ssize_t width; 8068 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8069 return NULL; 8070 8071 if (self->length >= width) { 8072 if (PyUnicode_CheckExact(self)) { 8073 Py_INCREF(self); 8074 return (PyObject*) self; 8075 } 8076 else 8077 return PyUnicode_FromUnicode( 8078 PyUnicode_AS_UNICODE(self), 8079 PyUnicode_GET_SIZE(self) 8080 ); 8081 } 8082 8083 fill = width - self->length; 8084 8085 u = pad(self, fill, 0, '0'); 8086 8087 if (u == NULL) 8088 return NULL; 8089 8090 if (u->str[fill] == '+' || u->str[fill] == '-') { 8091 /* move sign to beginning of string */ 8092 u->str[0] = u->str[fill]; 8093 u->str[fill] = '0'; 8094 } 8095 8096 return (PyObject*) u; 8097} 8098 8099#if 0 8100static PyObject* 8101unicode_freelistsize(PyUnicodeObject *self) 8102{ 8103 return PyLong_FromLong(numfree); 8104} 8105#endif 8106 8107PyDoc_STRVAR(startswith__doc__, 8108"S.startswith(prefix[, start[, end]]) -> bool\n\ 8109\n\ 8110Return True if S starts with the specified prefix, False otherwise.\n\ 8111With optional start, test S 
beginning at that position.\n\ 8112With optional end, stop comparing S at that position.\n\ 8113prefix can also be a tuple of strings to try."); 8114 8115static PyObject * 8116unicode_startswith(PyUnicodeObject *self, 8117 PyObject *args) 8118{ 8119 PyObject *subobj; 8120 PyUnicodeObject *substring; 8121 Py_ssize_t start = 0; 8122 Py_ssize_t end = PY_SSIZE_T_MAX; 8123 int result; 8124 8125 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8126 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8127 return NULL; 8128 if (PyTuple_Check(subobj)) { 8129 Py_ssize_t i; 8130 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8131 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8132 PyTuple_GET_ITEM(subobj, i)); 8133 if (substring == NULL) 8134 return NULL; 8135 result = tailmatch(self, substring, start, end, -1); 8136 Py_DECREF(substring); 8137 if (result) { 8138 Py_RETURN_TRUE; 8139 } 8140 } 8141 /* nothing matched */ 8142 Py_RETURN_FALSE; 8143 } 8144 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8145 if (substring == NULL) 8146 return NULL; 8147 result = tailmatch(self, substring, start, end, -1); 8148 Py_DECREF(substring); 8149 return PyBool_FromLong(result); 8150} 8151 8152 8153PyDoc_STRVAR(endswith__doc__, 8154"S.endswith(suffix[, start[, end]]) -> bool\n\ 8155\n\ 8156Return True if S ends with the specified suffix, False otherwise.\n\ 8157With optional start, test S beginning at that position.\n\ 8158With optional end, stop comparing S at that position.\n\ 8159suffix can also be a tuple of strings to try."); 8160 8161static PyObject * 8162unicode_endswith(PyUnicodeObject *self, 8163 PyObject *args) 8164{ 8165 PyObject *subobj; 8166 PyUnicodeObject *substring; 8167 Py_ssize_t start = 0; 8168 Py_ssize_t end = PY_SSIZE_T_MAX; 8169 int result; 8170 8171 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8172 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8173 return NULL; 8174 if (PyTuple_Check(subobj)) { 8175 Py_ssize_t i; 
8176 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8177 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8178 PyTuple_GET_ITEM(subobj, i)); 8179 if (substring == NULL) 8180 return NULL; 8181 result = tailmatch(self, substring, start, end, +1); 8182 Py_DECREF(substring); 8183 if (result) { 8184 Py_RETURN_TRUE; 8185 } 8186 } 8187 Py_RETURN_FALSE; 8188 } 8189 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8190 if (substring == NULL) 8191 return NULL; 8192 8193 result = tailmatch(self, substring, start, end, +1); 8194 Py_DECREF(substring); 8195 return PyBool_FromLong(result); 8196} 8197 8198#include "stringlib/string_format.h" 8199 8200PyDoc_STRVAR(format__doc__, 8201"S.format(*args, **kwargs) -> unicode\n\ 8202\n\ 8203"); 8204 8205PyDoc_STRVAR(p_format__doc__, 8206"S.__format__(format_spec) -> unicode\n\ 8207\n\ 8208"); 8209 8210static PyObject * 8211unicode_getnewargs(PyUnicodeObject *v) 8212{ 8213 return Py_BuildValue("(u#)", v->str, v->length); 8214} 8215 8216 8217static PyMethodDef unicode_methods[] = { 8218 8219 /* Order is according to common usage: often used methods should 8220 appear first, since lookup is done sequentially. 
*/ 8221 8222 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 8223 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8224 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8225 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8226 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8227 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8228 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8229 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8230 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8231 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8232 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8233 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8234 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8235 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8236 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8237 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8238 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8239 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8240 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8241 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8242 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8243 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8244 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8245 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8246 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8247 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8248 
{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8249 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8250 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8251 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8252 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8253 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8254 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8255 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8256 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8257 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8258 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8259 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8260 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8261 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8262 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__}, 8263 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8264 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8265 {"maketrans", (PyCFunction) unicode_maketrans, 8266 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8267#if 0 8268 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8269#endif 8270 8271#if 0 8272 /* This one is just used for debugging the implementation. 
*/ 8273 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8274#endif 8275 8276 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8277 {NULL, NULL} 8278}; 8279 8280static PyObject * 8281unicode_mod(PyObject *v, PyObject *w) 8282{ 8283 if (!PyUnicode_Check(v)) { 8284 Py_INCREF(Py_NotImplemented); 8285 return Py_NotImplemented; 8286 } 8287 return PyUnicode_Format(v, w); 8288} 8289 8290static PyNumberMethods unicode_as_number = { 8291 0, /*nb_add*/ 8292 0, /*nb_subtract*/ 8293 0, /*nb_multiply*/ 8294 unicode_mod, /*nb_remainder*/ 8295}; 8296 8297static PySequenceMethods unicode_as_sequence = { 8298 (lenfunc) unicode_length, /* sq_length */ 8299 PyUnicode_Concat, /* sq_concat */ 8300 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8301 (ssizeargfunc) unicode_getitem, /* sq_item */ 8302 0, /* sq_slice */ 8303 0, /* sq_ass_item */ 8304 0, /* sq_ass_slice */ 8305 PyUnicode_Contains, /* sq_contains */ 8306}; 8307 8308static PyObject* 8309unicode_subscript(PyUnicodeObject* self, PyObject* item) 8310{ 8311 if (PyIndex_Check(item)) { 8312 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8313 if (i == -1 && PyErr_Occurred()) 8314 return NULL; 8315 if (i < 0) 8316 i += PyUnicode_GET_SIZE(self); 8317 return unicode_getitem(self, i); 8318 } else if (PySlice_Check(item)) { 8319 Py_ssize_t start, stop, step, slicelength, cur, i; 8320 Py_UNICODE* source_buf; 8321 Py_UNICODE* result_buf; 8322 PyObject* result; 8323 8324 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8325 &start, &stop, &step, &slicelength) < 0) { 8326 return NULL; 8327 } 8328 8329 if (slicelength <= 0) { 8330 return PyUnicode_FromUnicode(NULL, 0); 8331 } else if (start == 0 && step == 1 && slicelength == self->length && 8332 PyUnicode_CheckExact(self)) { 8333 Py_INCREF(self); 8334 return (PyObject *)self; 8335 } else if (step == 1) { 8336 return PyUnicode_FromUnicode(self->str + start, slicelength); 8337 } else { 8338 source_buf = 
PyUnicode_AS_UNICODE((PyObject*)self); 8339 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8340 sizeof(Py_UNICODE)); 8341 8342 if (result_buf == NULL) 8343 return PyErr_NoMemory(); 8344 8345 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8346 result_buf[i] = source_buf[cur]; 8347 } 8348 8349 result = PyUnicode_FromUnicode(result_buf, slicelength); 8350 PyObject_FREE(result_buf); 8351 return result; 8352 } 8353 } else { 8354 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8355 return NULL; 8356 } 8357} 8358 8359static PyMappingMethods unicode_as_mapping = { 8360 (lenfunc)unicode_length, /* mp_length */ 8361 (binaryfunc)unicode_subscript, /* mp_subscript */ 8362 (objobjargproc)0, /* mp_ass_subscript */ 8363}; 8364 8365 8366/* Helpers for PyUnicode_Format() */ 8367 8368static PyObject * 8369getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8370{ 8371 Py_ssize_t argidx = *p_argidx; 8372 if (argidx < arglen) { 8373 (*p_argidx)++; 8374 if (arglen < 0) 8375 return args; 8376 else 8377 return PyTuple_GetItem(args, argidx); 8378 } 8379 PyErr_SetString(PyExc_TypeError, 8380 "not enough arguments for format string"); 8381 return NULL; 8382} 8383 8384static Py_ssize_t 8385strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8386{ 8387 register Py_ssize_t i; 8388 Py_ssize_t len = strlen(charbuffer); 8389 for (i = len - 1; i >= 0; i--) 8390 buffer[i] = (Py_UNICODE) charbuffer[i]; 8391 8392 return len; 8393} 8394 8395static int 8396doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8397{ 8398 Py_ssize_t result; 8399 8400 PyOS_ascii_formatd((char *)buffer, len, format, x); 8401 result = strtounicode(buffer, (char *)buffer); 8402 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8403} 8404 8405#if 0 8406static int 8407longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8408{ 8409 Py_ssize_t result; 8410 8411 PyOS_snprintf((char *)buffer, len, format, x); 8412 result = 
strtounicode(buffer, (char *)buffer); 8413 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8414} 8415#endif 8416 8417/* XXX To save some code duplication, formatfloat/long/int could have been 8418 shared with stringobject.c, converting from 8-bit to Unicode after the 8419 formatting is done. */ 8420 8421static int 8422formatfloat(Py_UNICODE *buf, 8423 size_t buflen, 8424 int flags, 8425 int prec, 8426 int type, 8427 PyObject *v) 8428{ 8429 /* fmt = '%#.' + `prec` + `type` 8430 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8431 char fmt[20]; 8432 double x; 8433 8434 x = PyFloat_AsDouble(v); 8435 if (x == -1.0 && PyErr_Occurred()) 8436 return -1; 8437 if (prec < 0) 8438 prec = 6; 8439 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8440 type = 'g'; 8441 /* Worst case length calc to ensure no buffer overrun: 8442 8443 'g' formats: 8444 fmt = %#.<prec>g 8445 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8446 for any double rep.) 8447 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8448 8449 'f' formats: 8450 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8451 len = 1 + 50 + 1 + prec = 52 + prec 8452 8453 If prec=0 the effective precision is 1 (the leading digit is 8454 always given), therefore increase the length by one. 8455 8456 */ 8457 if (((type == 'g' || type == 'G') && 8458 buflen <= (size_t)10 + (size_t)prec) || 8459 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8460 PyErr_SetString(PyExc_OverflowError, 8461 "formatted float is too long (precision too large?)"); 8462 return -1; 8463 } 8464 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8465 (flags&F_ALT) ? "#" : "", 8466 prec, type); 8467 return doubletounicode(buf, buflen, fmt, x); 8468} 8469 8470static PyObject* 8471formatlong(PyObject *val, int flags, int prec, int type) 8472{ 8473 char *buf; 8474 int len; 8475 PyObject *str; /* temporary string object. 
*/ 8476 PyObject *result; 8477 8478 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 8479 if (!str) 8480 return NULL; 8481 result = PyUnicode_FromStringAndSize(buf, len); 8482 Py_DECREF(str); 8483 return result; 8484} 8485 8486#if 0 8487static int 8488formatint(Py_UNICODE *buf, 8489 size_t buflen, 8490 int flags, 8491 int prec, 8492 int type, 8493 PyObject *v) 8494{ 8495 /* fmt = '%#.' + `prec` + 'l' + `type` 8496 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8497 * + 1 + 1 8498 * = 24 8499 */ 8500 char fmt[64]; /* plenty big enough! */ 8501 char *sign; 8502 long x; 8503 8504 x = PyLong_AsLong(v); 8505 if (x == -1 && PyErr_Occurred()) 8506 return -1; 8507 if (x < 0 && type == 'u') { 8508 type = 'd'; 8509 } 8510 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8511 sign = "-"; 8512 else 8513 sign = ""; 8514 if (prec < 0) 8515 prec = 1; 8516 8517 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8518 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8519 */ 8520 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8521 PyErr_SetString(PyExc_OverflowError, 8522 "formatted integer is too long (precision too large?)"); 8523 return -1; 8524 } 8525 8526 if ((flags & F_ALT) && 8527 (type == 'x' || type == 'X' || type == 'o')) { 8528 /* When converting under %#o, %#x or %#X, there are a number 8529 * of issues that cause pain: 8530 * - for %#o, we want a different base marker than C 8531 * - when 0 is being converted, the C standard leaves off 8532 * the '0x' or '0X', which is inconsistent with other 8533 * %#x/%#X conversions and inconsistent with Python's 8534 * hex() function 8535 * - there are platforms that violate the standard and 8536 * convert 0 with the '0x' or '0X' 8537 * (Metrowerks, Compaq Tru64) 8538 * - there are platforms that give '0x' when converting 8539 * under %#X, but convert 0 in accordance with the 8540 * standard (OS/2 EMX) 8541 * 8542 * We can achieve the desired 
consistency by inserting our 8543 * own '0x' or '0X' prefix, and substituting %x/%X in place 8544 * of %#x/%#X. 8545 * 8546 * Note that this is the same approach as used in 8547 * formatint() in stringobject.c 8548 */ 8549 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8550 sign, type, prec, type); 8551 } 8552 else { 8553 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8554 sign, (flags&F_ALT) ? "#" : "", 8555 prec, type); 8556 } 8557 if (sign[0]) 8558 return longtounicode(buf, buflen, fmt, -x); 8559 else 8560 return longtounicode(buf, buflen, fmt, x); 8561} 8562#endif 8563 8564static int 8565formatchar(Py_UNICODE *buf, 8566 size_t buflen, 8567 PyObject *v) 8568{ 8569 /* presume that the buffer is at least 2 characters long */ 8570 if (PyUnicode_Check(v)) { 8571 if (PyUnicode_GET_SIZE(v) != 1) 8572 goto onError; 8573 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8574 } 8575 else { 8576 /* Integer input truncated to a character */ 8577 long x; 8578 x = PyLong_AsLong(v); 8579 if (x == -1 && PyErr_Occurred()) 8580 goto onError; 8581#ifdef Py_UNICODE_WIDE 8582 if (x < 0 || x > 0x10ffff) { 8583 PyErr_SetString(PyExc_OverflowError, 8584 "%c arg not in range(0x110000) " 8585 "(wide Python build)"); 8586 return -1; 8587 } 8588#else 8589 if (x < 0 || x > 0xffff) { 8590 PyErr_SetString(PyExc_OverflowError, 8591 "%c arg not in range(0x10000) " 8592 "(narrow Python build)"); 8593 return -1; 8594 } 8595#endif 8596 buf[0] = (Py_UNICODE) x; 8597 } 8598 buf[1] = '\0'; 8599 return 1; 8600 8601 onError: 8602 PyErr_SetString(PyExc_TypeError, 8603 "%c requires int or char"); 8604 return -1; 8605} 8606 8607/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8608 8609 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8610 chars are formatted. XXX This is a magic number. Each formatting 8611 routine does bounds checking to ensure no overflow, but a better 8612 solution may be to malloc a buffer of appropriate size for each 8613 format. 
For now, the current solution is sufficient. 8614*/ 8615#define FORMATBUFLEN (size_t)120 8616 8617PyObject *PyUnicode_Format(PyObject *format, 8618 PyObject *args) 8619{ 8620 Py_UNICODE *fmt, *res; 8621 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8622 int args_owned = 0; 8623 PyUnicodeObject *result = NULL; 8624 PyObject *dict = NULL; 8625 PyObject *uformat; 8626 8627 if (format == NULL || args == NULL) { 8628 PyErr_BadInternalCall(); 8629 return NULL; 8630 } 8631 uformat = PyUnicode_FromObject(format); 8632 if (uformat == NULL) 8633 return NULL; 8634 fmt = PyUnicode_AS_UNICODE(uformat); 8635 fmtcnt = PyUnicode_GET_SIZE(uformat); 8636 8637 reslen = rescnt = fmtcnt + 100; 8638 result = _PyUnicode_New(reslen); 8639 if (result == NULL) 8640 goto onError; 8641 res = PyUnicode_AS_UNICODE(result); 8642 8643 if (PyTuple_Check(args)) { 8644 arglen = PyTuple_Size(args); 8645 argidx = 0; 8646 } 8647 else { 8648 arglen = -1; 8649 argidx = -2; 8650 } 8651 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 8652 !PyUnicode_Check(args)) 8653 dict = args; 8654 8655 while (--fmtcnt >= 0) { 8656 if (*fmt != '%') { 8657 if (--rescnt < 0) { 8658 rescnt = fmtcnt + 100; 8659 reslen += rescnt; 8660 if (_PyUnicode_Resize(&result, reslen) < 0) 8661 goto onError; 8662 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8663 --rescnt; 8664 } 8665 *res++ = *fmt++; 8666 } 8667 else { 8668 /* Got a format specifier */ 8669 int flags = 0; 8670 Py_ssize_t width = -1; 8671 int prec = -1; 8672 Py_UNICODE c = '\0'; 8673 Py_UNICODE fill; 8674 int isnumok; 8675 PyObject *v = NULL; 8676 PyObject *temp = NULL; 8677 Py_UNICODE *pbuf; 8678 Py_UNICODE sign; 8679 Py_ssize_t len; 8680 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8681 8682 fmt++; 8683 if (*fmt == '(') { 8684 Py_UNICODE *keystart; 8685 Py_ssize_t keylen; 8686 PyObject *key; 8687 int pcount = 1; 8688 8689 if (dict == NULL) { 8690 PyErr_SetString(PyExc_TypeError, 8691 "format requires a mapping"); 8692 
goto onError; 8693 } 8694 ++fmt; 8695 --fmtcnt; 8696 keystart = fmt; 8697 /* Skip over balanced parentheses */ 8698 while (pcount > 0 && --fmtcnt >= 0) { 8699 if (*fmt == ')') 8700 --pcount; 8701 else if (*fmt == '(') 8702 ++pcount; 8703 fmt++; 8704 } 8705 keylen = fmt - keystart - 1; 8706 if (fmtcnt < 0 || pcount > 0) { 8707 PyErr_SetString(PyExc_ValueError, 8708 "incomplete format key"); 8709 goto onError; 8710 } 8711#if 0 8712 /* keys are converted to strings using UTF-8 and 8713 then looked up since Python uses strings to hold 8714 variables names etc. in its namespaces and we 8715 wouldn't want to break common idioms. */ 8716 key = PyUnicode_EncodeUTF8(keystart, 8717 keylen, 8718 NULL); 8719#else 8720 key = PyUnicode_FromUnicode(keystart, keylen); 8721#endif 8722 if (key == NULL) 8723 goto onError; 8724 if (args_owned) { 8725 Py_DECREF(args); 8726 args_owned = 0; 8727 } 8728 args = PyObject_GetItem(dict, key); 8729 Py_DECREF(key); 8730 if (args == NULL) { 8731 goto onError; 8732 } 8733 args_owned = 1; 8734 arglen = -1; 8735 argidx = -2; 8736 } 8737 while (--fmtcnt >= 0) { 8738 switch (c = *fmt++) { 8739 case '-': flags |= F_LJUST; continue; 8740 case '+': flags |= F_SIGN; continue; 8741 case ' ': flags |= F_BLANK; continue; 8742 case '#': flags |= F_ALT; continue; 8743 case '0': flags |= F_ZERO; continue; 8744 } 8745 break; 8746 } 8747 if (c == '*') { 8748 v = getnextarg(args, arglen, &argidx); 8749 if (v == NULL) 8750 goto onError; 8751 if (!PyLong_Check(v)) { 8752 PyErr_SetString(PyExc_TypeError, 8753 "* wants int"); 8754 goto onError; 8755 } 8756 width = PyLong_AsLong(v); 8757 if (width == -1 && PyErr_Occurred()) 8758 goto onError; 8759 if (width < 0) { 8760 flags |= F_LJUST; 8761 width = -width; 8762 } 8763 if (--fmtcnt >= 0) 8764 c = *fmt++; 8765 } 8766 else if (c >= '0' && c <= '9') { 8767 width = c - '0'; 8768 while (--fmtcnt >= 0) { 8769 c = *fmt++; 8770 if (c < '0' || c > '9') 8771 break; 8772 if ((width*10) / 10 != width) { 8773 
PyErr_SetString(PyExc_ValueError, 8774 "width too big"); 8775 goto onError; 8776 } 8777 width = width*10 + (c - '0'); 8778 } 8779 } 8780 if (c == '.') { 8781 prec = 0; 8782 if (--fmtcnt >= 0) 8783 c = *fmt++; 8784 if (c == '*') { 8785 v = getnextarg(args, arglen, &argidx); 8786 if (v == NULL) 8787 goto onError; 8788 if (!PyLong_Check(v)) { 8789 PyErr_SetString(PyExc_TypeError, 8790 "* wants int"); 8791 goto onError; 8792 } 8793 prec = PyLong_AsLong(v); 8794 if (prec == -1 && PyErr_Occurred()) 8795 goto onError; 8796 if (prec < 0) 8797 prec = 0; 8798 if (--fmtcnt >= 0) 8799 c = *fmt++; 8800 } 8801 else if (c >= '0' && c <= '9') { 8802 prec = c - '0'; 8803 while (--fmtcnt >= 0) { 8804 c = Py_CHARMASK(*fmt++); 8805 if (c < '0' || c > '9') 8806 break; 8807 if ((prec*10) / 10 != prec) { 8808 PyErr_SetString(PyExc_ValueError, 8809 "prec too big"); 8810 goto onError; 8811 } 8812 prec = prec*10 + (c - '0'); 8813 } 8814 } 8815 } /* prec */ 8816 if (fmtcnt >= 0) { 8817 if (c == 'h' || c == 'l' || c == 'L') { 8818 if (--fmtcnt >= 0) 8819 c = *fmt++; 8820 } 8821 } 8822 if (fmtcnt < 0) { 8823 PyErr_SetString(PyExc_ValueError, 8824 "incomplete format"); 8825 goto onError; 8826 } 8827 if (c != '%') { 8828 v = getnextarg(args, arglen, &argidx); 8829 if (v == NULL) 8830 goto onError; 8831 } 8832 sign = 0; 8833 fill = ' '; 8834 switch (c) { 8835 8836 case '%': 8837 pbuf = formatbuf; 8838 /* presume that buffer length is at least 1 */ 8839 pbuf[0] = '%'; 8840 len = 1; 8841 break; 8842 8843 case 's': 8844 case 'r': 8845 if (PyUnicode_Check(v) && c == 's') { 8846 temp = v; 8847 Py_INCREF(temp); 8848 } 8849 else { 8850 if (c == 's') 8851 temp = PyObject_Str(v); 8852 else 8853 temp = PyObject_Repr(v); 8854 if (temp == NULL) 8855 goto onError; 8856 if (PyUnicode_Check(temp)) 8857 /* nothing to do */; 8858 else { 8859 Py_DECREF(temp); 8860 PyErr_SetString(PyExc_TypeError, 8861 "%s argument has non-string str()"); 8862 goto onError; 8863 } 8864 } 8865 pbuf = PyUnicode_AS_UNICODE(temp); 8866 
len = PyUnicode_GET_SIZE(temp); 8867 if (prec >= 0 && len > prec) 8868 len = prec; 8869 break; 8870 8871 case 'i': 8872 case 'd': 8873 case 'u': 8874 case 'o': 8875 case 'x': 8876 case 'X': 8877 if (c == 'i') 8878 c = 'd'; 8879 isnumok = 0; 8880 if (PyNumber_Check(v)) { 8881 PyObject *iobj=NULL; 8882 8883 if (PyLong_Check(v)) { 8884 iobj = v; 8885 Py_INCREF(iobj); 8886 } 8887 else { 8888 iobj = PyNumber_Long(v); 8889 } 8890 if (iobj!=NULL) { 8891 if (PyLong_Check(iobj)) { 8892 isnumok = 1; 8893 temp = formatlong(iobj, flags, prec, c); 8894 Py_DECREF(iobj); 8895 if (!temp) 8896 goto onError; 8897 pbuf = PyUnicode_AS_UNICODE(temp); 8898 len = PyUnicode_GET_SIZE(temp); 8899 sign = 1; 8900 } 8901 else { 8902 Py_DECREF(iobj); 8903 } 8904 } 8905 } 8906 if (!isnumok) { 8907 PyErr_Format(PyExc_TypeError, 8908 "%%%c format: a number is required, " 8909 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 8910 goto onError; 8911 } 8912 if (flags & F_ZERO) 8913 fill = '0'; 8914 break; 8915 8916 case 'e': 8917 case 'E': 8918 case 'f': 8919 case 'F': 8920 case 'g': 8921 case 'G': 8922 if (c == 'F') 8923 c = 'f'; 8924 pbuf = formatbuf; 8925 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8926 flags, prec, c, v); 8927 if (len < 0) 8928 goto onError; 8929 sign = 1; 8930 if (flags & F_ZERO) 8931 fill = '0'; 8932 break; 8933 8934 case 'c': 8935 pbuf = formatbuf; 8936 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 8937 if (len < 0) 8938 goto onError; 8939 break; 8940 8941 default: 8942 PyErr_Format(PyExc_ValueError, 8943 "unsupported format character '%c' (0x%x) " 8944 "at index %zd", 8945 (31<=c && c<=126) ? 
(char)c : '?', 8946 (int)c, 8947 (Py_ssize_t)(fmt - 1 - 8948 PyUnicode_AS_UNICODE(uformat))); 8949 goto onError; 8950 } 8951 if (sign) { 8952 if (*pbuf == '-' || *pbuf == '+') { 8953 sign = *pbuf++; 8954 len--; 8955 } 8956 else if (flags & F_SIGN) 8957 sign = '+'; 8958 else if (flags & F_BLANK) 8959 sign = ' '; 8960 else 8961 sign = 0; 8962 } 8963 if (width < len) 8964 width = len; 8965 if (rescnt - (sign != 0) < width) { 8966 reslen -= rescnt; 8967 rescnt = width + fmtcnt + 100; 8968 reslen += rescnt; 8969 if (reslen < 0) { 8970 Py_XDECREF(temp); 8971 PyErr_NoMemory(); 8972 goto onError; 8973 } 8974 if (_PyUnicode_Resize(&result, reslen) < 0) { 8975 Py_XDECREF(temp); 8976 goto onError; 8977 } 8978 res = PyUnicode_AS_UNICODE(result) 8979 + reslen - rescnt; 8980 } 8981 if (sign) { 8982 if (fill != ' ') 8983 *res++ = sign; 8984 rescnt--; 8985 if (width > len) 8986 width--; 8987 } 8988 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8989 assert(pbuf[0] == '0'); 8990 assert(pbuf[1] == c); 8991 if (fill != ' ') { 8992 *res++ = *pbuf++; 8993 *res++ = *pbuf++; 8994 } 8995 rescnt -= 2; 8996 width -= 2; 8997 if (width < 0) 8998 width = 0; 8999 len -= 2; 9000 } 9001 if (width > len && !(flags & F_LJUST)) { 9002 do { 9003 --rescnt; 9004 *res++ = fill; 9005 } while (--width > len); 9006 } 9007 if (fill == ' ') { 9008 if (sign) 9009 *res++ = sign; 9010 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9011 assert(pbuf[0] == '0'); 9012 assert(pbuf[1] == c); 9013 *res++ = *pbuf++; 9014 *res++ = *pbuf++; 9015 } 9016 } 9017 Py_UNICODE_COPY(res, pbuf, len); 9018 res += len; 9019 rescnt -= len; 9020 while (--width >= len) { 9021 --rescnt; 9022 *res++ = ' '; 9023 } 9024 if (dict && (argidx < arglen) && c != '%') { 9025 PyErr_SetString(PyExc_TypeError, 9026 "not all arguments converted during string formatting"); 9027 Py_XDECREF(temp); 9028 goto onError; 9029 } 9030 Py_XDECREF(temp); 9031 } /* '%' */ 9032 } /* until end */ 9033 if (argidx < arglen && !dict) { 
9034 PyErr_SetString(PyExc_TypeError, 9035 "not all arguments converted during string formatting"); 9036 goto onError; 9037 } 9038 9039 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9040 goto onError; 9041 if (args_owned) { 9042 Py_DECREF(args); 9043 } 9044 Py_DECREF(uformat); 9045 return (PyObject *)result; 9046 9047 onError: 9048 Py_XDECREF(result); 9049 Py_DECREF(uformat); 9050 if (args_owned) { 9051 Py_DECREF(args); 9052 } 9053 return NULL; 9054} 9055 9056static PyObject * 9057unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9058 9059static PyObject * 9060unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9061{ 9062 PyObject *x = NULL; 9063 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9064 char *encoding = NULL; 9065 char *errors = NULL; 9066 9067 if (type != &PyUnicode_Type) 9068 return unicode_subtype_new(type, args, kwds); 9069 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9070 kwlist, &x, &encoding, &errors)) 9071 return NULL; 9072 if (x == NULL) 9073 return (PyObject *)_PyUnicode_New(0); 9074 if (encoding == NULL && errors == NULL) 9075 return PyObject_Str(x); 9076 else 9077 return PyUnicode_FromEncodedObject(x, encoding, errors); 9078} 9079 9080static PyObject * 9081unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9082{ 9083 PyUnicodeObject *tmp, *pnew; 9084 Py_ssize_t n; 9085 9086 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9087 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9088 if (tmp == NULL) 9089 return NULL; 9090 assert(PyUnicode_Check(tmp)); 9091 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9092 if (pnew == NULL) { 9093 Py_DECREF(tmp); 9094 return NULL; 9095 } 9096 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9097 if (pnew->str == NULL) { 9098 _Py_ForgetReference((PyObject *)pnew); 9099 PyObject_Del(pnew); 9100 Py_DECREF(tmp); 9101 return PyErr_NoMemory(); 9102 } 9103 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9104 pnew->length = n; 9105 pnew->hash = tmp->hash; 9106 Py_DECREF(tmp); 9107 return (PyObject *)pnew; 9108} 9109 9110PyDoc_STRVAR(unicode_doc, 9111"str(string [, encoding[, errors]]) -> object\n\ 9112\n\ 9113Create a new string object from the given encoded string.\n\ 9114encoding defaults to the current default string encoding.\n\ 9115errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9116 9117static PyObject *unicode_iter(PyObject *seq); 9118 9119PyTypeObject PyUnicode_Type = { 9120 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9121 "str", /* tp_name */ 9122 sizeof(PyUnicodeObject), /* tp_size */ 9123 0, /* tp_itemsize */ 9124 /* Slots */ 9125 (destructor)unicode_dealloc, /* tp_dealloc */ 9126 0, /* tp_print */ 9127 0, /* tp_getattr */ 9128 0, /* tp_setattr */ 9129 0, /* tp_compare */ 9130 unicode_repr, /* tp_repr */ 9131 &unicode_as_number, /* tp_as_number */ 9132 &unicode_as_sequence, /* tp_as_sequence */ 9133 &unicode_as_mapping, /* tp_as_mapping */ 9134 (hashfunc) unicode_hash, /* tp_hash*/ 9135 0, /* tp_call*/ 9136 (reprfunc) unicode_str, /* tp_str */ 9137 PyObject_GenericGetAttr, /* tp_getattro */ 9138 0, /* tp_setattro */ 9139 0, /* tp_as_buffer */ 9140 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9141 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9142 unicode_doc, /* tp_doc */ 9143 0, /* tp_traverse */ 9144 0, /* tp_clear */ 9145 PyUnicode_RichCompare, /* tp_richcompare */ 9146 0, /* tp_weaklistoffset */ 9147 unicode_iter, /* tp_iter */ 9148 0, /* tp_iternext */ 9149 unicode_methods, /* tp_methods */ 9150 0, /* tp_members */ 9151 0, /* tp_getset */ 9152 &PyBaseObject_Type, /* tp_base */ 9153 0, /* tp_dict */ 9154 0, /* tp_descr_get */ 9155 0, /* tp_descr_set */ 9156 0, /* tp_dictoffset */ 9157 0, /* tp_init */ 9158 0, /* tp_alloc */ 9159 unicode_new, /* tp_new */ 9160 PyObject_Del, /* tp_free */ 9161}; 9162 9163/* Initialize the Unicode implementation */ 9164 9165void _PyUnicode_Init(void) 9166{ 
9167 int i; 9168 9169 /* XXX - move this array to unicodectype.c ? */ 9170 Py_UNICODE linebreak[] = { 9171 0x000A, /* LINE FEED */ 9172 0x000D, /* CARRIAGE RETURN */ 9173 0x001C, /* FILE SEPARATOR */ 9174 0x001D, /* GROUP SEPARATOR */ 9175 0x001E, /* RECORD SEPARATOR */ 9176 0x0085, /* NEXT LINE */ 9177 0x2028, /* LINE SEPARATOR */ 9178 0x2029, /* PARAGRAPH SEPARATOR */ 9179 }; 9180 9181 /* Init the implementation */ 9182 free_list = NULL; 9183 numfree = 0; 9184 unicode_empty = _PyUnicode_New(0); 9185 if (!unicode_empty) 9186 return; 9187 9188 for (i = 0; i < 256; i++) 9189 unicode_latin1[i] = NULL; 9190 if (PyType_Ready(&PyUnicode_Type) < 0) 9191 Py_FatalError("Can't initialize 'unicode'"); 9192 9193 /* initialize the linebreak bloom filter */ 9194 bloom_linebreak = make_bloom_mask( 9195 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9196 ); 9197 9198 PyType_Ready(&EncodingMapType); 9199} 9200 9201/* Finalize the Unicode implementation */ 9202 9203int 9204PyUnicode_ClearFreeList(void) 9205{ 9206 int freelist_size = numfree; 9207 PyUnicodeObject *u; 9208 9209 for (u = free_list; u != NULL;) { 9210 PyUnicodeObject *v = u; 9211 u = *(PyUnicodeObject **)u; 9212 if (v->str) 9213 PyObject_DEL(v->str); 9214 Py_XDECREF(v->defenc); 9215 PyObject_Del(v); 9216 numfree--; 9217 } 9218 free_list = NULL; 9219 assert(numfree == 0); 9220 return freelist_size; 9221} 9222 9223void 9224_PyUnicode_Fini(void) 9225{ 9226 int i; 9227 9228 Py_XDECREF(unicode_empty); 9229 unicode_empty = NULL; 9230 9231 for (i = 0; i < 256; i++) { 9232 if (unicode_latin1[i]) { 9233 Py_DECREF(unicode_latin1[i]); 9234 unicode_latin1[i] = NULL; 9235 } 9236 } 9237 (void)PyUnicode_ClearFreeList(); 9238} 9239 9240void 9241PyUnicode_InternInPlace(PyObject **p) 9242{ 9243 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9244 PyObject *t; 9245 if (s == NULL || !PyUnicode_Check(s)) 9246 Py_FatalError( 9247 "PyUnicode_InternInPlace: unicode strings only please!"); 9248 /* If it's a subclass, we don't 
really know what putting 9249 it in the interned dict might do. */ 9250 if (!PyUnicode_CheckExact(s)) 9251 return; 9252 if (PyUnicode_CHECK_INTERNED(s)) 9253 return; 9254 if (interned == NULL) { 9255 interned = PyDict_New(); 9256 if (interned == NULL) { 9257 PyErr_Clear(); /* Don't leave an exception */ 9258 return; 9259 } 9260 } 9261 /* It might be that the GetItem call fails even 9262 though the key is present in the dictionary, 9263 namely when this happens during a stack overflow. */ 9264 Py_ALLOW_RECURSION 9265 t = PyDict_GetItem(interned, (PyObject *)s); 9266 Py_END_ALLOW_RECURSION 9267 9268 if (t) { 9269 Py_INCREF(t); 9270 Py_DECREF(*p); 9271 *p = t; 9272 return; 9273 } 9274 9275 PyThreadState_GET()->recursion_critical = 1; 9276 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9277 PyErr_Clear(); 9278 PyThreadState_GET()->recursion_critical = 0; 9279 return; 9280 } 9281 PyThreadState_GET()->recursion_critical = 0; 9282 /* The two references in interned are not counted by refcnt. 
9283 The deallocator will take care of this */ 9284 Py_REFCNT(s) -= 2; 9285 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9286} 9287 9288void 9289PyUnicode_InternImmortal(PyObject **p) 9290{ 9291 PyUnicode_InternInPlace(p); 9292 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9293 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9294 Py_INCREF(*p); 9295 } 9296} 9297 9298PyObject * 9299PyUnicode_InternFromString(const char *cp) 9300{ 9301 PyObject *s = PyUnicode_FromString(cp); 9302 if (s == NULL) 9303 return NULL; 9304 PyUnicode_InternInPlace(&s); 9305 return s; 9306} 9307 9308void _Py_ReleaseInternedUnicodeStrings(void) 9309{ 9310 PyObject *keys; 9311 PyUnicodeObject *s; 9312 Py_ssize_t i, n; 9313 Py_ssize_t immortal_size = 0, mortal_size = 0; 9314 9315 if (interned == NULL || !PyDict_Check(interned)) 9316 return; 9317 keys = PyDict_Keys(interned); 9318 if (keys == NULL || !PyList_Check(keys)) { 9319 PyErr_Clear(); 9320 return; 9321 } 9322 9323 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9324 detector, interned unicode strings are not forcibly deallocated; 9325 rather, we give them their stolen references back, and then clear 9326 and DECREF the interned dict. 
*/ 9327 9328 n = PyList_GET_SIZE(keys); 9329 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9330 n); 9331 for (i = 0; i < n; i++) { 9332 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9333 switch (s->state) { 9334 case SSTATE_NOT_INTERNED: 9335 /* XXX Shouldn't happen */ 9336 break; 9337 case SSTATE_INTERNED_IMMORTAL: 9338 Py_REFCNT(s) += 1; 9339 immortal_size += s->length; 9340 break; 9341 case SSTATE_INTERNED_MORTAL: 9342 Py_REFCNT(s) += 2; 9343 mortal_size += s->length; 9344 break; 9345 default: 9346 Py_FatalError("Inconsistent interned string state."); 9347 } 9348 s->state = SSTATE_NOT_INTERNED; 9349 } 9350 fprintf(stderr, "total size of all interned strings: " 9351 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9352 "mortal/immortal\n", mortal_size, immortal_size); 9353 Py_DECREF(keys); 9354 PyDict_Clear(interned); 9355 Py_DECREF(interned); 9356 interned = NULL; 9357} 9358 9359 9360/********************* Unicode Iterator **************************/ 9361 9362typedef struct { 9363 PyObject_HEAD 9364 Py_ssize_t it_index; 9365 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9366} unicodeiterobject; 9367 9368static void 9369unicodeiter_dealloc(unicodeiterobject *it) 9370{ 9371 _PyObject_GC_UNTRACK(it); 9372 Py_XDECREF(it->it_seq); 9373 PyObject_GC_Del(it); 9374} 9375 9376static int 9377unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9378{ 9379 Py_VISIT(it->it_seq); 9380 return 0; 9381} 9382 9383static PyObject * 9384unicodeiter_next(unicodeiterobject *it) 9385{ 9386 PyUnicodeObject *seq; 9387 PyObject *item; 9388 9389 assert(it != NULL); 9390 seq = it->it_seq; 9391 if (seq == NULL) 9392 return NULL; 9393 assert(PyUnicode_Check(seq)); 9394 9395 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9396 item = PyUnicode_FromUnicode( 9397 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9398 if (item != NULL) 9399 ++it->it_index; 9400 return item; 9401 } 9402 9403 Py_DECREF(seq); 9404 it->it_seq = NULL; 
9405 return NULL; 9406} 9407 9408static PyObject * 9409unicodeiter_len(unicodeiterobject *it) 9410{ 9411 Py_ssize_t len = 0; 9412 if (it->it_seq) 9413 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9414 return PyLong_FromSsize_t(len); 9415} 9416 9417PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9418 9419static PyMethodDef unicodeiter_methods[] = { 9420 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9421 length_hint_doc}, 9422 {NULL, NULL} /* sentinel */ 9423}; 9424 9425PyTypeObject PyUnicodeIter_Type = { 9426 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9427 "str_iterator", /* tp_name */ 9428 sizeof(unicodeiterobject), /* tp_basicsize */ 9429 0, /* tp_itemsize */ 9430 /* methods */ 9431 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9432 0, /* tp_print */ 9433 0, /* tp_getattr */ 9434 0, /* tp_setattr */ 9435 0, /* tp_compare */ 9436 0, /* tp_repr */ 9437 0, /* tp_as_number */ 9438 0, /* tp_as_sequence */ 9439 0, /* tp_as_mapping */ 9440 0, /* tp_hash */ 9441 0, /* tp_call */ 9442 0, /* tp_str */ 9443 PyObject_GenericGetAttr, /* tp_getattro */ 9444 0, /* tp_setattro */ 9445 0, /* tp_as_buffer */ 9446 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9447 0, /* tp_doc */ 9448 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9449 0, /* tp_clear */ 9450 0, /* tp_richcompare */ 9451 0, /* tp_weaklistoffset */ 9452 PyObject_SelfIter, /* tp_iter */ 9453 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9454 unicodeiter_methods, /* tp_methods */ 9455 0, 9456}; 9457 9458static PyObject * 9459unicode_iter(PyObject *seq) 9460{ 9461 unicodeiterobject *it; 9462 9463 if (!PyUnicode_Check(seq)) { 9464 PyErr_BadInternalCall(); 9465 return NULL; 9466 } 9467 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9468 if (it == NULL) 9469 return NULL; 9470 it->it_index = 0; 9471 Py_INCREF(seq); 9472 it->it_seq = (PyUnicodeObject *)seq; 9473 _PyObject_GC_TRACK(it); 9474 return (PyObject *)it; 9475} 9476 
9477size_t 9478Py_UNICODE_strlen(const Py_UNICODE *u) 9479{ 9480 int res = 0; 9481 while(*u++) 9482 res++; 9483 return res; 9484} 9485 9486Py_UNICODE* 9487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9488{ 9489 Py_UNICODE *u = s1; 9490 while ((*u++ = *s2++)); 9491 return s1; 9492} 9493 9494Py_UNICODE* 9495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9496{ 9497 Py_UNICODE *u = s1; 9498 while ((*u++ = *s2++)) 9499 if (n-- == 0) 9500 break; 9501 return s1; 9502} 9503 9504int 9505Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9506{ 9507 while (*s1 && *s2 && *s1 == *s2) 9508 s1++, s2++; 9509 if (*s1 && *s2) 9510 return (*s1 < *s2) ? -1 : +1; 9511 if (*s1) 9512 return 1; 9513 if (*s2) 9514 return -1; 9515 return 0; 9516} 9517 9518Py_UNICODE* 9519Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9520{ 9521 const Py_UNICODE *p; 9522 for (p = s; *p; p++) 9523 if (*p == c) 9524 return (Py_UNICODE*)p; 9525 return NULL; 9526} 9527 9528 9529#ifdef __cplusplus 9530} 9531#endif 9532 9533 9534/* 9535Local variables: 9536c-basic-offset: 4 9537indent-tabs-mode: nil 9538End: 9539*/ 9540