unicodeobject.c revision af14b79ccea4dd04376fc2720905d8d2f29c5b6a
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Default encoding to use and assume when NULL is passed as encoding 118 parameter; it is fixed to "utf-8". Always use the 119 PyUnicode_GetDefaultEncoding() API to access this global. 120 121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the 122 hard coded default! 123*/ 124static const char unicode_default_encoding[] = "utf-8"; 125 126/* Fast detection of the most frequent whitespace characters */ 127const unsigned char _Py_ascii_whitespace[] = { 128 0, 0, 0, 0, 0, 0, 0, 0, 129// case 0x0009: /* HORIZONTAL TABULATION */ 130// case 0x000A: /* LINE FEED */ 131// case 0x000B: /* VERTICAL TABULATION */ 132// case 0x000C: /* FORM FEED */ 133// case 0x000D: /* CARRIAGE RETURN */ 134 0, 1, 1, 1, 1, 1, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136// case 0x001C: /* FILE SEPARATOR */ 137// case 0x001D: /* GROUP SEPARATOR */ 138// case 0x001E: /* RECORD SEPARATOR */ 139// case 0x001F: /* UNIT SEPARATOR */ 140 0, 0, 0, 0, 1, 1, 1, 1, 141// case 0x0020: /* SPACE */ 142 1, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 147 0, 0, 0, 0, 0, 0, 0, 0, 148 0, 0, 0, 0, 0, 0, 0, 0, 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0 155}; 156 157/* Same for linebreaks */ 158static unsigned char ascii_linebreak[] = { 159 0, 0, 0, 0, 0, 0, 0, 0, 160// 0x000A, /* LINE FEED */ 161// 0x000D, /* CARRIAGE RETURN */ 162 0, 0, 1, 0, 0, 1, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164// 0x001C, /* FILE 
SEPARATOR */ 165// 0x001D, /* GROUP SEPARATOR */ 166// 0x001E, /* RECORD SEPARATOR */ 167 0, 0, 0, 0, 1, 1, 1, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0 181}; 182 183 184Py_UNICODE 185PyUnicode_GetMax(void) 186{ 187#ifdef Py_UNICODE_WIDE 188 return 0x10FFFF; 189#else 190 /* This is actually an illegal character, so it should 191 not be passed to unichr. */ 192 return 0xFFFF; 193#endif 194} 195 196/* --- Bloom Filters ----------------------------------------------------- */ 197 198/* stuff to implement simple "bloom filters" for Unicode characters. 199 to keep things simple, we use a single bitmask, using the least 5 200 bits from each unicode characters as the bit index. */ 201 202/* the linebreak mask is set up by Unicode_Init below */ 203 204#define BLOOM_MASK unsigned long 205 206static BLOOM_MASK bloom_linebreak; 207 208#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 209 210#define BLOOM_LINEBREAK(ch) \ 211 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 213 214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 215{ 216 /* calculate simple bloom-style bitmask for a given unicode string */ 217 218 long mask; 219 Py_ssize_t i; 220 221 mask = 0; 222 for (i = 0; i < len; i++) 223 mask |= (1 << (ptr[i] & 0x1F)); 224 225 return mask; 226} 227 228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 229{ 230 Py_ssize_t i; 231 232 for (i = 0; i < setlen; i++) 233 if (set[i] == chr) 234 return 1; 235 236 return 0; 237} 238 239#define BLOOM_MEMBER(mask, chr, set, setlen)\ 240 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 241 242/* --- Unicode Object ----------------------------------------------------- */ 243 244static 245int unicode_resize(register PyUnicodeObject *unicode, 246 Py_ssize_t length) 247{ 248 void *oldstr; 249 250 /* Shortcut if there's nothing much to do. */ 251 if (unicode->length == length) 252 goto reset; 253 254 /* Resizing shared object (unicode_empty or single character 255 objects) in-place is not allowed. Use PyUnicode_Resize() 256 instead ! */ 257 258 if (unicode == unicode_empty || 259 (unicode->length == 1 && 260 unicode->str[0] < 256U && 261 unicode_latin1[unicode->str[0]] == unicode)) { 262 PyErr_SetString(PyExc_SystemError, 263 "can't resize shared str objects"); 264 return -1; 265 } 266 267 /* We allocate one more byte to make sure the string is Ux0000 terminated. 268 The overallocation is also used by fastsearch, which assumes that it's 269 safe to look at str[length] (without making any assumptions about what 270 it contains). 
*/ 271 272 oldstr = unicode->str; 273 unicode->str = PyObject_REALLOC(unicode->str, 274 sizeof(Py_UNICODE) * (length + 1)); 275 if (!unicode->str) { 276 unicode->str = (Py_UNICODE *)oldstr; 277 PyErr_NoMemory(); 278 return -1; 279 } 280 unicode->str[length] = 0; 281 unicode->length = length; 282 283 reset: 284 /* Reset the object caches */ 285 if (unicode->defenc) { 286 Py_DECREF(unicode->defenc); 287 unicode->defenc = NULL; 288 } 289 unicode->hash = -1; 290 291 return 0; 292} 293 294/* We allocate one more byte to make sure the string is 295 Ux0000 terminated; some code (e.g. new_identifier) 296 relies on that. 297 298 XXX This allocator could further be enhanced by assuring that the 299 free list never reduces its size below 1. 300 301*/ 302 303static 304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 305{ 306 register PyUnicodeObject *unicode; 307 308 /* Optimization for empty strings */ 309 if (length == 0 && unicode_empty != NULL) { 310 Py_INCREF(unicode_empty); 311 return unicode_empty; 312 } 313 314 /* Unicode freelist & memory allocation */ 315 if (free_list) { 316 unicode = free_list; 317 free_list = *(PyUnicodeObject **)unicode; 318 numfree--; 319 if (unicode->str) { 320 /* Keep-Alive optimization: we only upsize the buffer, 321 never downsize it. 
*/ 322 if ((unicode->length < length) && 323 unicode_resize(unicode, length) < 0) { 324 PyObject_DEL(unicode->str); 325 unicode->str = NULL; 326 } 327 } 328 else { 329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 331 } 332 PyObject_INIT(unicode, &PyUnicode_Type); 333 } 334 else { 335 size_t new_size; 336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 337 if (unicode == NULL) 338 return NULL; 339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 341 } 342 343 if (!unicode->str) { 344 PyErr_NoMemory(); 345 goto onError; 346 } 347 /* Initialize the first element to guard against cases where 348 * the caller fails before initializing str -- unicode_resize() 349 * reads str[0], and the Keep-Alive optimization can keep memory 350 * allocated for str alive across a call to unicode_dealloc(unicode). 351 * We don't want unicode_resize to read uninitialized memory in 352 * that case. 
353 */ 354 unicode->str[0] = 0; 355 unicode->str[length] = 0; 356 unicode->length = length; 357 unicode->hash = -1; 358 unicode->state = 0; 359 unicode->defenc = NULL; 360 return unicode; 361 362 onError: 363 /* XXX UNREF/NEWREF interface should be more symmetrical */ 364 _Py_DEC_REFTOTAL; 365 _Py_ForgetReference((PyObject *)unicode); 366 PyObject_Del(unicode); 367 return NULL; 368} 369 370static 371void unicode_dealloc(register PyUnicodeObject *unicode) 372{ 373 switch (PyUnicode_CHECK_INTERNED(unicode)) { 374 case SSTATE_NOT_INTERNED: 375 break; 376 377 case SSTATE_INTERNED_MORTAL: 378 /* revive dead object temporarily for DelItem */ 379 Py_REFCNT(unicode) = 3; 380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 381 Py_FatalError( 382 "deletion of interned string failed"); 383 break; 384 385 case SSTATE_INTERNED_IMMORTAL: 386 Py_FatalError("Immortal interned string died."); 387 388 default: 389 Py_FatalError("Inconsistent interned string state."); 390 } 391 392 if (PyUnicode_CheckExact(unicode) && 393 numfree < PyUnicode_MAXFREELIST) { 394 /* Keep-Alive optimization */ 395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 396 PyObject_DEL(unicode->str); 397 unicode->str = NULL; 398 unicode->length = 0; 399 } 400 if (unicode->defenc) { 401 Py_DECREF(unicode->defenc); 402 unicode->defenc = NULL; 403 } 404 /* Add to free list */ 405 *(PyUnicodeObject **)unicode = free_list; 406 free_list = unicode; 407 numfree++; 408 } 409 else { 410 PyObject_DEL(unicode->str); 411 Py_XDECREF(unicode->defenc); 412 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 413 } 414} 415 416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 417{ 418 register PyUnicodeObject *v; 419 420 /* Argument checks */ 421 if (unicode == NULL) { 422 PyErr_BadInternalCall(); 423 return -1; 424 } 425 v = (PyUnicodeObject *)*unicode; 426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 427 PyErr_BadInternalCall(); 428 return -1; 429 } 430 431 /* Resizing unicode_empty 
and single character objects is not 432 possible since these are being shared. We simply return a fresh 433 copy with the same Unicode content. */ 434 if (v->length != length && 435 (v == unicode_empty || v->length == 1)) { 436 PyUnicodeObject *w = _PyUnicode_New(length); 437 if (w == NULL) 438 return -1; 439 Py_UNICODE_COPY(w->str, v->str, 440 length < v->length ? length : v->length); 441 Py_DECREF(*unicode); 442 *unicode = (PyObject *)w; 443 return 0; 444 } 445 446 /* Note that we don't have to modify *unicode for unshared Unicode 447 objects, since we can modify them in-place. */ 448 return unicode_resize(v, length); 449} 450 451/* Internal API for use in unicodeobject.c only ! */ 452#define _PyUnicode_Resize(unicodevar, length) \ 453 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 454 455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 456 Py_ssize_t size) 457{ 458 PyUnicodeObject *unicode; 459 460 /* If the Unicode data is known at construction time, we can apply 461 some optimizations which share commonly used objects. 
*/ 462 if (u != NULL) { 463 464 /* Optimization for empty strings */ 465 if (size == 0 && unicode_empty != NULL) { 466 Py_INCREF(unicode_empty); 467 return (PyObject *)unicode_empty; 468 } 469 470 /* Single character Unicode objects in the Latin-1 range are 471 shared when using this constructor */ 472 if (size == 1 && *u < 256) { 473 unicode = unicode_latin1[*u]; 474 if (!unicode) { 475 unicode = _PyUnicode_New(1); 476 if (!unicode) 477 return NULL; 478 unicode->str[0] = *u; 479 unicode_latin1[*u] = unicode; 480 } 481 Py_INCREF(unicode); 482 return (PyObject *)unicode; 483 } 484 } 485 486 unicode = _PyUnicode_New(size); 487 if (!unicode) 488 return NULL; 489 490 /* Copy the Unicode data into the new object */ 491 if (u != NULL) 492 Py_UNICODE_COPY(unicode->str, u, size); 493 494 return (PyObject *)unicode; 495} 496 497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 498{ 499 PyUnicodeObject *unicode; 500 501 if (size < 0) { 502 PyErr_SetString(PyExc_SystemError, 503 "Negative size passed to PyUnicode_FromStringAndSize"); 504 return NULL; 505 } 506 507 /* If the Unicode data is known at construction time, we can apply 508 some optimizations which share commonly used objects. 509 Also, this means the input must be UTF-8, so fall back to the 510 UTF-8 decoder at the end. */ 511 if (u != NULL) { 512 513 /* Optimization for empty strings */ 514 if (size == 0 && unicode_empty != NULL) { 515 Py_INCREF(unicode_empty); 516 return (PyObject *)unicode_empty; 517 } 518 519 /* Single characters are shared when using this constructor. 520 Restrict to ASCII, since the input must be UTF-8. 
*/ 521 if (size == 1 && Py_CHARMASK(*u) < 128) { 522 unicode = unicode_latin1[Py_CHARMASK(*u)]; 523 if (!unicode) { 524 unicode = _PyUnicode_New(1); 525 if (!unicode) 526 return NULL; 527 unicode->str[0] = Py_CHARMASK(*u); 528 unicode_latin1[Py_CHARMASK(*u)] = unicode; 529 } 530 Py_INCREF(unicode); 531 return (PyObject *)unicode; 532 } 533 534 return PyUnicode_DecodeUTF8(u, size, NULL); 535 } 536 537 unicode = _PyUnicode_New(size); 538 if (!unicode) 539 return NULL; 540 541 return (PyObject *)unicode; 542} 543 544PyObject *PyUnicode_FromString(const char *u) 545{ 546 size_t size = strlen(u); 547 if (size > PY_SSIZE_T_MAX) { 548 PyErr_SetString(PyExc_OverflowError, "input too long"); 549 return NULL; 550 } 551 552 return PyUnicode_FromStringAndSize(u, size); 553} 554 555#ifdef HAVE_WCHAR_H 556 557PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 558 Py_ssize_t size) 559{ 560 PyUnicodeObject *unicode; 561 562 if (w == NULL) { 563 if (size == 0) 564 return PyUnicode_FromStringAndSize(NULL, 0); 565 PyErr_BadInternalCall(); 566 return NULL; 567 } 568 569 if (size == -1) { 570 size = wcslen(w); 571 } 572 573 unicode = _PyUnicode_New(size); 574 if (!unicode) 575 return NULL; 576 577 /* Copy the wchar_t data into the new object */ 578#ifdef HAVE_USABLE_WCHAR_T 579 memcpy(unicode->str, w, size * sizeof(wchar_t)); 580#else 581 { 582 register Py_UNICODE *u; 583 register Py_ssize_t i; 584 u = PyUnicode_AS_UNICODE(unicode); 585 for (i = size; i > 0; i--) 586 *u++ = *w++; 587 } 588#endif 589 590 return (PyObject *)unicode; 591} 592 593static void 594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 595{ 596 *fmt++ = '%'; 597 if (width) { 598 if (zeropad) 599 *fmt++ = '0'; 600 fmt += sprintf(fmt, "%d", width); 601 } 602 if (precision) 603 fmt += sprintf(fmt, ".%d", precision); 604 if (longflag) 605 *fmt++ = 'l'; 606 else if (size_tflag) { 607 char *f = PY_FORMAT_SIZE_T; 608 while (*f) 609 *fmt++ = *f++; 610 } 611 *fmt++ = c; 
612 *fmt = '\0'; 613} 614 615#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 616 617PyObject * 618PyUnicode_FromFormatV(const char *format, va_list vargs) 619{ 620 va_list count; 621 Py_ssize_t callcount = 0; 622 PyObject **callresults = NULL; 623 PyObject **callresult = NULL; 624 Py_ssize_t n = 0; 625 int width = 0; 626 int precision = 0; 627 int zeropad; 628 const char* f; 629 Py_UNICODE *s; 630 PyObject *string; 631 /* used by sprintf */ 632 char buffer[21]; 633 /* use abuffer instead of buffer, if we need more space 634 * (which can happen if there's a format specifier with width). */ 635 char *abuffer = NULL; 636 char *realbuffer; 637 Py_ssize_t abuffersize = 0; 638 char fmt[60]; /* should be enough for %0width.precisionld */ 639 const char *copy; 640 641#ifdef VA_LIST_IS_ARRAY 642 Py_MEMCPY(count, vargs, sizeof(va_list)); 643#else 644#ifdef __va_copy 645 __va_copy(count, vargs); 646#else 647 count = vargs; 648#endif 649#endif 650 /* step 1: count the number of %S/%R/%A format specifications 651 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for 652 * these objects once during step 3 and put the result in 653 an array) */ 654 for (f = format; *f; f++) { 655 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')) 656 ++callcount; 657 } 658 /* step 2: allocate memory for the results of 659 * PyObject_Str()/PyObject_Repr() calls */ 660 if (callcount) { 661 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 662 if (!callresults) { 663 PyErr_NoMemory(); 664 return NULL; 665 } 666 callresult = callresults; 667 } 668 /* step 3: figure out how large a buffer we need */ 669 for (f = format; *f; f++) { 670 if (*f == '%') { 671 const char* p = f; 672 width = 0; 673 while (ISDIGIT((unsigned)*f)) 674 width = (width*10) + *f++ - '0'; 675 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 676 ; 677 678 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 679 * they don't affect the amount of space we reserve. 
680 */ 681 if ((*f == 'l' || *f == 'z') && 682 (f[1] == 'd' || f[1] == 'u')) 683 ++f; 684 685 switch (*f) { 686 case 'c': 687 (void)va_arg(count, int); 688 /* fall through... */ 689 case '%': 690 n++; 691 break; 692 case 'd': case 'u': case 'i': case 'x': 693 (void) va_arg(count, int); 694 /* 20 bytes is enough to hold a 64-bit 695 integer. Decimal takes the most space. 696 This isn't enough for octal. 697 If a width is specified we need more 698 (which we allocate later). */ 699 if (width < 20) 700 width = 20; 701 n += width; 702 if (abuffersize < width) 703 abuffersize = width; 704 break; 705 case 's': 706 { 707 /* UTF-8 */ 708 unsigned char*s; 709 s = va_arg(count, unsigned char*); 710 while (*s) { 711 if (*s < 128) { 712 n++; s++; 713 } else if (*s < 0xc0) { 714 /* invalid UTF-8 */ 715 n++; s++; 716 } else if (*s < 0xc0) { 717 n++; 718 s++; if(!*s)break; 719 s++; 720 } else if (*s < 0xe0) { 721 n++; 722 s++; if(!*s)break; 723 s++; if(!*s)break; 724 s++; 725 } else { 726 #ifdef Py_UNICODE_WIDE 727 n++; 728 #else 729 n+=2; 730 #endif 731 s++; if(!*s)break; 732 s++; if(!*s)break; 733 s++; if(!*s)break; 734 s++; 735 } 736 } 737 break; 738 } 739 case 'U': 740 { 741 PyObject *obj = va_arg(count, PyObject *); 742 assert(obj && PyUnicode_Check(obj)); 743 n += PyUnicode_GET_SIZE(obj); 744 break; 745 } 746 case 'V': 747 { 748 PyObject *obj = va_arg(count, PyObject *); 749 const char *str = va_arg(count, const char *); 750 assert(obj || str); 751 assert(!obj || PyUnicode_Check(obj)); 752 if (obj) 753 n += PyUnicode_GET_SIZE(obj); 754 else 755 n += strlen(str); 756 break; 757 } 758 case 'S': 759 { 760 PyObject *obj = va_arg(count, PyObject *); 761 PyObject *str; 762 assert(obj); 763 str = PyObject_Str(obj); 764 if (!str) 765 goto fail; 766 n += PyUnicode_GET_SIZE(str); 767 /* Remember the str and switch to the next slot */ 768 *callresult++ = str; 769 break; 770 } 771 case 'R': 772 { 773 PyObject *obj = va_arg(count, PyObject *); 774 PyObject *repr; 775 assert(obj); 776 
repr = PyObject_Repr(obj); 777 if (!repr) 778 goto fail; 779 n += PyUnicode_GET_SIZE(repr); 780 /* Remember the repr and switch to the next slot */ 781 *callresult++ = repr; 782 break; 783 } 784 case 'A': 785 { 786 PyObject *obj = va_arg(count, PyObject *); 787 PyObject *ascii; 788 assert(obj); 789 ascii = PyObject_ASCII(obj); 790 if (!ascii) 791 goto fail; 792 n += PyUnicode_GET_SIZE(ascii); 793 /* Remember the repr and switch to the next slot */ 794 *callresult++ = ascii; 795 break; 796 } 797 case 'p': 798 (void) va_arg(count, int); 799 /* maximum 64-bit pointer representation: 800 * 0xffffffffffffffff 801 * so 19 characters is enough. 802 * XXX I count 18 -- what's the extra for? 803 */ 804 n += 19; 805 break; 806 default: 807 /* if we stumble upon an unknown 808 formatting code, copy the rest of 809 the format string to the output 810 string. (we cannot just skip the 811 code, since there's no way to know 812 what's in the argument list) */ 813 n += strlen(p); 814 goto expand; 815 } 816 } else 817 n++; 818 } 819 expand: 820 if (abuffersize > 20) { 821 abuffer = PyObject_Malloc(abuffersize); 822 if (!abuffer) { 823 PyErr_NoMemory(); 824 goto fail; 825 } 826 realbuffer = abuffer; 827 } 828 else 829 realbuffer = buffer; 830 /* step 4: fill the buffer */ 831 /* Since we've analyzed how much space we need for the worst case, 832 we don't have to resize the string. 833 There can be no errors beyond this point. 
*/ 834 string = PyUnicode_FromUnicode(NULL, n); 835 if (!string) 836 goto fail; 837 838 s = PyUnicode_AS_UNICODE(string); 839 callresult = callresults; 840 841 for (f = format; *f; f++) { 842 if (*f == '%') { 843 const char* p = f++; 844 int longflag = 0; 845 int size_tflag = 0; 846 zeropad = (*f == '0'); 847 /* parse the width.precision part */ 848 width = 0; 849 while (ISDIGIT((unsigned)*f)) 850 width = (width*10) + *f++ - '0'; 851 precision = 0; 852 if (*f == '.') { 853 f++; 854 while (ISDIGIT((unsigned)*f)) 855 precision = (precision*10) + *f++ - '0'; 856 } 857 /* handle the long flag, but only for %ld and %lu. 858 others can be added when necessary. */ 859 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 860 longflag = 1; 861 ++f; 862 } 863 /* handle the size_t flag. */ 864 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 865 size_tflag = 1; 866 ++f; 867 } 868 869 switch (*f) { 870 case 'c': 871 *s++ = va_arg(vargs, int); 872 break; 873 case 'd': 874 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 875 if (longflag) 876 sprintf(realbuffer, fmt, va_arg(vargs, long)); 877 else if (size_tflag) 878 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 879 else 880 sprintf(realbuffer, fmt, va_arg(vargs, int)); 881 appendstring(realbuffer); 882 break; 883 case 'u': 884 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 885 if (longflag) 886 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 887 else if (size_tflag) 888 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 889 else 890 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 891 appendstring(realbuffer); 892 break; 893 case 'i': 894 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 895 sprintf(realbuffer, fmt, va_arg(vargs, int)); 896 appendstring(realbuffer); 897 break; 898 case 'x': 899 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 900 sprintf(realbuffer, fmt, va_arg(vargs, int)); 901 appendstring(realbuffer); 902 break; 903 case 's': 904 { 905 /* Parameter 
must be UTF-8 encoded. 906 In case of encoding errors, use 907 the replacement character. */ 908 PyObject *u; 909 p = va_arg(vargs, char*); 910 u = PyUnicode_DecodeUTF8(p, strlen(p), 911 "replace"); 912 if (!u) 913 goto fail; 914 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 915 PyUnicode_GET_SIZE(u)); 916 s += PyUnicode_GET_SIZE(u); 917 Py_DECREF(u); 918 break; 919 } 920 case 'U': 921 { 922 PyObject *obj = va_arg(vargs, PyObject *); 923 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 924 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 925 s += size; 926 break; 927 } 928 case 'V': 929 { 930 PyObject *obj = va_arg(vargs, PyObject *); 931 const char *str = va_arg(vargs, const char *); 932 if (obj) { 933 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 935 s += size; 936 } else { 937 appendstring(str); 938 } 939 break; 940 } 941 case 'S': 942 case 'R': 943 { 944 Py_UNICODE *ucopy; 945 Py_ssize_t usize; 946 Py_ssize_t upos; 947 /* unused, since we already have the result */ 948 (void) va_arg(vargs, PyObject *); 949 ucopy = PyUnicode_AS_UNICODE(*callresult); 950 usize = PyUnicode_GET_SIZE(*callresult); 951 for (upos = 0; upos<usize;) 952 *s++ = ucopy[upos++]; 953 /* We're done with the unicode()/repr() => forget it */ 954 Py_DECREF(*callresult); 955 /* switch to next unicode()/repr() result */ 956 ++callresult; 957 break; 958 } 959 case 'p': 960 sprintf(buffer, "%p", va_arg(vargs, void*)); 961 /* %p is ill-defined: ensure leading 0x. 
*/ 962 if (buffer[1] == 'X') 963 buffer[1] = 'x'; 964 else if (buffer[1] != 'x') { 965 memmove(buffer+2, buffer, strlen(buffer)+1); 966 buffer[0] = '0'; 967 buffer[1] = 'x'; 968 } 969 appendstring(buffer); 970 break; 971 case '%': 972 *s++ = '%'; 973 break; 974 default: 975 appendstring(p); 976 goto end; 977 } 978 } else 979 *s++ = *f; 980 } 981 982 end: 983 if (callresults) 984 PyObject_Free(callresults); 985 if (abuffer) 986 PyObject_Free(abuffer); 987 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 988 return string; 989 fail: 990 if (callresults) { 991 PyObject **callresult2 = callresults; 992 while (callresult2 < callresult) { 993 Py_DECREF(*callresult2); 994 ++callresult2; 995 } 996 PyObject_Free(callresults); 997 } 998 if (abuffer) 999 PyObject_Free(abuffer); 1000 return NULL; 1001} 1002 1003#undef appendstring 1004 1005PyObject * 1006PyUnicode_FromFormat(const char *format, ...) 1007{ 1008 PyObject* ret; 1009 va_list vargs; 1010 1011#ifdef HAVE_STDARG_PROTOTYPES 1012 va_start(vargs, format); 1013#else 1014 va_start(vargs); 1015#endif 1016 ret = PyUnicode_FromFormatV(format, vargs); 1017 va_end(vargs); 1018 return ret; 1019} 1020 1021Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1022 wchar_t *w, 1023 Py_ssize_t size) 1024{ 1025 if (unicode == NULL) { 1026 PyErr_BadInternalCall(); 1027 return -1; 1028 } 1029 1030 /* If possible, try to copy the 0-termination as well */ 1031 if (size > PyUnicode_GET_SIZE(unicode)) 1032 size = PyUnicode_GET_SIZE(unicode) + 1; 1033 1034#ifdef HAVE_USABLE_WCHAR_T 1035 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1036#else 1037 { 1038 register Py_UNICODE *u; 1039 register Py_ssize_t i; 1040 u = PyUnicode_AS_UNICODE(unicode); 1041 for (i = size; i > 0; i--) 1042 *w++ = *u++; 1043 } 1044#endif 1045 1046 if (size > PyUnicode_GET_SIZE(unicode)) 1047 return PyUnicode_GET_SIZE(unicode); 1048 else 1049 return size; 1050} 1051 1052#endif 1053 1054PyObject *PyUnicode_FromOrdinal(int ordinal) 1055{ 1056 
Py_UNICODE s[2]; 1057 1058 if (ordinal < 0 || ordinal > 0x10ffff) { 1059 PyErr_SetString(PyExc_ValueError, 1060 "chr() arg not in range(0x110000)"); 1061 return NULL; 1062 } 1063 1064#ifndef Py_UNICODE_WIDE 1065 if (ordinal > 0xffff) { 1066 ordinal -= 0x10000; 1067 s[0] = 0xD800 | (ordinal >> 10); 1068 s[1] = 0xDC00 | (ordinal & 0x3FF); 1069 return PyUnicode_FromUnicode(s, 2); 1070 } 1071#endif 1072 1073 s[0] = (Py_UNICODE)ordinal; 1074 return PyUnicode_FromUnicode(s, 1); 1075} 1076 1077PyObject *PyUnicode_FromObject(register PyObject *obj) 1078{ 1079 /* XXX Perhaps we should make this API an alias of 1080 PyObject_Str() instead ?! */ 1081 if (PyUnicode_CheckExact(obj)) { 1082 Py_INCREF(obj); 1083 return obj; 1084 } 1085 if (PyUnicode_Check(obj)) { 1086 /* For a Unicode subtype that's not a Unicode object, 1087 return a true Unicode object with the same data. */ 1088 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1089 PyUnicode_GET_SIZE(obj)); 1090 } 1091 PyErr_Format(PyExc_TypeError, 1092 "Can't convert '%.100s' object to str implicitly", 1093 Py_TYPE(obj)->tp_name); 1094 return NULL; 1095} 1096 1097PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1098 const char *encoding, 1099 const char *errors) 1100{ 1101 const char *s = NULL; 1102 Py_ssize_t len; 1103 PyObject *v; 1104 1105 if (obj == NULL) { 1106 PyErr_BadInternalCall(); 1107 return NULL; 1108 } 1109 1110 if (PyUnicode_Check(obj)) { 1111 PyErr_SetString(PyExc_TypeError, 1112 "decoding str is not supported"); 1113 return NULL; 1114 } 1115 1116 /* Coerce object */ 1117 if (PyBytes_Check(obj)) { 1118 s = PyBytes_AS_STRING(obj); 1119 len = PyBytes_GET_SIZE(obj); 1120 } 1121 else if (PyByteArray_Check(obj)) { 1122 s = PyByteArray_AS_STRING(obj); 1123 len = PyByteArray_GET_SIZE(obj); 1124 } 1125 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1126 /* Overwrite the error message with something more useful in 1127 case of a TypeError. 
*/ 1128 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1129 PyErr_Format(PyExc_TypeError, 1130 "coercing to str: need string or buffer, " 1131 "%.80s found", 1132 Py_TYPE(obj)->tp_name); 1133 goto onError; 1134 } 1135 1136 /* Convert to Unicode */ 1137 if (len == 0) { 1138 Py_INCREF(unicode_empty); 1139 v = (PyObject *)unicode_empty; 1140 } 1141 else 1142 v = PyUnicode_Decode(s, len, encoding, errors); 1143 1144 return v; 1145 1146 onError: 1147 return NULL; 1148} 1149 1150PyObject *PyUnicode_Decode(const char *s, 1151 Py_ssize_t size, 1152 const char *encoding, 1153 const char *errors) 1154{ 1155 PyObject *buffer = NULL, *unicode; 1156 Py_buffer info; 1157 char lower[20]; /* Enough for any encoding name we recognize */ 1158 char *l; 1159 const char *e; 1160 1161 if (encoding == NULL) 1162 encoding = PyUnicode_GetDefaultEncoding(); 1163 1164 /* Convert encoding to lower case and replace '_' with '-' in order to 1165 catch e.g. UTF_8 */ 1166 e = encoding; 1167 l = lower; 1168 while (*e && l < &lower[(sizeof lower) - 2]) { 1169 if (ISUPPER(*e)) { 1170 *l++ = TOLOWER(*e++); 1171 } 1172 else if (*e == '_') { 1173 *l++ = '-'; 1174 e++; 1175 } 1176 else { 1177 *l++ = *e++; 1178 } 1179 } 1180 *l = '\0'; 1181 1182 /* Shortcuts for common default encodings */ 1183 if (strcmp(lower, "utf-8") == 0) 1184 return PyUnicode_DecodeUTF8(s, size, errors); 1185 else if ((strcmp(lower, "latin-1") == 0) || 1186 (strcmp(lower, "iso-8859-1") == 0)) 1187 return PyUnicode_DecodeLatin1(s, size, errors); 1188#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1189 else if (strcmp(lower, "mbcs") == 0) 1190 return PyUnicode_DecodeMBCS(s, size, errors); 1191#endif 1192 else if (strcmp(lower, "ascii") == 0) 1193 return PyUnicode_DecodeASCII(s, size, errors); 1194 else if (strcmp(lower, "utf-16") == 0) 1195 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1196 else if (strcmp(lower, "utf-32") == 0) 1197 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1198 1199 /* Decode via the codec 
registry */ 1200 buffer = NULL; 1201 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0) 1202 goto onError; 1203 buffer = PyMemoryView_FromMemory(&info); 1204 if (buffer == NULL) 1205 goto onError; 1206 unicode = PyCodec_Decode(buffer, encoding, errors); 1207 if (unicode == NULL) 1208 goto onError; 1209 if (!PyUnicode_Check(unicode)) { 1210 PyErr_Format(PyExc_TypeError, 1211 "decoder did not return a str object (type=%.400s)", 1212 Py_TYPE(unicode)->tp_name); 1213 Py_DECREF(unicode); 1214 goto onError; 1215 } 1216 Py_DECREF(buffer); 1217 return unicode; 1218 1219 onError: 1220 Py_XDECREF(buffer); 1221 return NULL; 1222} 1223 1224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1225 const char *encoding, 1226 const char *errors) 1227{ 1228 PyObject *v; 1229 1230 if (!PyUnicode_Check(unicode)) { 1231 PyErr_BadArgument(); 1232 goto onError; 1233 } 1234 1235 if (encoding == NULL) 1236 encoding = PyUnicode_GetDefaultEncoding(); 1237 1238 /* Decode via the codec registry */ 1239 v = PyCodec_Decode(unicode, encoding, errors); 1240 if (v == NULL) 1241 goto onError; 1242 return v; 1243 1244 onError: 1245 return NULL; 1246} 1247 1248PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1249 const char *encoding, 1250 const char *errors) 1251{ 1252 PyObject *v; 1253 1254 if (!PyUnicode_Check(unicode)) { 1255 PyErr_BadArgument(); 1256 goto onError; 1257 } 1258 1259 if (encoding == NULL) 1260 encoding = PyUnicode_GetDefaultEncoding(); 1261 1262 /* Decode via the codec registry */ 1263 v = PyCodec_Decode(unicode, encoding, errors); 1264 if (v == NULL) 1265 goto onError; 1266 if (!PyUnicode_Check(v)) { 1267 PyErr_Format(PyExc_TypeError, 1268 "decoder did not return a str object (type=%.400s)", 1269 Py_TYPE(v)->tp_name); 1270 Py_DECREF(v); 1271 goto onError; 1272 } 1273 return v; 1274 1275 onError: 1276 return NULL; 1277} 1278 1279PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1280 Py_ssize_t size, 1281 const char *encoding, 1282 const char *errors) 1283{ 
1284 PyObject *v, *unicode; 1285 1286 unicode = PyUnicode_FromUnicode(s, size); 1287 if (unicode == NULL) 1288 return NULL; 1289 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1290 Py_DECREF(unicode); 1291 return v; 1292} 1293 1294PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1295 const char *encoding, 1296 const char *errors) 1297{ 1298 PyObject *v; 1299 1300 if (!PyUnicode_Check(unicode)) { 1301 PyErr_BadArgument(); 1302 goto onError; 1303 } 1304 1305 if (encoding == NULL) 1306 encoding = PyUnicode_GetDefaultEncoding(); 1307 1308 /* Encode via the codec registry */ 1309 v = PyCodec_Encode(unicode, encoding, errors); 1310 if (v == NULL) 1311 goto onError; 1312 return v; 1313 1314 onError: 1315 return NULL; 1316} 1317 1318PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1319 const char *encoding, 1320 const char *errors) 1321{ 1322 PyObject *v; 1323 1324 if (!PyUnicode_Check(unicode)) { 1325 PyErr_BadArgument(); 1326 goto onError; 1327 } 1328 1329 if (encoding == NULL) 1330 encoding = PyUnicode_GetDefaultEncoding(); 1331 1332 /* Shortcuts for common default encodings */ 1333 if (errors == NULL) { 1334 if (strcmp(encoding, "utf-8") == 0) 1335 return PyUnicode_AsUTF8String(unicode); 1336 else if (strcmp(encoding, "latin-1") == 0) 1337 return PyUnicode_AsLatin1String(unicode); 1338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1339 else if (strcmp(encoding, "mbcs") == 0) 1340 return PyUnicode_AsMBCSString(unicode); 1341#endif 1342 else if (strcmp(encoding, "ascii") == 0) 1343 return PyUnicode_AsASCIIString(unicode); 1344 } 1345 1346 /* Encode via the codec registry */ 1347 v = PyCodec_Encode(unicode, encoding, errors); 1348 if (v == NULL) 1349 goto onError; 1350 if (PyByteArray_Check(v)) { 1351 char msg[100]; 1352 PyOS_snprintf(msg, sizeof(msg), 1353 "encoder %s returned buffer instead of bytes", 1354 encoding); 1355 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { 1356 v = NULL; 1357 goto onError; 1358 } 1359 v = 
PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1360 } 1361 else if (!PyBytes_Check(v)) { 1362 PyErr_Format(PyExc_TypeError, 1363 "encoder did not return a bytes object (type=%.400s)", 1364 Py_TYPE(v)->tp_name); 1365 v = NULL; 1366 } 1367 return v; 1368 1369 onError: 1370 return NULL; 1371} 1372 1373PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1374 const char *encoding, 1375 const char *errors) 1376{ 1377 PyObject *v; 1378 1379 if (!PyUnicode_Check(unicode)) { 1380 PyErr_BadArgument(); 1381 goto onError; 1382 } 1383 1384 if (encoding == NULL) 1385 encoding = PyUnicode_GetDefaultEncoding(); 1386 1387 /* Encode via the codec registry */ 1388 v = PyCodec_Encode(unicode, encoding, errors); 1389 if (v == NULL) 1390 goto onError; 1391 if (!PyUnicode_Check(v)) { 1392 PyErr_Format(PyExc_TypeError, 1393 "encoder did not return an str object (type=%.400s)", 1394 Py_TYPE(v)->tp_name); 1395 Py_DECREF(v); 1396 goto onError; 1397 } 1398 return v; 1399 1400 onError: 1401 return NULL; 1402} 1403 1404PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1405 const char *errors) 1406{ 1407 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1408 if (v) 1409 return v; 1410 if (errors != NULL) 1411 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1412 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1413 PyUnicode_GET_SIZE(unicode), 1414 NULL); 1415 if (!v) 1416 return NULL; 1417 ((PyUnicodeObject *)unicode)->defenc = v; 1418 return v; 1419} 1420 1421PyObject* 1422PyUnicode_DecodeFSDefault(const char *s) { 1423 Py_ssize_t size = (Py_ssize_t)strlen(s); 1424 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1425} 1426 1427PyObject* 1428PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1429{ 1430 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1431 can be undefined. If it is case, decode using UTF-8. 
The following assumes 1432 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1433 bootstrapping process where the codecs aren't ready yet. 1434 */ 1435 if (Py_FileSystemDefaultEncoding) { 1436#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1437 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1438 return PyUnicode_DecodeMBCS(s, size, "replace"); 1439 } 1440#elif defined(__APPLE__) 1441 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1442 return PyUnicode_DecodeUTF8(s, size, "replace"); 1443 } 1444#endif 1445 return PyUnicode_Decode(s, size, 1446 Py_FileSystemDefaultEncoding, 1447 "replace"); 1448 } 1449 else { 1450 return PyUnicode_DecodeUTF8(s, size, "replace"); 1451 } 1452} 1453 1454char* 1455_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1456{ 1457 PyObject *bytes; 1458 if (!PyUnicode_Check(unicode)) { 1459 PyErr_BadArgument(); 1460 return NULL; 1461 } 1462 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1463 if (bytes == NULL) 1464 return NULL; 1465 if (psize != NULL) 1466 *psize = PyBytes_GET_SIZE(bytes); 1467 return PyBytes_AS_STRING(bytes); 1468} 1469 1470char* 1471_PyUnicode_AsString(PyObject *unicode) 1472{ 1473 return _PyUnicode_AsStringAndSize(unicode, NULL); 1474} 1475 1476Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1477{ 1478 if (!PyUnicode_Check(unicode)) { 1479 PyErr_BadArgument(); 1480 goto onError; 1481 } 1482 return PyUnicode_AS_UNICODE(unicode); 1483 1484 onError: 1485 return NULL; 1486} 1487 1488Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1489{ 1490 if (!PyUnicode_Check(unicode)) { 1491 PyErr_BadArgument(); 1492 goto onError; 1493 } 1494 return PyUnicode_GET_SIZE(unicode); 1495 1496 onError: 1497 return -1; 1498} 1499 1500const char *PyUnicode_GetDefaultEncoding(void) 1501{ 1502 return unicode_default_encoding; 1503} 1504 1505int PyUnicode_SetDefaultEncoding(const char *encoding) 1506{ 1507 if (strcmp(encoding, unicode_default_encoding) != 0) { 1508 
PyErr_Format(PyExc_ValueError, 1509 "Can only set default encoding to %s", 1510 unicode_default_encoding); 1511 return -1; 1512 } 1513 return 0; 1514} 1515 1516/* error handling callback helper: 1517 build arguments, call the callback and check the arguments, 1518 if no exception occurred, copy the replacement to the output 1519 and adjust various state variables. 1520 return 0 on success, -1 on error 1521*/ 1522 1523static 1524int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1525 const char *encoding, const char *reason, 1526 const char **input, const char **inend, Py_ssize_t *startinpos, 1527 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1528 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1529{ 1530 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 1531 1532 PyObject *restuple = NULL; 1533 PyObject *repunicode = NULL; 1534 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1535 Py_ssize_t insize; 1536 Py_ssize_t requiredsize; 1537 Py_ssize_t newpos; 1538 Py_UNICODE *repptr; 1539 PyObject *inputobj = NULL; 1540 Py_ssize_t repsize; 1541 int res = -1; 1542 1543 if (*errorHandler == NULL) { 1544 *errorHandler = PyCodec_LookupError(errors); 1545 if (*errorHandler == NULL) 1546 goto onError; 1547 } 1548 1549 if (*exceptionObject == NULL) { 1550 *exceptionObject = PyUnicodeDecodeError_Create( 1551 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1552 if (*exceptionObject == NULL) 1553 goto onError; 1554 } 1555 else { 1556 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1557 goto onError; 1558 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1559 goto onError; 1560 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1561 goto onError; 1562 } 1563 1564 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1565 if (restuple == NULL) 1566 goto onError; 1567 if (!PyTuple_Check(restuple)) { 
1568 PyErr_Format(PyExc_TypeError, &argparse[4]); 1569 goto onError; 1570 } 1571 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1572 goto onError; 1573 1574 /* Copy back the bytes variables, which might have been modified by the 1575 callback */ 1576 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1577 if (!inputobj) 1578 goto onError; 1579 if (!PyBytes_Check(inputobj)) { 1580 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1581 } 1582 *input = PyBytes_AS_STRING(inputobj); 1583 insize = PyBytes_GET_SIZE(inputobj); 1584 *inend = *input + insize; 1585 /* we can DECREF safely, as the exception has another reference, 1586 so the object won't go away. */ 1587 Py_DECREF(inputobj); 1588 1589 if (newpos<0) 1590 newpos = insize+newpos; 1591 if (newpos<0 || newpos>insize) { 1592 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1593 goto onError; 1594 } 1595 1596 /* need more space? (at least enough for what we 1597 have+the replacement+the rest of the string (starting 1598 at the new input position), so we won't have to check space 1599 when there are no errors in the rest of the string) */ 1600 repptr = PyUnicode_AS_UNICODE(repunicode); 1601 repsize = PyUnicode_GET_SIZE(repunicode); 1602 requiredsize = *outpos + repsize + insize-newpos; 1603 if (requiredsize > outsize) { 1604 if (requiredsize<2*outsize) 1605 requiredsize = 2*outsize; 1606 if (PyUnicode_Resize(output, requiredsize) < 0) 1607 goto onError; 1608 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1609 } 1610 *endinpos = newpos; 1611 *inptr = *input + newpos; 1612 Py_UNICODE_COPY(*outptr, repptr, repsize); 1613 *outptr += repsize; 1614 *outpos += repsize; 1615 1616 /* we made it! 
*/ 1617 res = 0; 1618 1619 onError: 1620 Py_XDECREF(restuple); 1621 return res; 1622} 1623 1624/* --- UTF-7 Codec -------------------------------------------------------- */ 1625 1626/* see RFC2152 for details */ 1627 1628static 1629char utf7_special[128] = { 1630 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1631 encoded: 1632 0 - not special 1633 1 - special 1634 2 - whitespace (optional) 1635 3 - RFC2152 Set O (optional) */ 1636 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1637 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1638 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1640 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1642 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1643 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1644 1645}; 1646 1647/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1648 warnings about the comparison always being false; since 1649 utf7_special[0] is 1, we can safely make that one comparison 1650 true */ 1651 1652#define SPECIAL(c, encodeO, encodeWS) \ 1653 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1654 (encodeWS && (utf7_special[(c)] == 2)) || \ 1655 (encodeO && (utf7_special[(c)] == 3))) 1656 1657#define B64(n) \ 1658 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1659#define B64CHAR(c) \ 1660 (ISALNUM(c) || (c) == '+' || (c) == '/') 1661#define UB64(c) \ 1662 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1663 (c) - 71 : (c) >= 'A' ? 
(c) - 65 : (c) + 4 ) 1664 1665#define ENCODE(out, ch, bits) \ 1666 while (bits >= 6) { \ 1667 *out++ = B64(ch >> (bits-6)); \ 1668 bits -= 6; \ 1669 } 1670 1671#define DECODE(out, ch, bits, surrogate) \ 1672 while (bits >= 16) { \ 1673 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1674 bits -= 16; \ 1675 if (surrogate) { \ 1676 /* We have already generated an error for the high surrogate \ 1677 so let's not bother seeing if the low surrogate is correct or not */ \ 1678 surrogate = 0; \ 1679 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1680 /* This is a surrogate pair. Unfortunately we can't represent \ 1681 it in a 16-bit character */ \ 1682 surrogate = 1; \ 1683 errmsg = "code pairs are not supported"; \ 1684 goto utf7Error; \ 1685 } else { \ 1686 *out++ = outCh; \ 1687 } \ 1688 } 1689 1690PyObject *PyUnicode_DecodeUTF7(const char *s, 1691 Py_ssize_t size, 1692 const char *errors) 1693{ 1694 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1695} 1696 1697PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1698 Py_ssize_t size, 1699 const char *errors, 1700 Py_ssize_t *consumed) 1701{ 1702 const char *starts = s; 1703 Py_ssize_t startinpos; 1704 Py_ssize_t endinpos; 1705 Py_ssize_t outpos; 1706 const char *e; 1707 PyUnicodeObject *unicode; 1708 Py_UNICODE *p; 1709 const char *errmsg = ""; 1710 int inShift = 0; 1711 unsigned int bitsleft = 0; 1712 unsigned long charsleft = 0; 1713 int surrogate = 0; 1714 PyObject *errorHandler = NULL; 1715 PyObject *exc = NULL; 1716 1717 unicode = _PyUnicode_New(size); 1718 if (!unicode) 1719 return NULL; 1720 if (size == 0) { 1721 if (consumed) 1722 *consumed = 0; 1723 return (PyObject *)unicode; 1724 } 1725 1726 p = unicode->str; 1727 e = s + size; 1728 1729 while (s < e) { 1730 Py_UNICODE ch; 1731 restart: 1732 ch = (unsigned char) *s; 1733 1734 if (inShift) { 1735 if ((ch == '-') || !B64CHAR(ch)) { 1736 inShift = 0; 1737 s++; 1738 1739 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, 
charsleft, bitsleft, surrogate); 1740 if (bitsleft >= 6) { 1741 /* The shift sequence has a partial character in it. If 1742 bitsleft < 6 then we could just classify it as padding 1743 but that is not the case here */ 1744 1745 errmsg = "partial character in shift sequence"; 1746 goto utf7Error; 1747 } 1748 /* According to RFC2152 the remaining bits should be zero. We 1749 choose to signal an error/insert a replacement character 1750 here so indicate the potential of a misencoded character. */ 1751 1752 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1753 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1754 errmsg = "non-zero padding bits in shift sequence"; 1755 goto utf7Error; 1756 } 1757 1758 if (ch == '-') { 1759 if ((s < e) && (*(s) == '-')) { 1760 *p++ = '-'; 1761 inShift = 1; 1762 } 1763 } else if (SPECIAL(ch,0,0)) { 1764 errmsg = "unexpected special character"; 1765 goto utf7Error; 1766 } else { 1767 *p++ = ch; 1768 } 1769 } else { 1770 charsleft = (charsleft << 6) | UB64(ch); 1771 bitsleft += 6; 1772 s++; 1773 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1774 } 1775 } 1776 else if ( ch == '+' ) { 1777 startinpos = s-starts; 1778 s++; 1779 if (s < e && *s == '-') { 1780 s++; 1781 *p++ = '+'; 1782 } else 1783 { 1784 inShift = 1; 1785 bitsleft = 0; 1786 } 1787 } 1788 else if (SPECIAL(ch,0,0)) { 1789 startinpos = s-starts; 1790 errmsg = "unexpected special character"; 1791 s++; 1792 goto utf7Error; 1793 } 1794 else { 1795 *p++ = ch; 1796 s++; 1797 } 1798 continue; 1799 utf7Error: 1800 outpos = p-PyUnicode_AS_UNICODE(unicode); 1801 endinpos = s-starts; 1802 if (unicode_decode_call_errorhandler( 1803 errors, &errorHandler, 1804 "utf7", errmsg, 1805 &starts, &e, &startinpos, &endinpos, &exc, &s, 1806 (PyObject **)&unicode, &outpos, &p)) 1807 goto onError; 1808 } 1809 1810 if (inShift && !consumed) { 1811 outpos = p-PyUnicode_AS_UNICODE(unicode); 1812 endinpos = size; 1813 if 
(unicode_decode_call_errorhandler( 1814 errors, &errorHandler, 1815 "utf7", "unterminated shift sequence", 1816 &starts, &e, &startinpos, &endinpos, &exc, &s, 1817 (PyObject **)&unicode, &outpos, &p)) 1818 goto onError; 1819 if (s < e) 1820 goto restart; 1821 } 1822 if (consumed) { 1823 if(inShift) 1824 *consumed = startinpos; 1825 else 1826 *consumed = s-starts; 1827 } 1828 1829 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1830 goto onError; 1831 1832 Py_XDECREF(errorHandler); 1833 Py_XDECREF(exc); 1834 return (PyObject *)unicode; 1835 1836onError: 1837 Py_XDECREF(errorHandler); 1838 Py_XDECREF(exc); 1839 Py_DECREF(unicode); 1840 return NULL; 1841} 1842 1843 1844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1845 Py_ssize_t size, 1846 int encodeSetO, 1847 int encodeWhiteSpace, 1848 const char *errors) 1849{ 1850 PyObject *v, *result; 1851 /* It might be possible to tighten this worst case */ 1852 Py_ssize_t cbAllocated = 5 * size; 1853 int inShift = 0; 1854 Py_ssize_t i = 0; 1855 unsigned int bitsleft = 0; 1856 unsigned long charsleft = 0; 1857 char * out; 1858 char * start; 1859 1860 if (size == 0) 1861 return PyBytes_FromStringAndSize(NULL, 0); 1862 1863 v = PyByteArray_FromStringAndSize(NULL, cbAllocated); 1864 if (v == NULL) 1865 return NULL; 1866 1867 start = out = PyByteArray_AS_STRING(v); 1868 for (;i < size; ++i) { 1869 Py_UNICODE ch = s[i]; 1870 1871 if (!inShift) { 1872 if (ch == '+') { 1873 *out++ = '+'; 1874 *out++ = '-'; 1875 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1876 charsleft = ch; 1877 bitsleft = 16; 1878 *out++ = '+'; 1879 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1880 inShift = bitsleft > 0; 1881 } else { 1882 *out++ = (char) ch; 1883 } 1884 } else { 1885 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1886 *out++ = B64(charsleft << (6-bitsleft)); 1887 charsleft = 0; 1888 bitsleft = 0; 1889 /* Characters not in the BASE64 set implicitly unshift the sequence 1890 so no 
'-' is required, except if the character is itself a '-' */ 1891 if (B64CHAR(ch) || ch == '-') { 1892 *out++ = '-'; 1893 } 1894 inShift = 0; 1895 *out++ = (char) ch; 1896 } else { 1897 bitsleft += 16; 1898 charsleft = (charsleft << 16) | ch; 1899 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1900 1901 /* If the next character is special then we dont' need to terminate 1902 the shift sequence. If the next character is not a BASE64 character 1903 or '-' then the shift sequence will be terminated implicitly and we 1904 don't have to insert a '-'. */ 1905 1906 if (bitsleft == 0) { 1907 if (i + 1 < size) { 1908 Py_UNICODE ch2 = s[i+1]; 1909 1910 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1911 1912 } else if (B64CHAR(ch2) || ch2 == '-') { 1913 *out++ = '-'; 1914 inShift = 0; 1915 } else { 1916 inShift = 0; 1917 } 1918 1919 } 1920 else { 1921 *out++ = '-'; 1922 inShift = 0; 1923 } 1924 } 1925 } 1926 } 1927 } 1928 if (bitsleft) { 1929 *out++= B64(charsleft << (6-bitsleft) ); 1930 *out++ = '-'; 1931 } 1932 1933 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start); 1934 Py_DECREF(v); 1935 return result; 1936} 1937 1938#undef SPECIAL 1939#undef B64 1940#undef B64CHAR 1941#undef UB64 1942#undef ENCODE 1943#undef DECODE 1944 1945/* --- UTF-8 Codec -------------------------------------------------------- */ 1946 1947static 1948char utf8_code_length[256] = { 1949 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1950 illegal prefix. 
see RFC 2279 for details */ 1951 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1952 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1953 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1958 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1959 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1960 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1962 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1963 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1964 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1965 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1966 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1967}; 1968 1969PyObject *PyUnicode_DecodeUTF8(const char *s, 1970 Py_ssize_t size, 1971 const char *errors) 1972{ 1973 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1974} 1975 1976PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1977 Py_ssize_t size, 1978 const char *errors, 1979 Py_ssize_t *consumed) 1980{ 1981 const char *starts = s; 1982 int n; 1983 Py_ssize_t startinpos; 1984 Py_ssize_t endinpos; 1985 Py_ssize_t outpos; 1986 const char *e; 1987 PyUnicodeObject *unicode; 1988 Py_UNICODE *p; 1989 const char *errmsg = ""; 1990 PyObject *errorHandler = NULL; 1991 PyObject *exc = NULL; 1992 1993 /* Note: size will always be longer than the resulting Unicode 1994 character count */ 1995 unicode = _PyUnicode_New(size); 1996 if (!unicode) 1997 return NULL; 1998 if (size == 0) { 1999 if (consumed) 2000 *consumed = 0; 2001 return (PyObject *)unicode; 2002 } 2003 2004 /* Unpack UTF-8 encoded data */ 2005 p = unicode->str; 2006 e = s + size; 2007 2008 while (s < e) { 2009 Py_UCS4 ch = (unsigned char)*s; 2010 2011 if (ch < 0x80) { 2012 *p++ = (Py_UNICODE)ch; 2013 s++; 2014 continue; 2015 } 2016 
2017 n = utf8_code_length[ch]; 2018 2019 if (s + n > e) { 2020 if (consumed) 2021 break; 2022 else { 2023 errmsg = "unexpected end of data"; 2024 startinpos = s-starts; 2025 endinpos = size; 2026 goto utf8Error; 2027 } 2028 } 2029 2030 switch (n) { 2031 2032 case 0: 2033 errmsg = "unexpected code byte"; 2034 startinpos = s-starts; 2035 endinpos = startinpos+1; 2036 goto utf8Error; 2037 2038 case 1: 2039 errmsg = "internal error"; 2040 startinpos = s-starts; 2041 endinpos = startinpos+1; 2042 goto utf8Error; 2043 2044 case 2: 2045 if ((s[1] & 0xc0) != 0x80) { 2046 errmsg = "invalid data"; 2047 startinpos = s-starts; 2048 endinpos = startinpos+2; 2049 goto utf8Error; 2050 } 2051 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2052 if (ch < 0x80) { 2053 startinpos = s-starts; 2054 endinpos = startinpos+2; 2055 errmsg = "illegal encoding"; 2056 goto utf8Error; 2057 } 2058 else 2059 *p++ = (Py_UNICODE)ch; 2060 break; 2061 2062 case 3: 2063 if ((s[1] & 0xc0) != 0x80 || 2064 (s[2] & 0xc0) != 0x80) { 2065 errmsg = "invalid data"; 2066 startinpos = s-starts; 2067 endinpos = startinpos+3; 2068 goto utf8Error; 2069 } 2070 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2071 if (ch < 0x0800) { 2072 /* Note: UTF-8 encodings of surrogates are considered 2073 legal UTF-8 sequences; 2074 2075 XXX For wide builds (UCS-4) we should probably try 2076 to recombine the surrogates into a single code 2077 unit. 
2078 */ 2079 errmsg = "illegal encoding"; 2080 startinpos = s-starts; 2081 endinpos = startinpos+3; 2082 goto utf8Error; 2083 } 2084 else 2085 *p++ = (Py_UNICODE)ch; 2086 break; 2087 2088 case 4: 2089 if ((s[1] & 0xc0) != 0x80 || 2090 (s[2] & 0xc0) != 0x80 || 2091 (s[3] & 0xc0) != 0x80) { 2092 errmsg = "invalid data"; 2093 startinpos = s-starts; 2094 endinpos = startinpos+4; 2095 goto utf8Error; 2096 } 2097 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2098 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2099 /* validate and convert to UTF-16 */ 2100 if ((ch < 0x10000) /* minimum value allowed for 4 2101 byte encoding */ 2102 || (ch > 0x10ffff)) /* maximum value allowed for 2103 UTF-16 */ 2104 { 2105 errmsg = "illegal encoding"; 2106 startinpos = s-starts; 2107 endinpos = startinpos+4; 2108 goto utf8Error; 2109 } 2110#ifdef Py_UNICODE_WIDE 2111 *p++ = (Py_UNICODE)ch; 2112#else 2113 /* compute and append the two surrogates: */ 2114 2115 /* translate from 10000..10FFFF to 0..FFFF */ 2116 ch -= 0x10000; 2117 2118 /* high surrogate = top 10 bits added to D800 */ 2119 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2120 2121 /* low surrogate = bottom 10 bits added to DC00 */ 2122 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2123#endif 2124 break; 2125 2126 default: 2127 /* Other sizes are only needed for UCS-4 */ 2128 errmsg = "unsupported Unicode code range"; 2129 startinpos = s-starts; 2130 endinpos = startinpos+n; 2131 goto utf8Error; 2132 } 2133 s += n; 2134 continue; 2135 2136 utf8Error: 2137 outpos = p-PyUnicode_AS_UNICODE(unicode); 2138 if (unicode_decode_call_errorhandler( 2139 errors, &errorHandler, 2140 "utf8", errmsg, 2141 &starts, &e, &startinpos, &endinpos, &exc, &s, 2142 (PyObject **)&unicode, &outpos, &p)) 2143 goto onError; 2144 } 2145 if (consumed) 2146 *consumed = s-starts; 2147 2148 /* Adjust length */ 2149 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2150 goto onError; 2151 2152 Py_XDECREF(errorHandler); 2153 Py_XDECREF(exc); 2154 return (PyObject 
*)unicode; 2155 2156onError: 2157 Py_XDECREF(errorHandler); 2158 Py_XDECREF(exc); 2159 Py_DECREF(unicode); 2160 return NULL; 2161} 2162 2163/* Allocation strategy: if the string is short, convert into a stack buffer 2164 and allocate exactly as much space needed at the end. Else allocate the 2165 maximum possible needed (4 result bytes per Unicode character), and return 2166 the excess memory at the end. 2167*/ 2168PyObject * 2169PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2170 Py_ssize_t size, 2171 const char *errors) 2172{ 2173#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2174 2175 Py_ssize_t i; /* index into s of next input byte */ 2176 PyObject *result; /* result string object */ 2177 char *p; /* next free byte in output buffer */ 2178 Py_ssize_t nallocated; /* number of result bytes allocated */ 2179 Py_ssize_t nneeded; /* number of result bytes needed */ 2180 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2181 2182 assert(s != NULL); 2183 assert(size >= 0); 2184 2185 if (size <= MAX_SHORT_UNICHARS) { 2186 /* Write into the stack buffer; nallocated can't overflow. 2187 * At the end, we'll allocate exactly as much heap space as it 2188 * turns out we need. 2189 */ 2190 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2191 result = NULL; /* will allocate after we're done */ 2192 p = stackbuf; 2193 } 2194 else { 2195 /* Overallocate on the heap, and give the excess back at the end. */ 2196 nallocated = size * 4; 2197 if (nallocated / 4 != size) /* overflow! 
*/ 2198 return PyErr_NoMemory(); 2199 result = PyBytes_FromStringAndSize(NULL, nallocated); 2200 if (result == NULL) 2201 return NULL; 2202 p = PyBytes_AS_STRING(result); 2203 } 2204 2205 for (i = 0; i < size;) { 2206 Py_UCS4 ch = s[i++]; 2207 2208 if (ch < 0x80) 2209 /* Encode ASCII */ 2210 *p++ = (char) ch; 2211 2212 else if (ch < 0x0800) { 2213 /* Encode Latin-1 */ 2214 *p++ = (char)(0xc0 | (ch >> 6)); 2215 *p++ = (char)(0x80 | (ch & 0x3f)); 2216 } 2217 else { 2218 /* Encode UCS2 Unicode ordinals */ 2219 if (ch < 0x10000) { 2220 /* Special case: check for high surrogate */ 2221 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2222 Py_UCS4 ch2 = s[i]; 2223 /* Check for low surrogate and combine the two to 2224 form a UCS4 value */ 2225 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2226 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2227 i++; 2228 goto encodeUCS4; 2229 } 2230 /* Fall through: handles isolated high surrogates */ 2231 } 2232 *p++ = (char)(0xe0 | (ch >> 12)); 2233 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2234 *p++ = (char)(0x80 | (ch & 0x3f)); 2235 continue; 2236 } 2237encodeUCS4: 2238 /* Encode UCS4 Unicode ordinals */ 2239 *p++ = (char)(0xf0 | (ch >> 18)); 2240 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2241 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2242 *p++ = (char)(0x80 | (ch & 0x3f)); 2243 } 2244 } 2245 2246 if (result == NULL) { 2247 /* This was stack allocated. */ 2248 nneeded = p - stackbuf; 2249 assert(nneeded <= nallocated); 2250 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2251 } 2252 else { 2253 /* Cut back to size actually needed. 
*/ 2254 nneeded = p - PyBytes_AS_STRING(result); 2255 assert(nneeded <= nallocated); 2256 _PyBytes_Resize(&result, nneeded); 2257 } 2258 return result; 2259 2260#undef MAX_SHORT_UNICHARS 2261} 2262 2263PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2264{ 2265 if (!PyUnicode_Check(unicode)) { 2266 PyErr_BadArgument(); 2267 return NULL; 2268 } 2269 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2270 PyUnicode_GET_SIZE(unicode), 2271 NULL); 2272} 2273 2274/* --- UTF-32 Codec ------------------------------------------------------- */ 2275 2276PyObject * 2277PyUnicode_DecodeUTF32(const char *s, 2278 Py_ssize_t size, 2279 const char *errors, 2280 int *byteorder) 2281{ 2282 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2283} 2284 2285PyObject * 2286PyUnicode_DecodeUTF32Stateful(const char *s, 2287 Py_ssize_t size, 2288 const char *errors, 2289 int *byteorder, 2290 Py_ssize_t *consumed) 2291{ 2292 const char *starts = s; 2293 Py_ssize_t startinpos; 2294 Py_ssize_t endinpos; 2295 Py_ssize_t outpos; 2296 PyUnicodeObject *unicode; 2297 Py_UNICODE *p; 2298#ifndef Py_UNICODE_WIDE 2299 int i, pairs; 2300#else 2301 const int pairs = 0; 2302#endif 2303 const unsigned char *q, *e; 2304 int bo = 0; /* assume native ordering by default */ 2305 const char *errmsg = ""; 2306 /* Offsets from q for retrieving bytes in the right order. */ 2307#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2308 int iorder[] = {0, 1, 2, 3}; 2309#else 2310 int iorder[] = {3, 2, 1, 0}; 2311#endif 2312 PyObject *errorHandler = NULL; 2313 PyObject *exc = NULL; 2314 /* On narrow builds we split characters outside the BMP into two 2315 codepoints => count how much extra space we need. 
*/ 2316#ifndef Py_UNICODE_WIDE 2317 for (i = pairs = 0; i < size/4; i++) 2318 if (((Py_UCS4 *)s)[i] >= 0x10000) 2319 pairs++; 2320#endif 2321 2322 /* This might be one to much, because of a BOM */ 2323 unicode = _PyUnicode_New((size+3)/4+pairs); 2324 if (!unicode) 2325 return NULL; 2326 if (size == 0) 2327 return (PyObject *)unicode; 2328 2329 /* Unpack UTF-32 encoded data */ 2330 p = unicode->str; 2331 q = (unsigned char *)s; 2332 e = q + size; 2333 2334 if (byteorder) 2335 bo = *byteorder; 2336 2337 /* Check for BOM marks (U+FEFF) in the input and adjust current 2338 byte order setting accordingly. In native mode, the leading BOM 2339 mark is skipped, in all other modes, it is copied to the output 2340 stream as-is (giving a ZWNBSP character). */ 2341 if (bo == 0) { 2342 if (size >= 4) { 2343 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2344 (q[iorder[1]] << 8) | q[iorder[0]]; 2345#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2346 if (bom == 0x0000FEFF) { 2347 q += 4; 2348 bo = -1; 2349 } 2350 else if (bom == 0xFFFE0000) { 2351 q += 4; 2352 bo = 1; 2353 } 2354#else 2355 if (bom == 0x0000FEFF) { 2356 q += 4; 2357 bo = 1; 2358 } 2359 else if (bom == 0xFFFE0000) { 2360 q += 4; 2361 bo = -1; 2362 } 2363#endif 2364 } 2365 } 2366 2367 if (bo == -1) { 2368 /* force LE */ 2369 iorder[0] = 0; 2370 iorder[1] = 1; 2371 iorder[2] = 2; 2372 iorder[3] = 3; 2373 } 2374 else if (bo == 1) { 2375 /* force BE */ 2376 iorder[0] = 3; 2377 iorder[1] = 2; 2378 iorder[2] = 1; 2379 iorder[3] = 0; 2380 } 2381 2382 while (q < e) { 2383 Py_UCS4 ch; 2384 /* remaining bytes at the end? 
(size should be divisible by 4) */ 2385 if (e-q<4) { 2386 if (consumed) 2387 break; 2388 errmsg = "truncated data"; 2389 startinpos = ((const char *)q)-starts; 2390 endinpos = ((const char *)e)-starts; 2391 goto utf32Error; 2392 /* The remaining input chars are ignored if the callback 2393 chooses to skip the input */ 2394 } 2395 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2396 (q[iorder[1]] << 8) | q[iorder[0]]; 2397 2398 if (ch >= 0x110000) 2399 { 2400 errmsg = "codepoint not in range(0x110000)"; 2401 startinpos = ((const char *)q)-starts; 2402 endinpos = startinpos+4; 2403 goto utf32Error; 2404 } 2405#ifndef Py_UNICODE_WIDE 2406 if (ch >= 0x10000) 2407 { 2408 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2409 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2410 } 2411 else 2412#endif 2413 *p++ = ch; 2414 q += 4; 2415 continue; 2416 utf32Error: 2417 outpos = p-PyUnicode_AS_UNICODE(unicode); 2418 if (unicode_decode_call_errorhandler( 2419 errors, &errorHandler, 2420 "utf32", errmsg, 2421 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2422 (PyObject **)&unicode, &outpos, &p)) 2423 goto onError; 2424 } 2425 2426 if (byteorder) 2427 *byteorder = bo; 2428 2429 if (consumed) 2430 *consumed = (const char *)q-starts; 2431 2432 /* Adjust length */ 2433 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2434 goto onError; 2435 2436 Py_XDECREF(errorHandler); 2437 Py_XDECREF(exc); 2438 return (PyObject *)unicode; 2439 2440onError: 2441 Py_DECREF(unicode); 2442 Py_XDECREF(errorHandler); 2443 Py_XDECREF(exc); 2444 return NULL; 2445} 2446 2447PyObject * 2448PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2449 Py_ssize_t size, 2450 const char *errors, 2451 int byteorder) 2452{ 2453 PyObject *v, *result; 2454 unsigned char *p; 2455#ifndef Py_UNICODE_WIDE 2456 int i, pairs; 2457#else 2458 const int pairs = 0; 2459#endif 2460 /* Offsets from p for storing byte pairs in the right order. 
*/ 2461#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2462 int iorder[] = {0, 1, 2, 3}; 2463#else 2464 int iorder[] = {3, 2, 1, 0}; 2465#endif 2466 2467#define STORECHAR(CH) \ 2468 do { \ 2469 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2470 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2471 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2472 p[iorder[0]] = (CH) & 0xff; \ 2473 p += 4; \ 2474 } while(0) 2475 2476 /* In narrow builds we can output surrogate pairs as one codepoint, 2477 so we need less space. */ 2478#ifndef Py_UNICODE_WIDE 2479 for (i = pairs = 0; i < size-1; i++) 2480 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2481 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2482 pairs++; 2483#endif 2484 v = PyByteArray_FromStringAndSize(NULL, 2485 4 * (size - pairs + (byteorder == 0))); 2486 if (v == NULL) 2487 return NULL; 2488 2489 p = (unsigned char *)PyByteArray_AS_STRING(v); 2490 if (byteorder == 0) 2491 STORECHAR(0xFEFF); 2492 if (size == 0) 2493 goto done; 2494 2495 if (byteorder == -1) { 2496 /* force LE */ 2497 iorder[0] = 0; 2498 iorder[1] = 1; 2499 iorder[2] = 2; 2500 iorder[3] = 3; 2501 } 2502 else if (byteorder == 1) { 2503 /* force BE */ 2504 iorder[0] = 3; 2505 iorder[1] = 2; 2506 iorder[2] = 1; 2507 iorder[3] = 0; 2508 } 2509 2510 while (size-- > 0) { 2511 Py_UCS4 ch = *s++; 2512#ifndef Py_UNICODE_WIDE 2513 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2514 Py_UCS4 ch2 = *s; 2515 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2516 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2517 s++; 2518 size--; 2519 } 2520 } 2521#endif 2522 STORECHAR(ch); 2523 } 2524 2525 done: 2526 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2527 Py_DECREF(v); 2528 return result; 2529#undef STORECHAR 2530} 2531 2532PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2533{ 2534 if (!PyUnicode_Check(unicode)) { 2535 PyErr_BadArgument(); 2536 return NULL; 2537 } 2538 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2539 PyUnicode_GET_SIZE(unicode), 2540 NULL, 2541 0); 2542} 2543 2544/* 
--- UTF-16 Codec ------------------------------------------------------- */ 2545 2546PyObject * 2547PyUnicode_DecodeUTF16(const char *s, 2548 Py_ssize_t size, 2549 const char *errors, 2550 int *byteorder) 2551{ 2552 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2553} 2554 2555PyObject * 2556PyUnicode_DecodeUTF16Stateful(const char *s, 2557 Py_ssize_t size, 2558 const char *errors, 2559 int *byteorder, 2560 Py_ssize_t *consumed) 2561{ 2562 const char *starts = s; 2563 Py_ssize_t startinpos; 2564 Py_ssize_t endinpos; 2565 Py_ssize_t outpos; 2566 PyUnicodeObject *unicode; 2567 Py_UNICODE *p; 2568 const unsigned char *q, *e; 2569 int bo = 0; /* assume native ordering by default */ 2570 const char *errmsg = ""; 2571 /* Offsets from q for retrieving byte pairs in the right order. */ 2572#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2573 int ihi = 1, ilo = 0; 2574#else 2575 int ihi = 0, ilo = 1; 2576#endif 2577 PyObject *errorHandler = NULL; 2578 PyObject *exc = NULL; 2579 2580 /* Note: size will always be longer than the resulting Unicode 2581 character count */ 2582 unicode = _PyUnicode_New(size); 2583 if (!unicode) 2584 return NULL; 2585 if (size == 0) 2586 return (PyObject *)unicode; 2587 2588 /* Unpack UTF-16 encoded data */ 2589 p = unicode->str; 2590 q = (unsigned char *)s; 2591 e = q + size; 2592 2593 if (byteorder) 2594 bo = *byteorder; 2595 2596 /* Check for BOM marks (U+FEFF) in the input and adjust current 2597 byte order setting accordingly. In native mode, the leading BOM 2598 mark is skipped, in all other modes, it is copied to the output 2599 stream as-is (giving a ZWNBSP character). 
*/ 2600 if (bo == 0) { 2601 if (size >= 2) { 2602 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2603#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2604 if (bom == 0xFEFF) { 2605 q += 2; 2606 bo = -1; 2607 } 2608 else if (bom == 0xFFFE) { 2609 q += 2; 2610 bo = 1; 2611 } 2612#else 2613 if (bom == 0xFEFF) { 2614 q += 2; 2615 bo = 1; 2616 } 2617 else if (bom == 0xFFFE) { 2618 q += 2; 2619 bo = -1; 2620 } 2621#endif 2622 } 2623 } 2624 2625 if (bo == -1) { 2626 /* force LE */ 2627 ihi = 1; 2628 ilo = 0; 2629 } 2630 else if (bo == 1) { 2631 /* force BE */ 2632 ihi = 0; 2633 ilo = 1; 2634 } 2635 2636 while (q < e) { 2637 Py_UNICODE ch; 2638 /* remaining bytes at the end? (size should be even) */ 2639 if (e-q<2) { 2640 if (consumed) 2641 break; 2642 errmsg = "truncated data"; 2643 startinpos = ((const char *)q)-starts; 2644 endinpos = ((const char *)e)-starts; 2645 goto utf16Error; 2646 /* The remaining input chars are ignored if the callback 2647 chooses to skip the input */ 2648 } 2649 ch = (q[ihi] << 8) | q[ilo]; 2650 2651 q += 2; 2652 2653 if (ch < 0xD800 || ch > 0xDFFF) { 2654 *p++ = ch; 2655 continue; 2656 } 2657 2658 /* UTF-16 code pair: */ 2659 if (q >= e) { 2660 errmsg = "unexpected end of data"; 2661 startinpos = (((const char *)q)-2)-starts; 2662 endinpos = ((const char *)e)-starts; 2663 goto utf16Error; 2664 } 2665 if (0xD800 <= ch && ch <= 0xDBFF) { 2666 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2667 q += 2; 2668 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2669#ifndef Py_UNICODE_WIDE 2670 *p++ = ch; 2671 *p++ = ch2; 2672#else 2673 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2674#endif 2675 continue; 2676 } 2677 else { 2678 errmsg = "illegal UTF-16 surrogate"; 2679 startinpos = (((const char *)q)-4)-starts; 2680 endinpos = startinpos+2; 2681 goto utf16Error; 2682 } 2683 2684 } 2685 errmsg = "illegal encoding"; 2686 startinpos = (((const char *)q)-2)-starts; 2687 endinpos = startinpos+2; 2688 /* Fall through to report the error */ 2689 2690 utf16Error: 2691 outpos = 
p-PyUnicode_AS_UNICODE(unicode); 2692 if (unicode_decode_call_errorhandler( 2693 errors, &errorHandler, 2694 "utf16", errmsg, 2695 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2696 (PyObject **)&unicode, &outpos, &p)) 2697 goto onError; 2698 } 2699 2700 if (byteorder) 2701 *byteorder = bo; 2702 2703 if (consumed) 2704 *consumed = (const char *)q-starts; 2705 2706 /* Adjust length */ 2707 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2708 goto onError; 2709 2710 Py_XDECREF(errorHandler); 2711 Py_XDECREF(exc); 2712 return (PyObject *)unicode; 2713 2714onError: 2715 Py_DECREF(unicode); 2716 Py_XDECREF(errorHandler); 2717 Py_XDECREF(exc); 2718 return NULL; 2719} 2720 2721PyObject * 2722PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2723 Py_ssize_t size, 2724 const char *errors, 2725 int byteorder) 2726{ 2727 PyObject *v, *result; 2728 unsigned char *p; 2729#ifdef Py_UNICODE_WIDE 2730 int i, pairs; 2731#else 2732 const int pairs = 0; 2733#endif 2734 /* Offsets from p for storing byte pairs in the right order. 
*/ 2735#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2736 int ihi = 1, ilo = 0; 2737#else 2738 int ihi = 0, ilo = 1; 2739#endif 2740 2741#define STORECHAR(CH) \ 2742 do { \ 2743 p[ihi] = ((CH) >> 8) & 0xff; \ 2744 p[ilo] = (CH) & 0xff; \ 2745 p += 2; \ 2746 } while(0) 2747 2748#ifdef Py_UNICODE_WIDE 2749 for (i = pairs = 0; i < size; i++) 2750 if (s[i] >= 0x10000) 2751 pairs++; 2752#endif 2753 v = PyByteArray_FromStringAndSize(NULL, 2754 2 * (size + pairs + (byteorder == 0))); 2755 if (v == NULL) 2756 return NULL; 2757 2758 p = (unsigned char *)PyByteArray_AS_STRING(v); 2759 if (byteorder == 0) 2760 STORECHAR(0xFEFF); 2761 if (size == 0) 2762 goto done; 2763 2764 if (byteorder == -1) { 2765 /* force LE */ 2766 ihi = 1; 2767 ilo = 0; 2768 } 2769 else if (byteorder == 1) { 2770 /* force BE */ 2771 ihi = 0; 2772 ilo = 1; 2773 } 2774 2775 while (size-- > 0) { 2776 Py_UNICODE ch = *s++; 2777 Py_UNICODE ch2 = 0; 2778#ifdef Py_UNICODE_WIDE 2779 if (ch >= 0x10000) { 2780 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2781 ch = 0xD800 | ((ch-0x10000) >> 10); 2782 } 2783#endif 2784 STORECHAR(ch); 2785 if (ch2) 2786 STORECHAR(ch2); 2787 } 2788 2789 done: 2790 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2791 Py_DECREF(v); 2792 return result; 2793#undef STORECHAR 2794} 2795 2796PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2797{ 2798 if (!PyUnicode_Check(unicode)) { 2799 PyErr_BadArgument(); 2800 return NULL; 2801 } 2802 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2803 PyUnicode_GET_SIZE(unicode), 2804 NULL, 2805 0); 2806} 2807 2808/* --- Unicode Escape Codec ----------------------------------------------- */ 2809 2810static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2811 2812PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2813 Py_ssize_t size, 2814 const char *errors) 2815{ 2816 const char *starts = s; 2817 Py_ssize_t startinpos; 2818 Py_ssize_t endinpos; 2819 Py_ssize_t outpos; 2820 int i; 2821 PyUnicodeObject *v; 2822 Py_UNICODE *p; 
2823 const char *end; 2824 char* message; 2825 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2826 PyObject *errorHandler = NULL; 2827 PyObject *exc = NULL; 2828 2829 /* Escaped strings will always be longer than the resulting 2830 Unicode string, so we start with size here and then reduce the 2831 length after conversion to the true value. 2832 (but if the error callback returns a long replacement string 2833 we'll have to allocate more space) */ 2834 v = _PyUnicode_New(size); 2835 if (v == NULL) 2836 goto onError; 2837 if (size == 0) 2838 return (PyObject *)v; 2839 2840 p = PyUnicode_AS_UNICODE(v); 2841 end = s + size; 2842 2843 while (s < end) { 2844 unsigned char c; 2845 Py_UNICODE x; 2846 int digits; 2847 2848 /* Non-escape characters are interpreted as Unicode ordinals */ 2849 if (*s != '\\') { 2850 *p++ = (unsigned char) *s++; 2851 continue; 2852 } 2853 2854 startinpos = s-starts; 2855 /* \ - Escapes */ 2856 s++; 2857 c = *s++; 2858 if (s > end) 2859 c = '\0'; /* Invalid after \ */ 2860 switch (c) { 2861 2862 /* \x escapes */ 2863 case '\n': break; 2864 case '\\': *p++ = '\\'; break; 2865 case '\'': *p++ = '\''; break; 2866 case '\"': *p++ = '\"'; break; 2867 case 'b': *p++ = '\b'; break; 2868 case 'f': *p++ = '\014'; break; /* FF */ 2869 case 't': *p++ = '\t'; break; 2870 case 'n': *p++ = '\n'; break; 2871 case 'r': *p++ = '\r'; break; 2872 case 'v': *p++ = '\013'; break; /* VT */ 2873 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2874 2875 /* \OOO (octal) escapes */ 2876 case '0': case '1': case '2': case '3': 2877 case '4': case '5': case '6': case '7': 2878 x = s[-1] - '0'; 2879 if (s < end && '0' <= *s && *s <= '7') { 2880 x = (x<<3) + *s++ - '0'; 2881 if (s < end && '0' <= *s && *s <= '7') 2882 x = (x<<3) + *s++ - '0'; 2883 } 2884 *p++ = x; 2885 break; 2886 2887 /* hex escapes */ 2888 /* \xXX */ 2889 case 'x': 2890 digits = 2; 2891 message = "truncated \\xXX escape"; 2892 goto hexescape; 2893 2894 /* \uXXXX */ 2895 case 'u': 
2896 digits = 4; 2897 message = "truncated \\uXXXX escape"; 2898 goto hexescape; 2899 2900 /* \UXXXXXXXX */ 2901 case 'U': 2902 digits = 8; 2903 message = "truncated \\UXXXXXXXX escape"; 2904 hexescape: 2905 chr = 0; 2906 outpos = p-PyUnicode_AS_UNICODE(v); 2907 if (s+digits>end) { 2908 endinpos = size; 2909 if (unicode_decode_call_errorhandler( 2910 errors, &errorHandler, 2911 "unicodeescape", "end of string in escape sequence", 2912 &starts, &end, &startinpos, &endinpos, &exc, &s, 2913 (PyObject **)&v, &outpos, &p)) 2914 goto onError; 2915 goto nextByte; 2916 } 2917 for (i = 0; i < digits; ++i) { 2918 c = (unsigned char) s[i]; 2919 if (!ISXDIGIT(c)) { 2920 endinpos = (s+i+1)-starts; 2921 if (unicode_decode_call_errorhandler( 2922 errors, &errorHandler, 2923 "unicodeescape", message, 2924 &starts, &end, &startinpos, &endinpos, &exc, &s, 2925 (PyObject **)&v, &outpos, &p)) 2926 goto onError; 2927 goto nextByte; 2928 } 2929 chr = (chr<<4) & ~0xF; 2930 if (c >= '0' && c <= '9') 2931 chr += c - '0'; 2932 else if (c >= 'a' && c <= 'f') 2933 chr += 10 + c - 'a'; 2934 else 2935 chr += 10 + c - 'A'; 2936 } 2937 s += i; 2938 if (chr == 0xffffffff && PyErr_Occurred()) 2939 /* _decoding_error will have already written into the 2940 target buffer. */ 2941 break; 2942 store: 2943 /* when we get here, chr is a 32-bit unicode character */ 2944 if (chr <= 0xffff) 2945 /* UCS-2 character */ 2946 *p++ = (Py_UNICODE) chr; 2947 else if (chr <= 0x10ffff) { 2948 /* UCS-4 character. Either store directly, or as 2949 surrogate pair. 
*/ 2950#ifdef Py_UNICODE_WIDE 2951 *p++ = chr; 2952#else 2953 chr -= 0x10000L; 2954 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2955 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2956#endif 2957 } else { 2958 endinpos = s-starts; 2959 outpos = p-PyUnicode_AS_UNICODE(v); 2960 if (unicode_decode_call_errorhandler( 2961 errors, &errorHandler, 2962 "unicodeescape", "illegal Unicode character", 2963 &starts, &end, &startinpos, &endinpos, &exc, &s, 2964 (PyObject **)&v, &outpos, &p)) 2965 goto onError; 2966 } 2967 break; 2968 2969 /* \N{name} */ 2970 case 'N': 2971 message = "malformed \\N character escape"; 2972 if (ucnhash_CAPI == NULL) { 2973 /* load the unicode data module */ 2974 PyObject *m, *api; 2975 m = PyImport_ImportModuleNoBlock("unicodedata"); 2976 if (m == NULL) 2977 goto ucnhashError; 2978 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2979 Py_DECREF(m); 2980 if (api == NULL) 2981 goto ucnhashError; 2982 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2983 Py_DECREF(api); 2984 if (ucnhash_CAPI == NULL) 2985 goto ucnhashError; 2986 } 2987 if (*s == '{') { 2988 const char *start = s+1; 2989 /* look for the closing brace */ 2990 while (*s != '}' && s < end) 2991 s++; 2992 if (s > start && s < end && *s == '}') { 2993 /* found a name. 
look it up in the unicode database */ 2994 message = "unknown Unicode character name"; 2995 s++; 2996 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2997 goto store; 2998 } 2999 } 3000 endinpos = s-starts; 3001 outpos = p-PyUnicode_AS_UNICODE(v); 3002 if (unicode_decode_call_errorhandler( 3003 errors, &errorHandler, 3004 "unicodeescape", message, 3005 &starts, &end, &startinpos, &endinpos, &exc, &s, 3006 (PyObject **)&v, &outpos, &p)) 3007 goto onError; 3008 break; 3009 3010 default: 3011 if (s > end) { 3012 message = "\\ at end of string"; 3013 s--; 3014 endinpos = s-starts; 3015 outpos = p-PyUnicode_AS_UNICODE(v); 3016 if (unicode_decode_call_errorhandler( 3017 errors, &errorHandler, 3018 "unicodeescape", message, 3019 &starts, &end, &startinpos, &endinpos, &exc, &s, 3020 (PyObject **)&v, &outpos, &p)) 3021 goto onError; 3022 } 3023 else { 3024 *p++ = '\\'; 3025 *p++ = (unsigned char)s[-1]; 3026 } 3027 break; 3028 } 3029 nextByte: 3030 ; 3031 } 3032 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3033 goto onError; 3034 Py_XDECREF(errorHandler); 3035 Py_XDECREF(exc); 3036 return (PyObject *)v; 3037 3038ucnhashError: 3039 PyErr_SetString( 3040 PyExc_UnicodeError, 3041 "\\N escapes not supported (can't load unicodedata module)" 3042 ); 3043 Py_XDECREF(v); 3044 Py_XDECREF(errorHandler); 3045 Py_XDECREF(exc); 3046 return NULL; 3047 3048onError: 3049 Py_XDECREF(v); 3050 Py_XDECREF(errorHandler); 3051 Py_XDECREF(exc); 3052 return NULL; 3053} 3054 3055/* Return a Unicode-Escape string version of the Unicode object. 3056 3057 If quotes is true, the string is enclosed in u"" or u'' quotes as 3058 appropriate. 
3059 3060*/ 3061 3062Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3063 Py_ssize_t size, 3064 Py_UNICODE ch) 3065{ 3066 /* like wcschr, but doesn't stop at NULL characters */ 3067 3068 while (size-- > 0) { 3069 if (*s == ch) 3070 return s; 3071 s++; 3072 } 3073 3074 return NULL; 3075} 3076 3077static const char *hexdigits = "0123456789abcdef"; 3078 3079PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3080 Py_ssize_t size) 3081{ 3082 PyObject *repr, *result; 3083 char *p; 3084 3085 /* XXX(nnorwitz): rather than over-allocating, it would be 3086 better to choose a different scheme. Perhaps scan the 3087 first N-chars of the string and allocate based on that size. 3088 */ 3089 /* Initial allocation is based on the longest-possible unichr 3090 escape. 3091 3092 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3093 unichr, so in this case it's the longest unichr escape. In 3094 narrow (UTF-16) builds this is five chars per source unichr 3095 since there are two unichrs in the surrogate pair, so in narrow 3096 (UTF-16) builds it's not the longest unichr escape. 3097 3098 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3099 so in the narrow (UTF-16) build case it's the longest unichr 3100 escape. 
3101 */ 3102 3103 repr = PyByteArray_FromStringAndSize(NULL, 3104#ifdef Py_UNICODE_WIDE 3105 + 10*size 3106#else 3107 + 6*size 3108#endif 3109 + 1); 3110 if (repr == NULL) 3111 return NULL; 3112 3113 p = PyByteArray_AS_STRING(repr); 3114 3115 while (size-- > 0) { 3116 Py_UNICODE ch = *s++; 3117 3118 /* Escape backslashes */ 3119 if (ch == '\\') { 3120 *p++ = '\\'; 3121 *p++ = (char) ch; 3122 continue; 3123 } 3124 3125#ifdef Py_UNICODE_WIDE 3126 /* Map 21-bit characters to '\U00xxxxxx' */ 3127 else if (ch >= 0x10000) { 3128 *p++ = '\\'; 3129 *p++ = 'U'; 3130 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3131 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3132 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3133 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3134 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3135 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3136 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3137 *p++ = hexdigits[ch & 0x0000000F]; 3138 continue; 3139 } 3140#else 3141 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3142 else if (ch >= 0xD800 && ch < 0xDC00) { 3143 Py_UNICODE ch2; 3144 Py_UCS4 ucs; 3145 3146 ch2 = *s++; 3147 size--; 3148 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3149 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3150 *p++ = '\\'; 3151 *p++ = 'U'; 3152 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3153 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3154 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3155 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3156 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3157 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3158 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3159 *p++ = hexdigits[ucs & 0x0000000F]; 3160 continue; 3161 } 3162 /* Fall through: isolated surrogates are copied as-is */ 3163 s--; 3164 size++; 3165 } 3166#endif 3167 3168 /* Map 16-bit characters to '\uxxxx' */ 3169 if (ch >= 256) { 3170 *p++ = '\\'; 3171 *p++ = 'u'; 3172 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3173 *p++ = hexdigits[(ch >> 8) & 0x000F]; 3174 *p++ = 
hexdigits[(ch >> 4) & 0x000F]; 3175 *p++ = hexdigits[ch & 0x000F]; 3176 } 3177 3178 /* Map special whitespace to '\t', \n', '\r' */ 3179 else if (ch == '\t') { 3180 *p++ = '\\'; 3181 *p++ = 't'; 3182 } 3183 else if (ch == '\n') { 3184 *p++ = '\\'; 3185 *p++ = 'n'; 3186 } 3187 else if (ch == '\r') { 3188 *p++ = '\\'; 3189 *p++ = 'r'; 3190 } 3191 3192 /* Map non-printable US ASCII to '\xhh' */ 3193 else if (ch < ' ' || ch >= 0x7F) { 3194 *p++ = '\\'; 3195 *p++ = 'x'; 3196 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3197 *p++ = hexdigits[ch & 0x000F]; 3198 } 3199 3200 /* Copy everything else as-is */ 3201 else 3202 *p++ = (char) ch; 3203 } 3204 3205 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), 3206 p - PyByteArray_AS_STRING(repr)); 3207 Py_DECREF(repr); 3208 return result; 3209} 3210 3211PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3212{ 3213 PyObject *s, *result; 3214 if (!PyUnicode_Check(unicode)) { 3215 PyErr_BadArgument(); 3216 return NULL; 3217 } 3218 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3219 PyUnicode_GET_SIZE(unicode)); 3220 3221 if (!s) 3222 return NULL; 3223 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3224 PyByteArray_GET_SIZE(s)); 3225 Py_DECREF(s); 3226 return result; 3227} 3228 3229/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3230 3231PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3232 Py_ssize_t size, 3233 const char *errors) 3234{ 3235 const char *starts = s; 3236 Py_ssize_t startinpos; 3237 Py_ssize_t endinpos; 3238 Py_ssize_t outpos; 3239 PyUnicodeObject *v; 3240 Py_UNICODE *p; 3241 const char *end; 3242 const char *bs; 3243 PyObject *errorHandler = NULL; 3244 PyObject *exc = NULL; 3245 3246 /* Escaped strings will always be longer than the resulting 3247 Unicode string, so we start with size here and then reduce the 3248 length after conversion to the true value. 
(But decoding error 3249 handler might have to resize the string) */ 3250 v = _PyUnicode_New(size); 3251 if (v == NULL) 3252 goto onError; 3253 if (size == 0) 3254 return (PyObject *)v; 3255 p = PyUnicode_AS_UNICODE(v); 3256 end = s + size; 3257 while (s < end) { 3258 unsigned char c; 3259 Py_UCS4 x; 3260 int i; 3261 int count; 3262 3263 /* Non-escape characters are interpreted as Unicode ordinals */ 3264 if (*s != '\\') { 3265 *p++ = (unsigned char)*s++; 3266 continue; 3267 } 3268 startinpos = s-starts; 3269 3270 /* \u-escapes are only interpreted iff the number of leading 3271 backslashes if odd */ 3272 bs = s; 3273 for (;s < end;) { 3274 if (*s != '\\') 3275 break; 3276 *p++ = (unsigned char)*s++; 3277 } 3278 if (((s - bs) & 1) == 0 || 3279 s >= end || 3280 (*s != 'u' && *s != 'U')) { 3281 continue; 3282 } 3283 p--; 3284 count = *s=='u' ? 4 : 8; 3285 s++; 3286 3287 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3288 outpos = p-PyUnicode_AS_UNICODE(v); 3289 for (x = 0, i = 0; i < count; ++i, ++s) { 3290 c = (unsigned char)*s; 3291 if (!ISXDIGIT(c)) { 3292 endinpos = s-starts; 3293 if (unicode_decode_call_errorhandler( 3294 errors, &errorHandler, 3295 "rawunicodeescape", "truncated \\uXXXX", 3296 &starts, &end, &startinpos, &endinpos, &exc, &s, 3297 (PyObject **)&v, &outpos, &p)) 3298 goto onError; 3299 goto nextByte; 3300 } 3301 x = (x<<4) & ~0xF; 3302 if (c >= '0' && c <= '9') 3303 x += c - '0'; 3304 else if (c >= 'a' && c <= 'f') 3305 x += 10 + c - 'a'; 3306 else 3307 x += 10 + c - 'A'; 3308 } 3309 if (x <= 0xffff) 3310 /* UCS-2 character */ 3311 *p++ = (Py_UNICODE) x; 3312 else if (x <= 0x10ffff) { 3313 /* UCS-4 character. Either store directly, or as 3314 surrogate pair. 
*/ 3315#ifdef Py_UNICODE_WIDE 3316 *p++ = (Py_UNICODE) x; 3317#else 3318 x -= 0x10000L; 3319 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3320 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3321#endif 3322 } else { 3323 endinpos = s-starts; 3324 outpos = p-PyUnicode_AS_UNICODE(v); 3325 if (unicode_decode_call_errorhandler( 3326 errors, &errorHandler, 3327 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3328 &starts, &end, &startinpos, &endinpos, &exc, &s, 3329 (PyObject **)&v, &outpos, &p)) 3330 goto onError; 3331 } 3332 nextByte: 3333 ; 3334 } 3335 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3336 goto onError; 3337 Py_XDECREF(errorHandler); 3338 Py_XDECREF(exc); 3339 return (PyObject *)v; 3340 3341 onError: 3342 Py_XDECREF(v); 3343 Py_XDECREF(errorHandler); 3344 Py_XDECREF(exc); 3345 return NULL; 3346} 3347 3348PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3349 Py_ssize_t size) 3350{ 3351 PyObject *repr, *result; 3352 char *p; 3353 char *q; 3354 3355#ifdef Py_UNICODE_WIDE 3356 repr = PyByteArray_FromStringAndSize(NULL, 10 * size); 3357#else 3358 repr = PyByteArray_FromStringAndSize(NULL, 6 * size); 3359#endif 3360 if (repr == NULL) 3361 return NULL; 3362 if (size == 0) 3363 goto done; 3364 3365 p = q = PyByteArray_AS_STRING(repr); 3366 while (size-- > 0) { 3367 Py_UNICODE ch = *s++; 3368#ifdef Py_UNICODE_WIDE 3369 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3370 if (ch >= 0x10000) { 3371 *p++ = '\\'; 3372 *p++ = 'U'; 3373 *p++ = hexdigits[(ch >> 28) & 0xf]; 3374 *p++ = hexdigits[(ch >> 24) & 0xf]; 3375 *p++ = hexdigits[(ch >> 20) & 0xf]; 3376 *p++ = hexdigits[(ch >> 16) & 0xf]; 3377 *p++ = hexdigits[(ch >> 12) & 0xf]; 3378 *p++ = hexdigits[(ch >> 8) & 0xf]; 3379 *p++ = hexdigits[(ch >> 4) & 0xf]; 3380 *p++ = hexdigits[ch & 15]; 3381 } 3382 else 3383#else 3384 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3385 if (ch >= 0xD800 && ch < 0xDC00) { 3386 Py_UNICODE ch2; 3387 Py_UCS4 ucs; 3388 3389 ch2 = *s++; 3390 size--; 3391 if (ch2 >= 
0xDC00 && ch2 <= 0xDFFF) { 3392 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3393 *p++ = '\\'; 3394 *p++ = 'U'; 3395 *p++ = hexdigits[(ucs >> 28) & 0xf]; 3396 *p++ = hexdigits[(ucs >> 24) & 0xf]; 3397 *p++ = hexdigits[(ucs >> 20) & 0xf]; 3398 *p++ = hexdigits[(ucs >> 16) & 0xf]; 3399 *p++ = hexdigits[(ucs >> 12) & 0xf]; 3400 *p++ = hexdigits[(ucs >> 8) & 0xf]; 3401 *p++ = hexdigits[(ucs >> 4) & 0xf]; 3402 *p++ = hexdigits[ucs & 0xf]; 3403 continue; 3404 } 3405 /* Fall through: isolated surrogates are copied as-is */ 3406 s--; 3407 size++; 3408 } 3409#endif 3410 /* Map 16-bit characters to '\uxxxx' */ 3411 if (ch >= 256) { 3412 *p++ = '\\'; 3413 *p++ = 'u'; 3414 *p++ = hexdigits[(ch >> 12) & 0xf]; 3415 *p++ = hexdigits[(ch >> 8) & 0xf]; 3416 *p++ = hexdigits[(ch >> 4) & 0xf]; 3417 *p++ = hexdigits[ch & 15]; 3418 } 3419 /* Copy everything else as-is */ 3420 else 3421 *p++ = (char) ch; 3422 } 3423 size = p - q; 3424 3425 done: 3426 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); 3427 Py_DECREF(repr); 3428 return result; 3429} 3430 3431PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3432{ 3433 PyObject *s, *result; 3434 if (!PyUnicode_Check(unicode)) { 3435 PyErr_BadArgument(); 3436 return NULL; 3437 } 3438 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3439 PyUnicode_GET_SIZE(unicode)); 3440 3441 if (!s) 3442 return NULL; 3443 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3444 PyByteArray_GET_SIZE(s)); 3445 Py_DECREF(s); 3446 return result; 3447} 3448 3449/* --- Unicode Internal Codec ------------------------------------------- */ 3450 3451PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3452 Py_ssize_t size, 3453 const char *errors) 3454{ 3455 const char *starts = s; 3456 Py_ssize_t startinpos; 3457 Py_ssize_t endinpos; 3458 Py_ssize_t outpos; 3459 PyUnicodeObject *v; 3460 Py_UNICODE *p; 3461 const char *end; 3462 const char *reason; 3463 PyObject *errorHandler = NULL; 
3464 PyObject *exc = NULL; 3465 3466#ifdef Py_UNICODE_WIDE 3467 Py_UNICODE unimax = PyUnicode_GetMax(); 3468#endif 3469 3470 /* XXX overflow detection missing */ 3471 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3472 if (v == NULL) 3473 goto onError; 3474 if (PyUnicode_GetSize((PyObject *)v) == 0) 3475 return (PyObject *)v; 3476 p = PyUnicode_AS_UNICODE(v); 3477 end = s + size; 3478 3479 while (s < end) { 3480 memcpy(p, s, sizeof(Py_UNICODE)); 3481 /* We have to sanity check the raw data, otherwise doom looms for 3482 some malformed UCS-4 data. */ 3483 if ( 3484 #ifdef Py_UNICODE_WIDE 3485 *p > unimax || *p < 0 || 3486 #endif 3487 end-s < Py_UNICODE_SIZE 3488 ) 3489 { 3490 startinpos = s - starts; 3491 if (end-s < Py_UNICODE_SIZE) { 3492 endinpos = end-starts; 3493 reason = "truncated input"; 3494 } 3495 else { 3496 endinpos = s - starts + Py_UNICODE_SIZE; 3497 reason = "illegal code point (> 0x10FFFF)"; 3498 } 3499 outpos = p - PyUnicode_AS_UNICODE(v); 3500 if (unicode_decode_call_errorhandler( 3501 errors, &errorHandler, 3502 "unicode_internal", reason, 3503 &starts, &end, &startinpos, &endinpos, &exc, &s, 3504 (PyObject **)&v, &outpos, &p)) { 3505 goto onError; 3506 } 3507 } 3508 else { 3509 p++; 3510 s += Py_UNICODE_SIZE; 3511 } 3512 } 3513 3514 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3515 goto onError; 3516 Py_XDECREF(errorHandler); 3517 Py_XDECREF(exc); 3518 return (PyObject *)v; 3519 3520 onError: 3521 Py_XDECREF(v); 3522 Py_XDECREF(errorHandler); 3523 Py_XDECREF(exc); 3524 return NULL; 3525} 3526 3527/* --- Latin-1 Codec ------------------------------------------------------ */ 3528 3529PyObject *PyUnicode_DecodeLatin1(const char *s, 3530 Py_ssize_t size, 3531 const char *errors) 3532{ 3533 PyUnicodeObject *v; 3534 Py_UNICODE *p; 3535 3536 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 3537 if (size == 1) { 3538 Py_UNICODE r = *(unsigned char*)s; 3539 return PyUnicode_FromUnicode(&r, 1); 3540 } 3541 3542 v = _PyUnicode_New(size); 3543 if (v == NULL) 3544 goto onError; 3545 if (size == 0) 3546 return (PyObject *)v; 3547 p = PyUnicode_AS_UNICODE(v); 3548 while (size-- > 0) 3549 *p++ = (unsigned char)*s++; 3550 return (PyObject *)v; 3551 3552 onError: 3553 Py_XDECREF(v); 3554 return NULL; 3555} 3556 3557/* create or adjust a UnicodeEncodeError */ 3558static void make_encode_exception(PyObject **exceptionObject, 3559 const char *encoding, 3560 const Py_UNICODE *unicode, Py_ssize_t size, 3561 Py_ssize_t startpos, Py_ssize_t endpos, 3562 const char *reason) 3563{ 3564 if (*exceptionObject == NULL) { 3565 *exceptionObject = PyUnicodeEncodeError_Create( 3566 encoding, unicode, size, startpos, endpos, reason); 3567 } 3568 else { 3569 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3570 goto onError; 3571 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3572 goto onError; 3573 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3574 goto onError; 3575 return; 3576 onError: 3577 Py_DECREF(*exceptionObject); 3578 *exceptionObject = NULL; 3579 } 3580} 3581 3582/* raises a UnicodeEncodeError */ 3583static void raise_encode_exception(PyObject **exceptionObject, 3584 const char *encoding, 3585 const Py_UNICODE *unicode, Py_ssize_t size, 3586 Py_ssize_t startpos, Py_ssize_t endpos, 3587 const char *reason) 3588{ 3589 make_encode_exception(exceptionObject, 3590 encoding, unicode, size, startpos, endpos, reason); 3591 if (*exceptionObject != NULL) 3592 PyCodec_StrictErrors(*exceptionObject); 3593} 3594 3595/* error handling callback helper: 3596 build arguments, call the callback and check the arguments, 3597 put the result into newpos and return the replacement string, which 3598 has to be freed by the caller */ 3599static PyObject *unicode_encode_call_errorhandler(const char *errors, 3600 PyObject **errorHandler, 3601 const 
char *encoding, const char *reason, 3602 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3603 Py_ssize_t startpos, Py_ssize_t endpos, 3604 Py_ssize_t *newpos) 3605{ 3606 static char *argparse = "O!n;encoding error handler must return (str, int) tuple"; 3607 3608 PyObject *restuple; 3609 PyObject *resunicode; 3610 3611 if (*errorHandler == NULL) { 3612 *errorHandler = PyCodec_LookupError(errors); 3613 if (*errorHandler == NULL) 3614 return NULL; 3615 } 3616 3617 make_encode_exception(exceptionObject, 3618 encoding, unicode, size, startpos, endpos, reason); 3619 if (*exceptionObject == NULL) 3620 return NULL; 3621 3622 restuple = PyObject_CallFunctionObjArgs( 3623 *errorHandler, *exceptionObject, NULL); 3624 if (restuple == NULL) 3625 return NULL; 3626 if (!PyTuple_Check(restuple)) { 3627 PyErr_Format(PyExc_TypeError, &argparse[4]); 3628 Py_DECREF(restuple); 3629 return NULL; 3630 } 3631 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3632 &resunicode, newpos)) { 3633 Py_DECREF(restuple); 3634 return NULL; 3635 } 3636 if (*newpos<0) 3637 *newpos = size+*newpos; 3638 if (*newpos<0 || *newpos>size) { 3639 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3640 Py_DECREF(restuple); 3641 return NULL; 3642 } 3643 Py_INCREF(resunicode); 3644 Py_DECREF(restuple); 3645 return resunicode; 3646} 3647 3648static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3649 Py_ssize_t size, 3650 const char *errors, 3651 int limit) 3652{ 3653 /* output object */ 3654 PyObject *res; 3655 /* pointers to the beginning and end+1 of input */ 3656 const Py_UNICODE *startp = p; 3657 const Py_UNICODE *endp = p + size; 3658 /* pointer to the beginning of the unencodable characters */ 3659 /* const Py_UNICODE *badp = NULL; */ 3660 /* pointer into the output */ 3661 char *str; 3662 /* current output position */ 3663 Py_ssize_t ressize; 3664 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 3665 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3666 PyObject *errorHandler = NULL; 3667 PyObject *exc = NULL; 3668 PyObject *result = NULL; 3669 /* the following variable is used for caching string comparisons 3670 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3671 int known_errorHandler = -1; 3672 3673 /* allocate enough for a simple encoding without 3674 replacements, if we need more, we'll resize */ 3675 if (size == 0) 3676 return PyBytes_FromStringAndSize(NULL, 0); 3677 res = PyByteArray_FromStringAndSize(NULL, size); 3678 if (res == NULL) 3679 return NULL; 3680 str = PyByteArray_AS_STRING(res); 3681 ressize = size; 3682 3683 while (p<endp) { 3684 Py_UNICODE c = *p; 3685 3686 /* can we encode this? */ 3687 if (c<limit) { 3688 /* no overflow check, because we know that the space is enough */ 3689 *str++ = (char)c; 3690 ++p; 3691 } 3692 else { 3693 Py_ssize_t unicodepos = p-startp; 3694 Py_ssize_t requiredsize; 3695 PyObject *repunicode; 3696 Py_ssize_t repsize; 3697 Py_ssize_t newpos; 3698 Py_ssize_t respos; 3699 Py_UNICODE *uni2; 3700 /* startpos for collecting unencodable chars */ 3701 const Py_UNICODE *collstart = p; 3702 const Py_UNICODE *collend = p; 3703 /* find all unecodable characters */ 3704 while ((collend < endp) && ((*collend)>=limit)) 3705 ++collend; 3706 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3707 if (known_errorHandler==-1) { 3708 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3709 known_errorHandler = 1; 3710 else if (!strcmp(errors, "replace")) 3711 known_errorHandler = 2; 3712 else if (!strcmp(errors, "ignore")) 3713 known_errorHandler = 3; 3714 else if (!strcmp(errors, "xmlcharrefreplace")) 3715 known_errorHandler = 4; 3716 else 3717 known_errorHandler = 0; 3718 } 3719 switch (known_errorHandler) { 3720 case 1: /* strict */ 3721 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3722 goto onError; 3723 case 2: /* replace */ 3724 while (collstart++<collend) 3725 *str++ = '?'; /* fall through */ 3726 case 3: /* ignore */ 3727 p = collend; 3728 break; 3729 case 4: /* xmlcharrefreplace */ 3730 respos = str - PyByteArray_AS_STRING(res); 3731 /* determine replacement size (temporarily (mis)uses p) */ 3732 for (p = collstart, repsize = 0; p < collend; ++p) { 3733 if (*p<10) 3734 repsize += 2+1+1; 3735 else if (*p<100) 3736 repsize += 2+2+1; 3737 else if (*p<1000) 3738 repsize += 2+3+1; 3739 else if (*p<10000) 3740 repsize += 2+4+1; 3741#ifndef Py_UNICODE_WIDE 3742 else 3743 repsize += 2+5+1; 3744#else 3745 else if (*p<100000) 3746 repsize += 2+5+1; 3747 else if (*p<1000000) 3748 repsize += 2+6+1; 3749 else 3750 repsize += 2+7+1; 3751#endif 3752 } 3753 requiredsize = respos+repsize+(endp-collend); 3754 if (requiredsize > ressize) { 3755 if (requiredsize<2*ressize) 3756 requiredsize = 2*ressize; 3757 if (PyByteArray_Resize(res, requiredsize)) 3758 goto onError; 3759 str = PyByteArray_AS_STRING(res) + respos; 3760 ressize = requiredsize; 3761 } 3762 /* generate replacement (temporarily (mis)uses p) */ 3763 for (p = collstart; p < collend; ++p) { 3764 str += sprintf(str, "&#%d;", (int)*p); 3765 } 3766 p = collend; 3767 break; 3768 default: 3769 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3770 encoding, reason, startp, size, &exc, 3771 collstart-startp, 
collend-startp, &newpos); 3772 if (repunicode == NULL) 3773 goto onError; 3774 /* need more space? (at least enough for what we 3775 have+the replacement+the rest of the string, so 3776 we won't have to check space for encodable characters) */ 3777 respos = str - PyByteArray_AS_STRING(res); 3778 repsize = PyUnicode_GET_SIZE(repunicode); 3779 requiredsize = respos+repsize+(endp-collend); 3780 if (requiredsize > ressize) { 3781 if (requiredsize<2*ressize) 3782 requiredsize = 2*ressize; 3783 if (PyByteArray_Resize(res, requiredsize)) { 3784 Py_DECREF(repunicode); 3785 goto onError; 3786 } 3787 str = PyByteArray_AS_STRING(res) + respos; 3788 ressize = requiredsize; 3789 } 3790 /* check if there is anything unencodable in the replacement 3791 and copy it to the output */ 3792 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3793 c = *uni2; 3794 if (c >= limit) { 3795 raise_encode_exception(&exc, encoding, startp, size, 3796 unicodepos, unicodepos+1, reason); 3797 Py_DECREF(repunicode); 3798 goto onError; 3799 } 3800 *str = (char)c; 3801 } 3802 p = startp + newpos; 3803 Py_DECREF(repunicode); 3804 } 3805 } 3806 } 3807 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res), 3808 str - PyByteArray_AS_STRING(res)); 3809 onError: 3810 Py_DECREF(res); 3811 Py_XDECREF(errorHandler); 3812 Py_XDECREF(exc); 3813 return result; 3814} 3815 3816PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3817 Py_ssize_t size, 3818 const char *errors) 3819{ 3820 return unicode_encode_ucs1(p, size, errors, 256); 3821} 3822 3823PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3824{ 3825 if (!PyUnicode_Check(unicode)) { 3826 PyErr_BadArgument(); 3827 return NULL; 3828 } 3829 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3830 PyUnicode_GET_SIZE(unicode), 3831 NULL); 3832} 3833 3834/* --- 7-bit ASCII Codec -------------------------------------------------- */ 3835 3836PyObject *PyUnicode_DecodeASCII(const char *s, 3837 Py_ssize_t size, 3838 
const char *errors) 3839{ 3840 const char *starts = s; 3841 PyUnicodeObject *v; 3842 Py_UNICODE *p; 3843 Py_ssize_t startinpos; 3844 Py_ssize_t endinpos; 3845 Py_ssize_t outpos; 3846 const char *e; 3847 PyObject *errorHandler = NULL; 3848 PyObject *exc = NULL; 3849 3850 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3851 if (size == 1 && *(unsigned char*)s < 128) { 3852 Py_UNICODE r = *(unsigned char*)s; 3853 return PyUnicode_FromUnicode(&r, 1); 3854 } 3855 3856 v = _PyUnicode_New(size); 3857 if (v == NULL) 3858 goto onError; 3859 if (size == 0) 3860 return (PyObject *)v; 3861 p = PyUnicode_AS_UNICODE(v); 3862 e = s + size; 3863 while (s < e) { 3864 register unsigned char c = (unsigned char)*s; 3865 if (c < 128) { 3866 *p++ = c; 3867 ++s; 3868 } 3869 else { 3870 startinpos = s-starts; 3871 endinpos = startinpos + 1; 3872 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3873 if (unicode_decode_call_errorhandler( 3874 errors, &errorHandler, 3875 "ascii", "ordinal not in range(128)", 3876 &starts, &e, &startinpos, &endinpos, &exc, &s, 3877 (PyObject **)&v, &outpos, &p)) 3878 goto onError; 3879 } 3880 } 3881 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3882 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3883 goto onError; 3884 Py_XDECREF(errorHandler); 3885 Py_XDECREF(exc); 3886 return (PyObject *)v; 3887 3888 onError: 3889 Py_XDECREF(v); 3890 Py_XDECREF(errorHandler); 3891 Py_XDECREF(exc); 3892 return NULL; 3893} 3894 3895PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3896 Py_ssize_t size, 3897 const char *errors) 3898{ 3899 return unicode_encode_ucs1(p, size, errors, 128); 3900} 3901 3902PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3903{ 3904 if (!PyUnicode_Check(unicode)) { 3905 PyErr_BadArgument(); 3906 return NULL; 3907 } 3908 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3909 PyUnicode_GET_SIZE(unicode), 3910 NULL); 3911} 3912 3913#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3914 
3915/* --- MBCS codecs for Windows -------------------------------------------- */ 3916 3917#if SIZEOF_INT < SIZEOF_SSIZE_T 3918#define NEED_RETRY 3919#endif 3920 3921/* XXX This code is limited to "true" double-byte encodings, as 3922 a) it assumes an incomplete character consists of a single byte, and 3923 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 3924 encodings, see IsDBCSLeadByteEx documentation. */ 3925 3926static int is_dbcs_lead_byte(const char *s, int offset) 3927{ 3928 const char *curr = s + offset; 3929 3930 if (IsDBCSLeadByte(*curr)) { 3931 const char *prev = CharPrev(s, curr); 3932 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3933 } 3934 return 0; 3935} 3936 3937/* 3938 * Decode MBCS string into unicode object. If 'final' is set, converts 3939 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 3940 */ 3941static int decode_mbcs(PyUnicodeObject **v, 3942 const char *s, /* MBCS string */ 3943 int size, /* sizeof MBCS string */ 3944 int final) 3945{ 3946 Py_UNICODE *p; 3947 Py_ssize_t n = 0; 3948 int usize = 0; 3949 3950 assert(size >= 0); 3951 3952 /* Skip trailing lead-byte unless 'final' is set */ 3953 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3954 --size; 3955 3956 /* First get the size of the result */ 3957 if (size > 0) { 3958 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3959 if (usize == 0) { 3960 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3961 return -1; 3962 } 3963 } 3964 3965 if (*v == NULL) { 3966 /* Create unicode object */ 3967 *v = _PyUnicode_New(usize); 3968 if (*v == NULL) 3969 return -1; 3970 } 3971 else { 3972 /* Extend unicode object */ 3973 n = PyUnicode_GET_SIZE(*v); 3974 if (_PyUnicode_Resize(v, n + usize) < 0) 3975 return -1; 3976 } 3977 3978 /* Do the conversion */ 3979 if (size > 0) { 3980 p = PyUnicode_AS_UNICODE(*v) + n; 3981 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3982 
PyErr_SetFromWindowsErrWithFilename(0, NULL); 3983 return -1; 3984 } 3985 } 3986 3987 return size; 3988} 3989 3990PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3991 Py_ssize_t size, 3992 const char *errors, 3993 Py_ssize_t *consumed) 3994{ 3995 PyUnicodeObject *v = NULL; 3996 int done; 3997 3998 if (consumed) 3999 *consumed = 0; 4000 4001#ifdef NEED_RETRY 4002 retry: 4003 if (size > INT_MAX) 4004 done = decode_mbcs(&v, s, INT_MAX, 0); 4005 else 4006#endif 4007 done = decode_mbcs(&v, s, (int)size, !consumed); 4008 4009 if (done < 0) { 4010 Py_XDECREF(v); 4011 return NULL; 4012 } 4013 4014 if (consumed) 4015 *consumed += done; 4016 4017#ifdef NEED_RETRY 4018 if (size > INT_MAX) { 4019 s += done; 4020 size -= done; 4021 goto retry; 4022 } 4023#endif 4024 4025 return (PyObject *)v; 4026} 4027 4028PyObject *PyUnicode_DecodeMBCS(const char *s, 4029 Py_ssize_t size, 4030 const char *errors) 4031{ 4032 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4033} 4034 4035/* 4036 * Convert unicode into string object (MBCS). 4037 * Returns 0 if succeed, -1 otherwise. 
4038 */ 4039static int encode_mbcs(PyObject **repr, 4040 const Py_UNICODE *p, /* unicode */ 4041 int size) /* size of unicode */ 4042{ 4043 int mbcssize = 0; 4044 Py_ssize_t n = 0; 4045 4046 assert(size >= 0); 4047 4048 /* First get the size of the result */ 4049 if (size > 0) { 4050 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 4051 if (mbcssize == 0) { 4052 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4053 return -1; 4054 } 4055 } 4056 4057 if (*repr == NULL) { 4058 /* Create string object */ 4059 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 4060 if (*repr == NULL) 4061 return -1; 4062 } 4063 else { 4064 /* Extend string object */ 4065 n = PyBytes_Size(*repr); 4066 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 4067 return -1; 4068 } 4069 4070 /* Do the conversion */ 4071 if (size > 0) { 4072 char *s = PyBytes_AS_STRING(*repr) + n; 4073 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 4074 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4075 return -1; 4076 } 4077 } 4078 4079 return 0; 4080} 4081 4082PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4083 Py_ssize_t size, 4084 const char *errors) 4085{ 4086 PyObject *repr = NULL; 4087 int ret; 4088 4089#ifdef NEED_RETRY 4090 retry: 4091 if (size > INT_MAX) 4092 ret = encode_mbcs(&repr, p, INT_MAX); 4093 else 4094#endif 4095 ret = encode_mbcs(&repr, p, (int)size); 4096 4097 if (ret < 0) { 4098 Py_XDECREF(repr); 4099 return NULL; 4100 } 4101 4102#ifdef NEED_RETRY 4103 if (size > INT_MAX) { 4104 p += INT_MAX; 4105 size -= INT_MAX; 4106 goto retry; 4107 } 4108#endif 4109 4110 return repr; 4111} 4112 4113PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4114{ 4115 if (!PyUnicode_Check(unicode)) { 4116 PyErr_BadArgument(); 4117 return NULL; 4118 } 4119 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4120 PyUnicode_GET_SIZE(unicode), 4121 NULL); 4122} 4123 4124#undef NEED_RETRY 4125 4126#endif /* MS_WINDOWS */ 4127 4128/* --- Character Mapping Codec 
-------------------------------------------- */ 4129 4130PyObject *PyUnicode_DecodeCharmap(const char *s, 4131 Py_ssize_t size, 4132 PyObject *mapping, 4133 const char *errors) 4134{ 4135 const char *starts = s; 4136 Py_ssize_t startinpos; 4137 Py_ssize_t endinpos; 4138 Py_ssize_t outpos; 4139 const char *e; 4140 PyUnicodeObject *v; 4141 Py_UNICODE *p; 4142 Py_ssize_t extrachars = 0; 4143 PyObject *errorHandler = NULL; 4144 PyObject *exc = NULL; 4145 Py_UNICODE *mapstring = NULL; 4146 Py_ssize_t maplen = 0; 4147 4148 /* Default to Latin-1 */ 4149 if (mapping == NULL) 4150 return PyUnicode_DecodeLatin1(s, size, errors); 4151 4152 v = _PyUnicode_New(size); 4153 if (v == NULL) 4154 goto onError; 4155 if (size == 0) 4156 return (PyObject *)v; 4157 p = PyUnicode_AS_UNICODE(v); 4158 e = s + size; 4159 if (PyUnicode_CheckExact(mapping)) { 4160 mapstring = PyUnicode_AS_UNICODE(mapping); 4161 maplen = PyUnicode_GET_SIZE(mapping); 4162 while (s < e) { 4163 unsigned char ch = *s; 4164 Py_UNICODE x = 0xfffe; /* illegal value */ 4165 4166 if (ch < maplen) 4167 x = mapstring[ch]; 4168 4169 if (x == 0xfffe) { 4170 /* undefined mapping */ 4171 outpos = p-PyUnicode_AS_UNICODE(v); 4172 startinpos = s-starts; 4173 endinpos = startinpos+1; 4174 if (unicode_decode_call_errorhandler( 4175 errors, &errorHandler, 4176 "charmap", "character maps to <undefined>", 4177 &starts, &e, &startinpos, &endinpos, &exc, &s, 4178 (PyObject **)&v, &outpos, &p)) { 4179 goto onError; 4180 } 4181 continue; 4182 } 4183 *p++ = x; 4184 ++s; 4185 } 4186 } 4187 else { 4188 while (s < e) { 4189 unsigned char ch = *s; 4190 PyObject *w, *x; 4191 4192 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 4193 w = PyLong_FromLong((long)ch); 4194 if (w == NULL) 4195 goto onError; 4196 x = PyObject_GetItem(mapping, w); 4197 Py_DECREF(w); 4198 if (x == NULL) { 4199 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4200 /* No mapping found means: mapping is undefined. 
*/ 4201 PyErr_Clear(); 4202 x = Py_None; 4203 Py_INCREF(x); 4204 } else 4205 goto onError; 4206 } 4207 4208 /* Apply mapping */ 4209 if (PyLong_Check(x)) { 4210 long value = PyLong_AS_LONG(x); 4211 if (value < 0 || value > 65535) { 4212 PyErr_SetString(PyExc_TypeError, 4213 "character mapping must be in range(65536)"); 4214 Py_DECREF(x); 4215 goto onError; 4216 } 4217 *p++ = (Py_UNICODE)value; 4218 } 4219 else if (x == Py_None) { 4220 /* undefined mapping */ 4221 outpos = p-PyUnicode_AS_UNICODE(v); 4222 startinpos = s-starts; 4223 endinpos = startinpos+1; 4224 if (unicode_decode_call_errorhandler( 4225 errors, &errorHandler, 4226 "charmap", "character maps to <undefined>", 4227 &starts, &e, &startinpos, &endinpos, &exc, &s, 4228 (PyObject **)&v, &outpos, &p)) { 4229 Py_DECREF(x); 4230 goto onError; 4231 } 4232 Py_DECREF(x); 4233 continue; 4234 } 4235 else if (PyUnicode_Check(x)) { 4236 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4237 4238 if (targetsize == 1) 4239 /* 1-1 mapping */ 4240 *p++ = *PyUnicode_AS_UNICODE(x); 4241 4242 else if (targetsize > 1) { 4243 /* 1-n mapping */ 4244 if (targetsize > extrachars) { 4245 /* resize first */ 4246 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4247 Py_ssize_t needed = (targetsize - extrachars) + \ 4248 (targetsize << 2); 4249 extrachars += needed; 4250 /* XXX overflow detection missing */ 4251 if (_PyUnicode_Resize(&v, 4252 PyUnicode_GET_SIZE(v) + needed) < 0) { 4253 Py_DECREF(x); 4254 goto onError; 4255 } 4256 p = PyUnicode_AS_UNICODE(v) + oldpos; 4257 } 4258 Py_UNICODE_COPY(p, 4259 PyUnicode_AS_UNICODE(x), 4260 targetsize); 4261 p += targetsize; 4262 extrachars -= targetsize; 4263 } 4264 /* 1-0 mapping: skip the character */ 4265 } 4266 else { 4267 /* wrong return value */ 4268 PyErr_SetString(PyExc_TypeError, 4269 "character mapping must return integer, None or str"); 4270 Py_DECREF(x); 4271 goto onError; 4272 } 4273 Py_DECREF(x); 4274 ++s; 4275 } 4276 } 4277 if (p - PyUnicode_AS_UNICODE(v) < 
PyUnicode_GET_SIZE(v)) 4278 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4279 goto onError; 4280 Py_XDECREF(errorHandler); 4281 Py_XDECREF(exc); 4282 return (PyObject *)v; 4283 4284 onError: 4285 Py_XDECREF(errorHandler); 4286 Py_XDECREF(exc); 4287 Py_XDECREF(v); 4288 return NULL; 4289} 4290 4291/* Charmap encoding: the lookup table */ 4292 4293struct encoding_map{ 4294 PyObject_HEAD 4295 unsigned char level1[32]; 4296 int count2, count3; 4297 unsigned char level23[1]; 4298}; 4299 4300static PyObject* 4301encoding_map_size(PyObject *obj, PyObject* args) 4302{ 4303 struct encoding_map *map = (struct encoding_map*)obj; 4304 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4305 128*map->count3); 4306} 4307 4308static PyMethodDef encoding_map_methods[] = { 4309 {"size", encoding_map_size, METH_NOARGS, 4310 PyDoc_STR("Return the size (in bytes) of this object") }, 4311 { 0 } 4312}; 4313 4314static void 4315encoding_map_dealloc(PyObject* o) 4316{ 4317 PyObject_FREE(o); 4318} 4319 4320static PyTypeObject EncodingMapType = { 4321 PyVarObject_HEAD_INIT(NULL, 0) 4322 "EncodingMap", /*tp_name*/ 4323 sizeof(struct encoding_map), /*tp_basicsize*/ 4324 0, /*tp_itemsize*/ 4325 /* methods */ 4326 encoding_map_dealloc, /*tp_dealloc*/ 4327 0, /*tp_print*/ 4328 0, /*tp_getattr*/ 4329 0, /*tp_setattr*/ 4330 0, /*tp_compare*/ 4331 0, /*tp_repr*/ 4332 0, /*tp_as_number*/ 4333 0, /*tp_as_sequence*/ 4334 0, /*tp_as_mapping*/ 4335 0, /*tp_hash*/ 4336 0, /*tp_call*/ 4337 0, /*tp_str*/ 4338 0, /*tp_getattro*/ 4339 0, /*tp_setattro*/ 4340 0, /*tp_as_buffer*/ 4341 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4342 0, /*tp_doc*/ 4343 0, /*tp_traverse*/ 4344 0, /*tp_clear*/ 4345 0, /*tp_richcompare*/ 4346 0, /*tp_weaklistoffset*/ 4347 0, /*tp_iter*/ 4348 0, /*tp_iternext*/ 4349 encoding_map_methods, /*tp_methods*/ 4350 0, /*tp_members*/ 4351 0, /*tp_getset*/ 4352 0, /*tp_base*/ 4353 0, /*tp_dict*/ 4354 0, /*tp_descr_get*/ 4355 0, /*tp_descr_set*/ 4356 0, /*tp_dictoffset*/ 4357 0, 
/*tp_init*/ 4358 0, /*tp_alloc*/ 4359 0, /*tp_new*/ 4360 0, /*tp_free*/ 4361 0, /*tp_is_gc*/ 4362}; 4363 4364PyObject* 4365PyUnicode_BuildEncodingMap(PyObject* string) 4366{ 4367 Py_UNICODE *decode; 4368 PyObject *result; 4369 struct encoding_map *mresult; 4370 int i; 4371 int need_dict = 0; 4372 unsigned char level1[32]; 4373 unsigned char level2[512]; 4374 unsigned char *mlevel1, *mlevel2, *mlevel3; 4375 int count2 = 0, count3 = 0; 4376 4377 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 4378 PyErr_BadArgument(); 4379 return NULL; 4380 } 4381 decode = PyUnicode_AS_UNICODE(string); 4382 memset(level1, 0xFF, sizeof level1); 4383 memset(level2, 0xFF, sizeof level2); 4384 4385 /* If there isn't a one-to-one mapping of NULL to \0, 4386 or if there are non-BMP characters, we need to use 4387 a mapping dictionary. */ 4388 if (decode[0] != 0) 4389 need_dict = 1; 4390 for (i = 1; i < 256; i++) { 4391 int l1, l2; 4392 if (decode[i] == 0 4393 #ifdef Py_UNICODE_WIDE 4394 || decode[i] > 0xFFFF 4395 #endif 4396 ) { 4397 need_dict = 1; 4398 break; 4399 } 4400 if (decode[i] == 0xFFFE) 4401 /* unmapped character */ 4402 continue; 4403 l1 = decode[i] >> 11; 4404 l2 = decode[i] >> 7; 4405 if (level1[l1] == 0xFF) 4406 level1[l1] = count2++; 4407 if (level2[l2] == 0xFF) 4408 level2[l2] = count3++; 4409 } 4410 4411 if (count2 >= 0xFF || count3 >= 0xFF) 4412 need_dict = 1; 4413 4414 if (need_dict) { 4415 PyObject *result = PyDict_New(); 4416 PyObject *key, *value; 4417 if (!result) 4418 return NULL; 4419 for (i = 0; i < 256; i++) { 4420 key = value = NULL; 4421 key = PyLong_FromLong(decode[i]); 4422 value = PyLong_FromLong(i); 4423 if (!key || !value) 4424 goto failed1; 4425 if (PyDict_SetItem(result, key, value) == -1) 4426 goto failed1; 4427 Py_DECREF(key); 4428 Py_DECREF(value); 4429 } 4430 return result; 4431 failed1: 4432 Py_XDECREF(key); 4433 Py_XDECREF(value); 4434 Py_DECREF(result); 4435 return NULL; 4436 } 4437 4438 /* Create a three-level trie */ 4439 
result = PyObject_MALLOC(sizeof(struct encoding_map) + 4440 16*count2 + 128*count3 - 1); 4441 if (!result) 4442 return PyErr_NoMemory(); 4443 PyObject_Init(result, &EncodingMapType); 4444 mresult = (struct encoding_map*)result; 4445 mresult->count2 = count2; 4446 mresult->count3 = count3; 4447 mlevel1 = mresult->level1; 4448 mlevel2 = mresult->level23; 4449 mlevel3 = mresult->level23 + 16*count2; 4450 memcpy(mlevel1, level1, 32); 4451 memset(mlevel2, 0xFF, 16*count2); 4452 memset(mlevel3, 0, 128*count3); 4453 count3 = 0; 4454 for (i = 1; i < 256; i++) { 4455 int o1, o2, o3, i2, i3; 4456 if (decode[i] == 0xFFFE) 4457 /* unmapped character */ 4458 continue; 4459 o1 = decode[i]>>11; 4460 o2 = (decode[i]>>7) & 0xF; 4461 i2 = 16*mlevel1[o1] + o2; 4462 if (mlevel2[i2] == 0xFF) 4463 mlevel2[i2] = count3++; 4464 o3 = decode[i] & 0x7F; 4465 i3 = 128*mlevel2[i2] + o3; 4466 mlevel3[i3] = i; 4467 } 4468 return result; 4469} 4470 4471static int 4472encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4473{ 4474 struct encoding_map *map = (struct encoding_map*)mapping; 4475 int l1 = c>>11; 4476 int l2 = (c>>7) & 0xF; 4477 int l3 = c & 0x7F; 4478 int i; 4479 4480#ifdef Py_UNICODE_WIDE 4481 if (c > 0xFFFF) { 4482 return -1; 4483 } 4484#endif 4485 if (c == 0) 4486 return 0; 4487 /* level 1*/ 4488 i = map->level1[l1]; 4489 if (i == 0xFF) { 4490 return -1; 4491 } 4492 /* level 2*/ 4493 i = map->level23[16*i+l2]; 4494 if (i == 0xFF) { 4495 return -1; 4496 } 4497 /* level 3 */ 4498 i = map->level23[16*map->count2 + 128*i + l3]; 4499 if (i == 0) { 4500 return -1; 4501 } 4502 return i; 4503} 4504 4505/* Lookup the character ch in the mapping. If the character 4506 can't be found, Py_None is returned (or NULL, if another 4507 error occurred). 
*/ 4508static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4509{ 4510 PyObject *w = PyLong_FromLong((long)c); 4511 PyObject *x; 4512 4513 if (w == NULL) 4514 return NULL; 4515 x = PyObject_GetItem(mapping, w); 4516 Py_DECREF(w); 4517 if (x == NULL) { 4518 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4519 /* No mapping found means: mapping is undefined. */ 4520 PyErr_Clear(); 4521 x = Py_None; 4522 Py_INCREF(x); 4523 return x; 4524 } else 4525 return NULL; 4526 } 4527 else if (x == Py_None) 4528 return x; 4529 else if (PyLong_Check(x)) { 4530 long value = PyLong_AS_LONG(x); 4531 if (value < 0 || value > 255) { 4532 PyErr_SetString(PyExc_TypeError, 4533 "character mapping must be in range(256)"); 4534 Py_DECREF(x); 4535 return NULL; 4536 } 4537 return x; 4538 } 4539 else if (PyBytes_Check(x)) 4540 return x; 4541 else { 4542 /* wrong return value */ 4543 PyErr_Format(PyExc_TypeError, 4544 "character mapping must return integer, bytes or None, not %.400s", 4545 x->ob_type->tp_name); 4546 Py_DECREF(x); 4547 return NULL; 4548 } 4549} 4550 4551static int 4552charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4553{ 4554 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4555 /* exponentially overallocate to minimize reallocations */ 4556 if (requiredsize < 2*outsize) 4557 requiredsize = 2*outsize; 4558 if (_PyBytes_Resize(outobj, requiredsize)) 4559 return -1; 4560 return 0; 4561} 4562 4563typedef enum charmapencode_result { 4564 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4565}charmapencode_result; 4566/* lookup the character, put the result in the output string and adjust 4567 various state variables. Resize the output bytes object if not enough 4568 space is available. Return a new reference to the object that 4569 was put in the output buffer, or Py_None, if the mapping was undefined 4570 (in which case no character was written) or NULL, if a 4571 reallocation error occurred. 
The caller must decref the result */ 4572static 4573charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4574 PyObject **outobj, Py_ssize_t *outpos) 4575{ 4576 PyObject *rep; 4577 char *outstart; 4578 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4579 4580 if (Py_TYPE(mapping) == &EncodingMapType) { 4581 int res = encoding_map_lookup(c, mapping); 4582 Py_ssize_t requiredsize = *outpos+1; 4583 if (res == -1) 4584 return enc_FAILED; 4585 if (outsize<requiredsize) 4586 if (charmapencode_resize(outobj, outpos, requiredsize)) 4587 return enc_EXCEPTION; 4588 outstart = PyBytes_AS_STRING(*outobj); 4589 outstart[(*outpos)++] = (char)res; 4590 return enc_SUCCESS; 4591 } 4592 4593 rep = charmapencode_lookup(c, mapping); 4594 if (rep==NULL) 4595 return enc_EXCEPTION; 4596 else if (rep==Py_None) { 4597 Py_DECREF(rep); 4598 return enc_FAILED; 4599 } else { 4600 if (PyLong_Check(rep)) { 4601 Py_ssize_t requiredsize = *outpos+1; 4602 if (outsize<requiredsize) 4603 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4604 Py_DECREF(rep); 4605 return enc_EXCEPTION; 4606 } 4607 outstart = PyBytes_AS_STRING(*outobj); 4608 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 4609 } 4610 else { 4611 const char *repchars = PyBytes_AS_STRING(rep); 4612 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 4613 Py_ssize_t requiredsize = *outpos+repsize; 4614 if (outsize<requiredsize) 4615 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4616 Py_DECREF(rep); 4617 return enc_EXCEPTION; 4618 } 4619 outstart = PyBytes_AS_STRING(*outobj); 4620 memcpy(outstart + *outpos, repchars, repsize); 4621 *outpos += repsize; 4622 } 4623 } 4624 Py_DECREF(rep); 4625 return enc_SUCCESS; 4626} 4627 4628/* handle an error in PyUnicode_EncodeCharmap 4629 Return 0 on success, -1 on error */ 4630static 4631int charmap_encoding_error( 4632 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4633 PyObject **exceptionObject, 4634 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 4635 PyObject **res, Py_ssize_t *respos) 4636{ 4637 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4638 Py_ssize_t repsize; 4639 Py_ssize_t newpos; 4640 Py_UNICODE *uni2; 4641 /* startpos for collecting unencodable chars */ 4642 Py_ssize_t collstartpos = *inpos; 4643 Py_ssize_t collendpos = *inpos+1; 4644 Py_ssize_t collpos; 4645 char *encoding = "charmap"; 4646 char *reason = "character maps to <undefined>"; 4647 charmapencode_result x; 4648 4649 /* find all unencodable characters */ 4650 while (collendpos < size) { 4651 PyObject *rep; 4652 if (Py_TYPE(mapping) == &EncodingMapType) { 4653 int res = encoding_map_lookup(p[collendpos], mapping); 4654 if (res != -1) 4655 break; 4656 ++collendpos; 4657 continue; 4658 } 4659 4660 rep = charmapencode_lookup(p[collendpos], mapping); 4661 if (rep==NULL) 4662 return -1; 4663 else if (rep!=Py_None) { 4664 Py_DECREF(rep); 4665 break; 4666 } 4667 Py_DECREF(rep); 4668 ++collendpos; 4669 } 4670 /* cache callback name lookup 4671 * (if not done yet, i.e. 
it's the first error) */ 4672 if (*known_errorHandler==-1) { 4673 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4674 *known_errorHandler = 1; 4675 else if (!strcmp(errors, "replace")) 4676 *known_errorHandler = 2; 4677 else if (!strcmp(errors, "ignore")) 4678 *known_errorHandler = 3; 4679 else if (!strcmp(errors, "xmlcharrefreplace")) 4680 *known_errorHandler = 4; 4681 else 4682 *known_errorHandler = 0; 4683 } 4684 switch (*known_errorHandler) { 4685 case 1: /* strict */ 4686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4687 return -1; 4688 case 2: /* replace */ 4689 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4690 x = charmapencode_output('?', mapping, res, respos); 4691 if (x==enc_EXCEPTION) { 4692 return -1; 4693 } 4694 else if (x==enc_FAILED) { 4695 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4696 return -1; 4697 } 4698 } 4699 /* fall through */ 4700 case 3: /* ignore */ 4701 *inpos = collendpos; 4702 break; 4703 case 4: /* xmlcharrefreplace */ 4704 /* generate replacement (temporarily (mis)uses p) */ 4705 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4706 char buffer[2+29+1+1]; 4707 char *cp; 4708 sprintf(buffer, "&#%d;", (int)p[collpos]); 4709 for (cp = buffer; *cp; ++cp) { 4710 x = charmapencode_output(*cp, mapping, res, respos); 4711 if (x==enc_EXCEPTION) 4712 return -1; 4713 else if (x==enc_FAILED) { 4714 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4715 return -1; 4716 } 4717 } 4718 } 4719 *inpos = collendpos; 4720 break; 4721 default: 4722 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4723 encoding, reason, p, size, exceptionObject, 4724 collstartpos, collendpos, &newpos); 4725 if (repunicode == NULL) 4726 return -1; 4727 /* generate replacement */ 4728 repsize = PyUnicode_GET_SIZE(repunicode); 4729 for (uni2 = PyUnicode_AS_UNICODE(repunicode); 
repsize-->0; ++uni2) { 4730 x = charmapencode_output(*uni2, mapping, res, respos); 4731 if (x==enc_EXCEPTION) { 4732 return -1; 4733 } 4734 else if (x==enc_FAILED) { 4735 Py_DECREF(repunicode); 4736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4737 return -1; 4738 } 4739 } 4740 *inpos = newpos; 4741 Py_DECREF(repunicode); 4742 } 4743 return 0; 4744} 4745 4746PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4747 Py_ssize_t size, 4748 PyObject *mapping, 4749 const char *errors) 4750{ 4751 /* output object */ 4752 PyObject *res = NULL; 4753 /* current input position */ 4754 Py_ssize_t inpos = 0; 4755 /* current output position */ 4756 Py_ssize_t respos = 0; 4757 PyObject *errorHandler = NULL; 4758 PyObject *exc = NULL; 4759 /* the following variable is used for caching string comparisons 4760 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4761 * 3=ignore, 4=xmlcharrefreplace */ 4762 int known_errorHandler = -1; 4763 4764 /* Default to Latin-1 */ 4765 if (mapping == NULL) 4766 return PyUnicode_EncodeLatin1(p, size, errors); 4767 4768 /* allocate enough for a simple encoding without 4769 replacements, if we need more, we'll resize */ 4770 res = PyBytes_FromStringAndSize(NULL, size); 4771 if (res == NULL) 4772 goto onError; 4773 if (size == 0) 4774 return res; 4775 4776 while (inpos<size) { 4777 /* try to encode it */ 4778 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 4779 if (x==enc_EXCEPTION) /* error */ 4780 goto onError; 4781 if (x==enc_FAILED) { /* unencodable character */ 4782 if (charmap_encoding_error(p, size, &inpos, mapping, 4783 &exc, 4784 &known_errorHandler, &errorHandler, errors, 4785 &res, &respos)) { 4786 goto onError; 4787 } 4788 } 4789 else 4790 /* done with this character => adjust input position */ 4791 ++inpos; 4792 } 4793 4794 /* Resize if we allocated to much */ 4795 if (respos<PyBytes_GET_SIZE(res)) 4796 _PyBytes_Resize(&res, respos); 4797 4798 
Py_XDECREF(exc); 4799 Py_XDECREF(errorHandler); 4800 return res; 4801 4802 onError: 4803 Py_XDECREF(res); 4804 Py_XDECREF(exc); 4805 Py_XDECREF(errorHandler); 4806 return NULL; 4807} 4808 4809PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4810 PyObject *mapping) 4811{ 4812 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4813 PyErr_BadArgument(); 4814 return NULL; 4815 } 4816 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4817 PyUnicode_GET_SIZE(unicode), 4818 mapping, 4819 NULL); 4820} 4821 4822/* create or adjust a UnicodeTranslateError */ 4823static void make_translate_exception(PyObject **exceptionObject, 4824 const Py_UNICODE *unicode, Py_ssize_t size, 4825 Py_ssize_t startpos, Py_ssize_t endpos, 4826 const char *reason) 4827{ 4828 if (*exceptionObject == NULL) { 4829 *exceptionObject = PyUnicodeTranslateError_Create( 4830 unicode, size, startpos, endpos, reason); 4831 } 4832 else { 4833 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4834 goto onError; 4835 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4836 goto onError; 4837 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4838 goto onError; 4839 return; 4840 onError: 4841 Py_DECREF(*exceptionObject); 4842 *exceptionObject = NULL; 4843 } 4844} 4845 4846/* raises a UnicodeTranslateError */ 4847static void raise_translate_exception(PyObject **exceptionObject, 4848 const Py_UNICODE *unicode, Py_ssize_t size, 4849 Py_ssize_t startpos, Py_ssize_t endpos, 4850 const char *reason) 4851{ 4852 make_translate_exception(exceptionObject, 4853 unicode, size, startpos, endpos, reason); 4854 if (*exceptionObject != NULL) 4855 PyCodec_StrictErrors(*exceptionObject); 4856} 4857 4858/* error handling callback helper: 4859 build arguments, call the callback and check the arguments, 4860 put the result into newpos and return the replacement string, which 4861 has to be freed by the caller */ 4862static PyObject *unicode_translate_call_errorhandler(const 
char *errors, 4863 PyObject **errorHandler, 4864 const char *reason, 4865 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4866 Py_ssize_t startpos, Py_ssize_t endpos, 4867 Py_ssize_t *newpos) 4868{ 4869 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 4870 4871 Py_ssize_t i_newpos; 4872 PyObject *restuple; 4873 PyObject *resunicode; 4874 4875 if (*errorHandler == NULL) { 4876 *errorHandler = PyCodec_LookupError(errors); 4877 if (*errorHandler == NULL) 4878 return NULL; 4879 } 4880 4881 make_translate_exception(exceptionObject, 4882 unicode, size, startpos, endpos, reason); 4883 if (*exceptionObject == NULL) 4884 return NULL; 4885 4886 restuple = PyObject_CallFunctionObjArgs( 4887 *errorHandler, *exceptionObject, NULL); 4888 if (restuple == NULL) 4889 return NULL; 4890 if (!PyTuple_Check(restuple)) { 4891 PyErr_Format(PyExc_TypeError, &argparse[4]); 4892 Py_DECREF(restuple); 4893 return NULL; 4894 } 4895 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4896 &resunicode, &i_newpos)) { 4897 Py_DECREF(restuple); 4898 return NULL; 4899 } 4900 if (i_newpos<0) 4901 *newpos = size+i_newpos; 4902 else 4903 *newpos = i_newpos; 4904 if (*newpos<0 || *newpos>size) { 4905 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4906 Py_DECREF(restuple); 4907 return NULL; 4908 } 4909 Py_INCREF(resunicode); 4910 Py_DECREF(restuple); 4911 return resunicode; 4912} 4913 4914/* Lookup the character ch in the mapping and put the result in result, 4915 which must be decrefed by the caller. 
4916 Return 0 on success, -1 on error */ 4917static 4918int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4919{ 4920 PyObject *w = PyLong_FromLong((long)c); 4921 PyObject *x; 4922 4923 if (w == NULL) 4924 return -1; 4925 x = PyObject_GetItem(mapping, w); 4926 Py_DECREF(w); 4927 if (x == NULL) { 4928 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4929 /* No mapping found means: use 1:1 mapping. */ 4930 PyErr_Clear(); 4931 *result = NULL; 4932 return 0; 4933 } else 4934 return -1; 4935 } 4936 else if (x == Py_None) { 4937 *result = x; 4938 return 0; 4939 } 4940 else if (PyLong_Check(x)) { 4941 long value = PyLong_AS_LONG(x); 4942 long max = PyUnicode_GetMax(); 4943 if (value < 0 || value > max) { 4944 PyErr_Format(PyExc_TypeError, 4945 "character mapping must be in range(0x%x)", max+1); 4946 Py_DECREF(x); 4947 return -1; 4948 } 4949 *result = x; 4950 return 0; 4951 } 4952 else if (PyUnicode_Check(x)) { 4953 *result = x; 4954 return 0; 4955 } 4956 else { 4957 /* wrong return value */ 4958 PyErr_SetString(PyExc_TypeError, 4959 "character mapping must return integer, None or str"); 4960 Py_DECREF(x); 4961 return -1; 4962 } 4963} 4964/* ensure that *outobj is at least requiredsize characters long, 4965if not reallocate and adjust various state variables. 
4966Return 0 on success, -1 on error */ 4967static 4968int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4969 Py_ssize_t requiredsize) 4970{ 4971 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4972 if (requiredsize > oldsize) { 4973 /* remember old output position */ 4974 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4975 /* exponentially overallocate to minimize reallocations */ 4976 if (requiredsize < 2 * oldsize) 4977 requiredsize = 2 * oldsize; 4978 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 4979 return -1; 4980 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4981 } 4982 return 0; 4983} 4984/* lookup the character, put the result in the output string and adjust 4985 various state variables. Return a new reference to the object that 4986 was put in the output buffer in *result, or Py_None, if the mapping was 4987 undefined (in which case no character was written). 4988 The called must decref result. 4989 Return 0 on success, -1 on error. */ 4990static 4991int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 4992 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 4993 PyObject **res) 4994{ 4995 if (charmaptranslate_lookup(*curinp, mapping, res)) 4996 return -1; 4997 if (*res==NULL) { 4998 /* not found => default to 1:1 mapping */ 4999 *(*outp)++ = *curinp; 5000 } 5001 else if (*res==Py_None) 5002 ; 5003 else if (PyLong_Check(*res)) { 5004 /* no overflow check, because we know that the space is enough */ 5005 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 5006 } 5007 else if (PyUnicode_Check(*res)) { 5008 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5009 if (repsize==1) { 5010 /* no overflow check, because we know that the space is enough */ 5011 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5012 } 5013 else if (repsize!=0) { 5014 /* more than one character */ 5015 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5016 (insize - (curinp-startinp)) + 5017 
repsize - 1; 5018 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5019 return -1; 5020 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5021 *outp += repsize; 5022 } 5023 } 5024 else 5025 return -1; 5026 return 0; 5027} 5028 5029PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5030 Py_ssize_t size, 5031 PyObject *mapping, 5032 const char *errors) 5033{ 5034 /* output object */ 5035 PyObject *res = NULL; 5036 /* pointers to the beginning and end+1 of input */ 5037 const Py_UNICODE *startp = p; 5038 const Py_UNICODE *endp = p + size; 5039 /* pointer into the output */ 5040 Py_UNICODE *str; 5041 /* current output position */ 5042 Py_ssize_t respos = 0; 5043 char *reason = "character maps to <undefined>"; 5044 PyObject *errorHandler = NULL; 5045 PyObject *exc = NULL; 5046 /* the following variable is used for caching string comparisons 5047 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5048 * 3=ignore, 4=xmlcharrefreplace */ 5049 int known_errorHandler = -1; 5050 5051 if (mapping == NULL) { 5052 PyErr_BadArgument(); 5053 return NULL; 5054 } 5055 5056 /* allocate enough for a simple 1:1 translation without 5057 replacements, if we need more, we'll resize */ 5058 res = PyUnicode_FromUnicode(NULL, size); 5059 if (res == NULL) 5060 goto onError; 5061 if (size == 0) 5062 return res; 5063 str = PyUnicode_AS_UNICODE(res); 5064 5065 while (p<endp) { 5066 /* try to encode it */ 5067 PyObject *x = NULL; 5068 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5069 Py_XDECREF(x); 5070 goto onError; 5071 } 5072 Py_XDECREF(x); 5073 if (x!=Py_None) /* it worked => adjust input pointer */ 5074 ++p; 5075 else { /* untranslatable character */ 5076 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5077 Py_ssize_t repsize; 5078 Py_ssize_t newpos; 5079 Py_UNICODE *uni2; 5080 /* startpos for collecting untranslatable chars */ 5081 const Py_UNICODE *collstart = p; 5082 const Py_UNICODE *collend = p+1; 
5083 const Py_UNICODE *coll; 5084 5085 /* find all untranslatable characters */ 5086 while (collend < endp) { 5087 if (charmaptranslate_lookup(*collend, mapping, &x)) 5088 goto onError; 5089 Py_XDECREF(x); 5090 if (x!=Py_None) 5091 break; 5092 ++collend; 5093 } 5094 /* cache callback name lookup 5095 * (if not done yet, i.e. it's the first error) */ 5096 if (known_errorHandler==-1) { 5097 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5098 known_errorHandler = 1; 5099 else if (!strcmp(errors, "replace")) 5100 known_errorHandler = 2; 5101 else if (!strcmp(errors, "ignore")) 5102 known_errorHandler = 3; 5103 else if (!strcmp(errors, "xmlcharrefreplace")) 5104 known_errorHandler = 4; 5105 else 5106 known_errorHandler = 0; 5107 } 5108 switch (known_errorHandler) { 5109 case 1: /* strict */ 5110 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5111 goto onError; 5112 case 2: /* replace */ 5113 /* No need to check for space, this is a 1:1 replacement */ 5114 for (coll = collstart; coll<collend; ++coll) 5115 *str++ = '?'; 5116 /* fall through */ 5117 case 3: /* ignore */ 5118 p = collend; 5119 break; 5120 case 4: /* xmlcharrefreplace */ 5121 /* generate replacement (temporarily (mis)uses p) */ 5122 for (p = collstart; p < collend; ++p) { 5123 char buffer[2+29+1+1]; 5124 char *cp; 5125 sprintf(buffer, "&#%d;", (int)*p); 5126 if (charmaptranslate_makespace(&res, &str, 5127 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5128 goto onError; 5129 for (cp = buffer; *cp; ++cp) 5130 *str++ = *cp; 5131 } 5132 p = collend; 5133 break; 5134 default: 5135 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5136 reason, startp, size, &exc, 5137 collstart-startp, collend-startp, &newpos); 5138 if (repunicode == NULL) 5139 goto onError; 5140 /* generate replacement */ 5141 repsize = PyUnicode_GET_SIZE(repunicode); 5142 if (charmaptranslate_makespace(&res, &str, 5143 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5144 Py_DECREF(repunicode); 5145 goto onError; 5146 } 5147 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5148 *str++ = *uni2; 5149 p = startp + newpos; 5150 Py_DECREF(repunicode); 5151 } 5152 } 5153 } 5154 /* Resize if we allocated to much */ 5155 respos = str-PyUnicode_AS_UNICODE(res); 5156 if (respos<PyUnicode_GET_SIZE(res)) { 5157 if (_PyUnicode_Resize(&res, respos) < 0) 5158 goto onError; 5159 } 5160 Py_XDECREF(exc); 5161 Py_XDECREF(errorHandler); 5162 return res; 5163 5164 onError: 5165 Py_XDECREF(res); 5166 Py_XDECREF(exc); 5167 Py_XDECREF(errorHandler); 5168 return NULL; 5169} 5170 5171PyObject *PyUnicode_Translate(PyObject *str, 5172 PyObject *mapping, 5173 const char *errors) 5174{ 5175 PyObject *result; 5176 5177 str = PyUnicode_FromObject(str); 5178 if (str == NULL) 5179 goto onError; 5180 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5181 PyUnicode_GET_SIZE(str), 5182 mapping, 5183 errors); 5184 Py_DECREF(str); 5185 return result; 5186 5187 onError: 5188 Py_XDECREF(str); 5189 return NULL; 5190} 5191 5192/* --- Decimal Encoder ---------------------------------------------------- */ 5193 5194int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5195 Py_ssize_t length, 5196 char *output, 5197 const char *errors) 5198{ 5199 Py_UNICODE *p, *end; 5200 PyObject *errorHandler = NULL; 5201 PyObject *exc = NULL; 5202 const char *encoding = "decimal"; 5203 const char *reason = "invalid decimal Unicode string"; 5204 /* the following variable is used for caching string comparisons 5205 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5206 int known_errorHandler = -1; 5207 5208 if (output == NULL) { 5209 PyErr_BadArgument(); 5210 return -1; 5211 } 5212 5213 p = s; 5214 end = s + length; 5215 while (p < end) { 5216 register Py_UNICODE ch = *p; 5217 int decimal; 5218 PyObject *repunicode; 5219 Py_ssize_t repsize; 5220 Py_ssize_t newpos; 5221 
Py_UNICODE *uni2; 5222 Py_UNICODE *collstart; 5223 Py_UNICODE *collend; 5224 5225 if (Py_UNICODE_ISSPACE(ch)) { 5226 *output++ = ' '; 5227 ++p; 5228 continue; 5229 } 5230 decimal = Py_UNICODE_TODECIMAL(ch); 5231 if (decimal >= 0) { 5232 *output++ = '0' + decimal; 5233 ++p; 5234 continue; 5235 } 5236 if (0 < ch && ch < 256) { 5237 *output++ = (char)ch; 5238 ++p; 5239 continue; 5240 } 5241 /* All other characters are considered unencodable */ 5242 collstart = p; 5243 collend = p+1; 5244 while (collend < end) { 5245 if ((0 < *collend && *collend < 256) || 5246 !Py_UNICODE_ISSPACE(*collend) || 5247 Py_UNICODE_TODECIMAL(*collend)) 5248 break; 5249 } 5250 /* cache callback name lookup 5251 * (if not done yet, i.e. it's the first error) */ 5252 if (known_errorHandler==-1) { 5253 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5254 known_errorHandler = 1; 5255 else if (!strcmp(errors, "replace")) 5256 known_errorHandler = 2; 5257 else if (!strcmp(errors, "ignore")) 5258 known_errorHandler = 3; 5259 else if (!strcmp(errors, "xmlcharrefreplace")) 5260 known_errorHandler = 4; 5261 else 5262 known_errorHandler = 0; 5263 } 5264 switch (known_errorHandler) { 5265 case 1: /* strict */ 5266 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5267 goto onError; 5268 case 2: /* replace */ 5269 for (p = collstart; p < collend; ++p) 5270 *output++ = '?'; 5271 /* fall through */ 5272 case 3: /* ignore */ 5273 p = collend; 5274 break; 5275 case 4: /* xmlcharrefreplace */ 5276 /* generate replacement (temporarily (mis)uses p) */ 5277 for (p = collstart; p < collend; ++p) 5278 output += sprintf(output, "&#%d;", (int)*p); 5279 p = collend; 5280 break; 5281 default: 5282 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5283 encoding, reason, s, length, &exc, 5284 collstart-s, collend-s, &newpos); 5285 if (repunicode == NULL) 5286 goto onError; 5287 /* generate replacement */ 5288 repsize = PyUnicode_GET_SIZE(repunicode); 5289 for (uni2 = 
PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5290 Py_UNICODE ch = *uni2; 5291 if (Py_UNICODE_ISSPACE(ch)) 5292 *output++ = ' '; 5293 else { 5294 decimal = Py_UNICODE_TODECIMAL(ch); 5295 if (decimal >= 0) 5296 *output++ = '0' + decimal; 5297 else if (0 < ch && ch < 256) 5298 *output++ = (char)ch; 5299 else { 5300 Py_DECREF(repunicode); 5301 raise_encode_exception(&exc, encoding, 5302 s, length, collstart-s, collend-s, reason); 5303 goto onError; 5304 } 5305 } 5306 } 5307 p = s + newpos; 5308 Py_DECREF(repunicode); 5309 } 5310 } 5311 /* 0-terminate the output string */ 5312 *output++ = '\0'; 5313 Py_XDECREF(exc); 5314 Py_XDECREF(errorHandler); 5315 return 0; 5316 5317 onError: 5318 Py_XDECREF(exc); 5319 Py_XDECREF(errorHandler); 5320 return -1; 5321} 5322 5323/* --- Helpers ------------------------------------------------------------ */ 5324 5325#include "stringlib/unicodedefs.h" 5326#include "stringlib/fastsearch.h" 5327#include "stringlib/count.h" 5328/* Include _ParseTupleFinds from find.h */ 5329#define FROM_UNICODE 5330#include "stringlib/find.h" 5331#include "stringlib/partition.h" 5332 5333#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 5334#include "stringlib/localeutil.h" 5335 5336/* helper macro to fixup start/end slice values */ 5337#define FIX_START_END(obj) \ 5338 if (start < 0) \ 5339 start += (obj)->length; \ 5340 if (start < 0) \ 5341 start = 0; \ 5342 if (end > (obj)->length) \ 5343 end = (obj)->length; \ 5344 if (end < 0) \ 5345 end += (obj)->length; \ 5346 if (end < 0) \ 5347 end = 0; 5348 5349Py_ssize_t PyUnicode_Count(PyObject *str, 5350 PyObject *substr, 5351 Py_ssize_t start, 5352 Py_ssize_t end) 5353{ 5354 Py_ssize_t result; 5355 PyUnicodeObject* str_obj; 5356 PyUnicodeObject* sub_obj; 5357 5358 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5359 if (!str_obj) 5360 return -1; 5361 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5362 if (!sub_obj) { 5363 Py_DECREF(str_obj); 5364 return -1; 
5365 } 5366 5367 FIX_START_END(str_obj); 5368 5369 result = stringlib_count( 5370 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5371 ); 5372 5373 Py_DECREF(sub_obj); 5374 Py_DECREF(str_obj); 5375 5376 return result; 5377} 5378 5379Py_ssize_t PyUnicode_Find(PyObject *str, 5380 PyObject *sub, 5381 Py_ssize_t start, 5382 Py_ssize_t end, 5383 int direction) 5384{ 5385 Py_ssize_t result; 5386 5387 str = PyUnicode_FromObject(str); 5388 if (!str) 5389 return -2; 5390 sub = PyUnicode_FromObject(sub); 5391 if (!sub) { 5392 Py_DECREF(str); 5393 return -2; 5394 } 5395 5396 if (direction > 0) 5397 result = stringlib_find_slice( 5398 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5399 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5400 start, end 5401 ); 5402 else 5403 result = stringlib_rfind_slice( 5404 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5405 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5406 start, end 5407 ); 5408 5409 Py_DECREF(str); 5410 Py_DECREF(sub); 5411 5412 return result; 5413} 5414 5415static 5416int tailmatch(PyUnicodeObject *self, 5417 PyUnicodeObject *substring, 5418 Py_ssize_t start, 5419 Py_ssize_t end, 5420 int direction) 5421{ 5422 if (substring->length == 0) 5423 return 1; 5424 5425 FIX_START_END(self); 5426 5427 end -= substring->length; 5428 if (end < start) 5429 return 0; 5430 5431 if (direction > 0) { 5432 if (Py_UNICODE_MATCH(self, end, substring)) 5433 return 1; 5434 } else { 5435 if (Py_UNICODE_MATCH(self, start, substring)) 5436 return 1; 5437 } 5438 5439 return 0; 5440} 5441 5442Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5443 PyObject *substr, 5444 Py_ssize_t start, 5445 Py_ssize_t end, 5446 int direction) 5447{ 5448 Py_ssize_t result; 5449 5450 str = PyUnicode_FromObject(str); 5451 if (str == NULL) 5452 return -1; 5453 substr = PyUnicode_FromObject(substr); 5454 if (substr == NULL) { 5455 Py_DECREF(str); 5456 return -1; 5457 } 5458 5459 result = tailmatch((PyUnicodeObject *)str, 5460 
(PyUnicodeObject *)substr, 5461 start, end, direction); 5462 Py_DECREF(str); 5463 Py_DECREF(substr); 5464 return result; 5465} 5466 5467/* Apply fixfct filter to the Unicode object self and return a 5468 reference to the modified object */ 5469 5470static 5471PyObject *fixup(PyUnicodeObject *self, 5472 int (*fixfct)(PyUnicodeObject *s)) 5473{ 5474 5475 PyUnicodeObject *u; 5476 5477 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5478 if (u == NULL) 5479 return NULL; 5480 5481 Py_UNICODE_COPY(u->str, self->str, self->length); 5482 5483 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5484 /* fixfct should return TRUE if it modified the buffer. If 5485 FALSE, return a reference to the original buffer instead 5486 (to save space, not time) */ 5487 Py_INCREF(self); 5488 Py_DECREF(u); 5489 return (PyObject*) self; 5490 } 5491 return (PyObject*) u; 5492} 5493 5494static 5495int fixupper(PyUnicodeObject *self) 5496{ 5497 Py_ssize_t len = self->length; 5498 Py_UNICODE *s = self->str; 5499 int status = 0; 5500 5501 while (len-- > 0) { 5502 register Py_UNICODE ch; 5503 5504 ch = Py_UNICODE_TOUPPER(*s); 5505 if (ch != *s) { 5506 status = 1; 5507 *s = ch; 5508 } 5509 s++; 5510 } 5511 5512 return status; 5513} 5514 5515static 5516int fixlower(PyUnicodeObject *self) 5517{ 5518 Py_ssize_t len = self->length; 5519 Py_UNICODE *s = self->str; 5520 int status = 0; 5521 5522 while (len-- > 0) { 5523 register Py_UNICODE ch; 5524 5525 ch = Py_UNICODE_TOLOWER(*s); 5526 if (ch != *s) { 5527 status = 1; 5528 *s = ch; 5529 } 5530 s++; 5531 } 5532 5533 return status; 5534} 5535 5536static 5537int fixswapcase(PyUnicodeObject *self) 5538{ 5539 Py_ssize_t len = self->length; 5540 Py_UNICODE *s = self->str; 5541 int status = 0; 5542 5543 while (len-- > 0) { 5544 if (Py_UNICODE_ISUPPER(*s)) { 5545 *s = Py_UNICODE_TOLOWER(*s); 5546 status = 1; 5547 } else if (Py_UNICODE_ISLOWER(*s)) { 5548 *s = Py_UNICODE_TOUPPER(*s); 5549 status = 1; 5550 } 5551 s++; 5552 } 5553 5554 return 
status; 5555} 5556 5557static 5558int fixcapitalize(PyUnicodeObject *self) 5559{ 5560 Py_ssize_t len = self->length; 5561 Py_UNICODE *s = self->str; 5562 int status = 0; 5563 5564 if (len == 0) 5565 return 0; 5566 if (Py_UNICODE_ISLOWER(*s)) { 5567 *s = Py_UNICODE_TOUPPER(*s); 5568 status = 1; 5569 } 5570 s++; 5571 while (--len > 0) { 5572 if (Py_UNICODE_ISUPPER(*s)) { 5573 *s = Py_UNICODE_TOLOWER(*s); 5574 status = 1; 5575 } 5576 s++; 5577 } 5578 return status; 5579} 5580 5581static 5582int fixtitle(PyUnicodeObject *self) 5583{ 5584 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5585 register Py_UNICODE *e; 5586 int previous_is_cased; 5587 5588 /* Shortcut for single character strings */ 5589 if (PyUnicode_GET_SIZE(self) == 1) { 5590 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5591 if (*p != ch) { 5592 *p = ch; 5593 return 1; 5594 } 5595 else 5596 return 0; 5597 } 5598 5599 e = p + PyUnicode_GET_SIZE(self); 5600 previous_is_cased = 0; 5601 for (; p < e; p++) { 5602 register const Py_UNICODE ch = *p; 5603 5604 if (previous_is_cased) 5605 *p = Py_UNICODE_TOLOWER(ch); 5606 else 5607 *p = Py_UNICODE_TOTITLE(ch); 5608 5609 if (Py_UNICODE_ISLOWER(ch) || 5610 Py_UNICODE_ISUPPER(ch) || 5611 Py_UNICODE_ISTITLE(ch)) 5612 previous_is_cased = 1; 5613 else 5614 previous_is_cased = 0; 5615 } 5616 return 1; 5617} 5618 5619PyObject * 5620PyUnicode_Join(PyObject *separator, PyObject *seq) 5621{ 5622 const Py_UNICODE blank = ' '; 5623 const Py_UNICODE *sep = ␣ 5624 Py_ssize_t seplen = 1; 5625 PyUnicodeObject *res = NULL; /* the result */ 5626 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5627 PyObject *fseq; /* PySequence_Fast(seq) */ 5628 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5629 PyObject **items; 5630 PyObject *item; 5631 Py_ssize_t sz, i; 5632 5633 fseq = PySequence_Fast(seq, ""); 5634 if (fseq == NULL) { 5635 return NULL; 5636 } 5637 5638 /* NOTE: the following code can't call back into Python code, 5639 * so we are sure that 
fseq won't be mutated. 5640 */ 5641 5642 seqlen = PySequence_Fast_GET_SIZE(fseq); 5643 /* If empty sequence, return u"". */ 5644 if (seqlen == 0) { 5645 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5646 goto Done; 5647 } 5648 items = PySequence_Fast_ITEMS(fseq); 5649 /* If singleton sequence with an exact Unicode, return that. */ 5650 if (seqlen == 1) { 5651 item = items[0]; 5652 if (PyUnicode_CheckExact(item)) { 5653 Py_INCREF(item); 5654 res = (PyUnicodeObject *)item; 5655 goto Done; 5656 } 5657 } 5658 else { 5659 /* Set up sep and seplen */ 5660 if (separator == NULL) { 5661 sep = ␣ 5662 seplen = 1; 5663 } 5664 else { 5665 if (!PyUnicode_Check(separator)) { 5666 PyErr_Format(PyExc_TypeError, 5667 "separator: expected str instance," 5668 " %.80s found", 5669 Py_TYPE(separator)->tp_name); 5670 goto onError; 5671 } 5672 sep = PyUnicode_AS_UNICODE(separator); 5673 seplen = PyUnicode_GET_SIZE(separator); 5674 } 5675 } 5676 5677 /* There are at least two things to join, or else we have a subclass 5678 * of str in the sequence. 5679 * Do a pre-pass to figure out the total amount of space we'll 5680 * need (sz), and see whether all argument are strings. 5681 */ 5682 sz = 0; 5683 for (i = 0; i < seqlen; i++) { 5684 const Py_ssize_t old_sz = sz; 5685 item = items[i]; 5686 if (!PyUnicode_Check(item)) { 5687 PyErr_Format(PyExc_TypeError, 5688 "sequence item %zd: expected str instance," 5689 " %.80s found", 5690 i, Py_TYPE(item)->tp_name); 5691 goto onError; 5692 } 5693 sz += PyUnicode_GET_SIZE(item); 5694 if (i != 0) 5695 sz += seplen; 5696 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 5697 PyErr_SetString(PyExc_OverflowError, 5698 "join() result is too long for a Python string"); 5699 goto onError; 5700 } 5701 } 5702 5703 res = _PyUnicode_New(sz); 5704 if (res == NULL) 5705 goto onError; 5706 5707 /* Catenate everything. 
*/ 5708 res_p = PyUnicode_AS_UNICODE(res); 5709 for (i = 0; i < seqlen; ++i) { 5710 Py_ssize_t itemlen; 5711 item = items[i]; 5712 itemlen = PyUnicode_GET_SIZE(item); 5713 /* Copy item, and maybe the separator. */ 5714 if (i) { 5715 Py_UNICODE_COPY(res_p, sep, seplen); 5716 res_p += seplen; 5717 } 5718 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5719 res_p += itemlen; 5720 } 5721 5722 Done: 5723 Py_DECREF(fseq); 5724 return (PyObject *)res; 5725 5726 onError: 5727 Py_DECREF(fseq); 5728 Py_XDECREF(res); 5729 return NULL; 5730} 5731 5732static 5733PyUnicodeObject *pad(PyUnicodeObject *self, 5734 Py_ssize_t left, 5735 Py_ssize_t right, 5736 Py_UNICODE fill) 5737{ 5738 PyUnicodeObject *u; 5739 5740 if (left < 0) 5741 left = 0; 5742 if (right < 0) 5743 right = 0; 5744 5745 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5746 Py_INCREF(self); 5747 return self; 5748 } 5749 5750 u = _PyUnicode_New(left + self->length + right); 5751 if (u) { 5752 if (left) 5753 Py_UNICODE_FILL(u->str, fill, left); 5754 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5755 if (right) 5756 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5757 } 5758 5759 return u; 5760} 5761 5762#define SPLIT_APPEND(data, left, right) \ 5763 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5764 if (!str) \ 5765 goto onError; \ 5766 if (PyList_Append(list, str)) { \ 5767 Py_DECREF(str); \ 5768 goto onError; \ 5769 } \ 5770 else \ 5771 Py_DECREF(str); 5772 5773static 5774PyObject *split_whitespace(PyUnicodeObject *self, 5775 PyObject *list, 5776 Py_ssize_t maxcount) 5777{ 5778 register Py_ssize_t i; 5779 register Py_ssize_t j; 5780 Py_ssize_t len = self->length; 5781 PyObject *str; 5782 register const Py_UNICODE *buf = self->str; 5783 5784 for (i = j = 0; i < len; ) { 5785 /* find a token */ 5786 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5787 i++; 5788 j = i; 5789 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) 5790 i++; 5791 if (j < i) { 5792 
if (maxcount-- <= 0) 5793 break; 5794 SPLIT_APPEND(buf, j, i); 5795 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5796 i++; 5797 j = i; 5798 } 5799 } 5800 if (j < len) { 5801 SPLIT_APPEND(buf, j, len); 5802 } 5803 return list; 5804 5805 onError: 5806 Py_DECREF(list); 5807 return NULL; 5808} 5809 5810PyObject *PyUnicode_Splitlines(PyObject *string, 5811 int keepends) 5812{ 5813 register Py_ssize_t i; 5814 register Py_ssize_t j; 5815 Py_ssize_t len; 5816 PyObject *list; 5817 PyObject *str; 5818 Py_UNICODE *data; 5819 5820 string = PyUnicode_FromObject(string); 5821 if (string == NULL) 5822 return NULL; 5823 data = PyUnicode_AS_UNICODE(string); 5824 len = PyUnicode_GET_SIZE(string); 5825 5826 list = PyList_New(0); 5827 if (!list) 5828 goto onError; 5829 5830 for (i = j = 0; i < len; ) { 5831 Py_ssize_t eol; 5832 5833 /* Find a line and append it */ 5834 while (i < len && !BLOOM_LINEBREAK(data[i])) 5835 i++; 5836 5837 /* Skip the line break reading CRLF as one line break */ 5838 eol = i; 5839 if (i < len) { 5840 if (data[i] == '\r' && i + 1 < len && 5841 data[i+1] == '\n') 5842 i += 2; 5843 else 5844 i++; 5845 if (keepends) 5846 eol = i; 5847 } 5848 SPLIT_APPEND(data, j, eol); 5849 j = i; 5850 } 5851 if (j < len) { 5852 SPLIT_APPEND(data, j, len); 5853 } 5854 5855 Py_DECREF(string); 5856 return list; 5857 5858 onError: 5859 Py_XDECREF(list); 5860 Py_DECREF(string); 5861 return NULL; 5862} 5863 5864static 5865PyObject *split_char(PyUnicodeObject *self, 5866 PyObject *list, 5867 Py_UNICODE ch, 5868 Py_ssize_t maxcount) 5869{ 5870 register Py_ssize_t i; 5871 register Py_ssize_t j; 5872 Py_ssize_t len = self->length; 5873 PyObject *str; 5874 register const Py_UNICODE *buf = self->str; 5875 5876 for (i = j = 0; i < len; ) { 5877 if (buf[i] == ch) { 5878 if (maxcount-- <= 0) 5879 break; 5880 SPLIT_APPEND(buf, j, i); 5881 i = j = i + 1; 5882 } else 5883 i++; 5884 } 5885 if (j <= len) { 5886 SPLIT_APPEND(buf, j, len); 5887 } 5888 return list; 5889 5890 onError: 5891 
Py_DECREF(list); 5892 return NULL; 5893} 5894 5895static 5896PyObject *split_substring(PyUnicodeObject *self, 5897 PyObject *list, 5898 PyUnicodeObject *substring, 5899 Py_ssize_t maxcount) 5900{ 5901 register Py_ssize_t i; 5902 register Py_ssize_t j; 5903 Py_ssize_t len = self->length; 5904 Py_ssize_t sublen = substring->length; 5905 PyObject *str; 5906 5907 for (i = j = 0; i <= len - sublen; ) { 5908 if (Py_UNICODE_MATCH(self, i, substring)) { 5909 if (maxcount-- <= 0) 5910 break; 5911 SPLIT_APPEND(self->str, j, i); 5912 i = j = i + sublen; 5913 } else 5914 i++; 5915 } 5916 if (j <= len) { 5917 SPLIT_APPEND(self->str, j, len); 5918 } 5919 return list; 5920 5921 onError: 5922 Py_DECREF(list); 5923 return NULL; 5924} 5925 5926static 5927PyObject *rsplit_whitespace(PyUnicodeObject *self, 5928 PyObject *list, 5929 Py_ssize_t maxcount) 5930{ 5931 register Py_ssize_t i; 5932 register Py_ssize_t j; 5933 Py_ssize_t len = self->length; 5934 PyObject *str; 5935 register const Py_UNICODE *buf = self->str; 5936 5937 for (i = j = len - 1; i >= 0; ) { 5938 /* find a token */ 5939 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5940 i--; 5941 j = i; 5942 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) 5943 i--; 5944 if (j > i) { 5945 if (maxcount-- <= 0) 5946 break; 5947 SPLIT_APPEND(buf, i + 1, j + 1); 5948 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5949 i--; 5950 j = i; 5951 } 5952 } 5953 if (j >= 0) { 5954 SPLIT_APPEND(buf, 0, j + 1); 5955 } 5956 if (PyList_Reverse(list) < 0) 5957 goto onError; 5958 return list; 5959 5960 onError: 5961 Py_DECREF(list); 5962 return NULL; 5963} 5964 5965static 5966PyObject *rsplit_char(PyUnicodeObject *self, 5967 PyObject *list, 5968 Py_UNICODE ch, 5969 Py_ssize_t maxcount) 5970{ 5971 register Py_ssize_t i; 5972 register Py_ssize_t j; 5973 Py_ssize_t len = self->length; 5974 PyObject *str; 5975 register const Py_UNICODE *buf = self->str; 5976 5977 for (i = j = len - 1; i >= 0; ) { 5978 if (buf[i] == ch) { 5979 if (maxcount-- <= 0) 5980 break; 
5981 SPLIT_APPEND(buf, i + 1, j + 1); 5982 j = i = i - 1; 5983 } else 5984 i--; 5985 } 5986 if (j >= -1) { 5987 SPLIT_APPEND(buf, 0, j + 1); 5988 } 5989 if (PyList_Reverse(list) < 0) 5990 goto onError; 5991 return list; 5992 5993 onError: 5994 Py_DECREF(list); 5995 return NULL; 5996} 5997 5998static 5999PyObject *rsplit_substring(PyUnicodeObject *self, 6000 PyObject *list, 6001 PyUnicodeObject *substring, 6002 Py_ssize_t maxcount) 6003{ 6004 register Py_ssize_t i; 6005 register Py_ssize_t j; 6006 Py_ssize_t len = self->length; 6007 Py_ssize_t sublen = substring->length; 6008 PyObject *str; 6009 6010 for (i = len - sublen, j = len; i >= 0; ) { 6011 if (Py_UNICODE_MATCH(self, i, substring)) { 6012 if (maxcount-- <= 0) 6013 break; 6014 SPLIT_APPEND(self->str, i + sublen, j); 6015 j = i; 6016 i -= sublen; 6017 } else 6018 i--; 6019 } 6020 if (j >= 0) { 6021 SPLIT_APPEND(self->str, 0, j); 6022 } 6023 if (PyList_Reverse(list) < 0) 6024 goto onError; 6025 return list; 6026 6027 onError: 6028 Py_DECREF(list); 6029 return NULL; 6030} 6031 6032#undef SPLIT_APPEND 6033 6034static 6035PyObject *split(PyUnicodeObject *self, 6036 PyUnicodeObject *substring, 6037 Py_ssize_t maxcount) 6038{ 6039 PyObject *list; 6040 6041 if (maxcount < 0) 6042 maxcount = PY_SSIZE_T_MAX; 6043 6044 list = PyList_New(0); 6045 if (!list) 6046 return NULL; 6047 6048 if (substring == NULL) 6049 return split_whitespace(self,list,maxcount); 6050 6051 else if (substring->length == 1) 6052 return split_char(self,list,substring->str[0],maxcount); 6053 6054 else if (substring->length == 0) { 6055 Py_DECREF(list); 6056 PyErr_SetString(PyExc_ValueError, "empty separator"); 6057 return NULL; 6058 } 6059 else 6060 return split_substring(self,list,substring,maxcount); 6061} 6062 6063static 6064PyObject *rsplit(PyUnicodeObject *self, 6065 PyUnicodeObject *substring, 6066 Py_ssize_t maxcount) 6067{ 6068 PyObject *list; 6069 6070 if (maxcount < 0) 6071 maxcount = PY_SSIZE_T_MAX; 6072 6073 list = PyList_New(0); 6074 
if (!list) 6075 return NULL; 6076 6077 if (substring == NULL) 6078 return rsplit_whitespace(self,list,maxcount); 6079 6080 else if (substring->length == 1) 6081 return rsplit_char(self,list,substring->str[0],maxcount); 6082 6083 else if (substring->length == 0) { 6084 Py_DECREF(list); 6085 PyErr_SetString(PyExc_ValueError, "empty separator"); 6086 return NULL; 6087 } 6088 else 6089 return rsplit_substring(self,list,substring,maxcount); 6090} 6091 6092static 6093PyObject *replace(PyUnicodeObject *self, 6094 PyUnicodeObject *str1, 6095 PyUnicodeObject *str2, 6096 Py_ssize_t maxcount) 6097{ 6098 PyUnicodeObject *u; 6099 6100 if (maxcount < 0) 6101 maxcount = PY_SSIZE_T_MAX; 6102 6103 if (str1->length == str2->length) { 6104 /* same length */ 6105 Py_ssize_t i; 6106 if (str1->length == 1) { 6107 /* replace characters */ 6108 Py_UNICODE u1, u2; 6109 if (!findchar(self->str, self->length, str1->str[0])) 6110 goto nothing; 6111 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6112 if (!u) 6113 return NULL; 6114 Py_UNICODE_COPY(u->str, self->str, self->length); 6115 u1 = str1->str[0]; 6116 u2 = str2->str[0]; 6117 for (i = 0; i < u->length; i++) 6118 if (u->str[i] == u1) { 6119 if (--maxcount < 0) 6120 break; 6121 u->str[i] = u2; 6122 } 6123 } else { 6124 i = fastsearch( 6125 self->str, self->length, str1->str, str1->length, FAST_SEARCH 6126 ); 6127 if (i < 0) 6128 goto nothing; 6129 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6130 if (!u) 6131 return NULL; 6132 Py_UNICODE_COPY(u->str, self->str, self->length); 6133 while (i <= self->length - str1->length) 6134 if (Py_UNICODE_MATCH(self, i, str1)) { 6135 if (--maxcount < 0) 6136 break; 6137 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6138 i += str1->length; 6139 } else 6140 i++; 6141 } 6142 } else { 6143 6144 Py_ssize_t n, i, j, e; 6145 Py_ssize_t product, new_size, delta; 6146 Py_UNICODE *p; 6147 6148 /* replace strings */ 6149 n = stringlib_count(self->str, self->length, 
str1->str, str1->length); 6150 if (n > maxcount) 6151 n = maxcount; 6152 if (n == 0) 6153 goto nothing; 6154 /* new_size = self->length + n * (str2->length - str1->length)); */ 6155 delta = (str2->length - str1->length); 6156 if (delta == 0) { 6157 new_size = self->length; 6158 } else { 6159 product = n * (str2->length - str1->length); 6160 if ((product / (str2->length - str1->length)) != n) { 6161 PyErr_SetString(PyExc_OverflowError, 6162 "replace string is too long"); 6163 return NULL; 6164 } 6165 new_size = self->length + product; 6166 if (new_size < 0) { 6167 PyErr_SetString(PyExc_OverflowError, 6168 "replace string is too long"); 6169 return NULL; 6170 } 6171 } 6172 u = _PyUnicode_New(new_size); 6173 if (!u) 6174 return NULL; 6175 i = 0; 6176 p = u->str; 6177 e = self->length - str1->length; 6178 if (str1->length > 0) { 6179 while (n-- > 0) { 6180 /* look for next match */ 6181 j = i; 6182 while (j <= e) { 6183 if (Py_UNICODE_MATCH(self, j, str1)) 6184 break; 6185 j++; 6186 } 6187 if (j > i) { 6188 if (j > e) 6189 break; 6190 /* copy unchanged part [i:j] */ 6191 Py_UNICODE_COPY(p, self->str+i, j-i); 6192 p += j - i; 6193 } 6194 /* copy substitution string */ 6195 if (str2->length > 0) { 6196 Py_UNICODE_COPY(p, str2->str, str2->length); 6197 p += str2->length; 6198 } 6199 i = j + str1->length; 6200 } 6201 if (i < self->length) 6202 /* copy tail [i:] */ 6203 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6204 } else { 6205 /* interleave */ 6206 while (n > 0) { 6207 Py_UNICODE_COPY(p, str2->str, str2->length); 6208 p += str2->length; 6209 if (--n <= 0) 6210 break; 6211 *p++ = self->str[i++]; 6212 } 6213 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6214 } 6215 } 6216 return (PyObject *) u; 6217 6218nothing: 6219 /* nothing to replace; return original string (when possible) */ 6220 if (PyUnicode_CheckExact(self)) { 6221 Py_INCREF(self); 6222 return (PyObject *) self; 6223 } 6224 return PyUnicode_FromUnicode(self->str, self->length); 6225} 6226 6227/* --- 
Unicode Object Methods --------------------------------------------- */ 6228 6229PyDoc_STRVAR(title__doc__, 6230"S.title() -> str\n\ 6231\n\ 6232Return a titlecased version of S, i.e. words start with title case\n\ 6233characters, all remaining cased characters have lower case."); 6234 6235static PyObject* 6236unicode_title(PyUnicodeObject *self) 6237{ 6238 return fixup(self, fixtitle); 6239} 6240 6241PyDoc_STRVAR(capitalize__doc__, 6242"S.capitalize() -> str\n\ 6243\n\ 6244Return a capitalized version of S, i.e. make the first character\n\ 6245have upper case."); 6246 6247static PyObject* 6248unicode_capitalize(PyUnicodeObject *self) 6249{ 6250 return fixup(self, fixcapitalize); 6251} 6252 6253#if 0 6254PyDoc_STRVAR(capwords__doc__, 6255"S.capwords() -> str\n\ 6256\n\ 6257Apply .capitalize() to all words in S and return the result with\n\ 6258normalized whitespace (all whitespace strings are replaced by ' ')."); 6259 6260static PyObject* 6261unicode_capwords(PyUnicodeObject *self) 6262{ 6263 PyObject *list; 6264 PyObject *item; 6265 Py_ssize_t i; 6266 6267 /* Split into words */ 6268 list = split(self, NULL, -1); 6269 if (!list) 6270 return NULL; 6271 6272 /* Capitalize each word */ 6273 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6274 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6275 fixcapitalize); 6276 if (item == NULL) 6277 goto onError; 6278 Py_DECREF(PyList_GET_ITEM(list, i)); 6279 PyList_SET_ITEM(list, i, item); 6280 } 6281 6282 /* Join the words to form a new string */ 6283 item = PyUnicode_Join(NULL, list); 6284 6285onError: 6286 Py_DECREF(list); 6287 return (PyObject *)item; 6288} 6289#endif 6290 6291/* Argument converter. 
Coerces to a single unicode character */ 6292 6293static int 6294convert_uc(PyObject *obj, void *addr) 6295{ 6296 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6297 PyObject *uniobj; 6298 Py_UNICODE *unistr; 6299 6300 uniobj = PyUnicode_FromObject(obj); 6301 if (uniobj == NULL) { 6302 PyErr_SetString(PyExc_TypeError, 6303 "The fill character cannot be converted to Unicode"); 6304 return 0; 6305 } 6306 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6307 PyErr_SetString(PyExc_TypeError, 6308 "The fill character must be exactly one character long"); 6309 Py_DECREF(uniobj); 6310 return 0; 6311 } 6312 unistr = PyUnicode_AS_UNICODE(uniobj); 6313 *fillcharloc = unistr[0]; 6314 Py_DECREF(uniobj); 6315 return 1; 6316} 6317 6318PyDoc_STRVAR(center__doc__, 6319"S.center(width[, fillchar]) -> str\n\ 6320\n\ 6321Return S centered in a string of length width. Padding is\n\ 6322done using the specified fill character (default is a space)"); 6323 6324static PyObject * 6325unicode_center(PyUnicodeObject *self, PyObject *args) 6326{ 6327 Py_ssize_t marg, left; 6328 Py_ssize_t width; 6329 Py_UNICODE fillchar = ' '; 6330 6331 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6332 return NULL; 6333 6334 if (self->length >= width && PyUnicode_CheckExact(self)) { 6335 Py_INCREF(self); 6336 return (PyObject*) self; 6337 } 6338 6339 marg = width - self->length; 6340 left = marg / 2 + (marg & width & 1); 6341 6342 return (PyObject*) pad(self, left, marg - left, fillchar); 6343} 6344 6345#if 0 6346 6347/* This code should go into some future Unicode collation support 6348 module. The basic comparison should compare ordinals on a naive 6349 basis (this is what Java does and thus JPython too). 
*/ 6350 6351/* speedy UTF-16 code point order comparison */ 6352/* gleaned from: */ 6353/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6354 6355static short utf16Fixup[32] = 6356{ 6357 0, 0, 0, 0, 0, 0, 0, 0, 6358 0, 0, 0, 0, 0, 0, 0, 0, 6359 0, 0, 0, 0, 0, 0, 0, 0, 6360 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6361}; 6362 6363static int 6364unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6365{ 6366 Py_ssize_t len1, len2; 6367 6368 Py_UNICODE *s1 = str1->str; 6369 Py_UNICODE *s2 = str2->str; 6370 6371 len1 = str1->length; 6372 len2 = str2->length; 6373 6374 while (len1 > 0 && len2 > 0) { 6375 Py_UNICODE c1, c2; 6376 6377 c1 = *s1++; 6378 c2 = *s2++; 6379 6380 if (c1 > (1<<11) * 26) 6381 c1 += utf16Fixup[c1>>11]; 6382 if (c2 > (1<<11) * 26) 6383 c2 += utf16Fixup[c2>>11]; 6384 /* now c1 and c2 are in UTF-32-compatible order */ 6385 6386 if (c1 != c2) 6387 return (c1 < c2) ? -1 : 1; 6388 6389 len1--; len2--; 6390 } 6391 6392 return (len1 < len2) ? -1 : (len1 != len2); 6393} 6394 6395#else 6396 6397static int 6398unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6399{ 6400 register Py_ssize_t len1, len2; 6401 6402 Py_UNICODE *s1 = str1->str; 6403 Py_UNICODE *s2 = str2->str; 6404 6405 len1 = str1->length; 6406 len2 = str2->length; 6407 6408 while (len1 > 0 && len2 > 0) { 6409 Py_UNICODE c1, c2; 6410 6411 c1 = *s1++; 6412 c2 = *s2++; 6413 6414 if (c1 != c2) 6415 return (c1 < c2) ? -1 : 1; 6416 6417 len1--; len2--; 6418 } 6419 6420 return (len1 < len2) ? 
-1 : (len1 != len2); 6421} 6422 6423#endif 6424 6425int PyUnicode_Compare(PyObject *left, 6426 PyObject *right) 6427{ 6428 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6429 return unicode_compare((PyUnicodeObject *)left, 6430 (PyUnicodeObject *)right); 6431 PyErr_Format(PyExc_TypeError, 6432 "Can't compare %.100s and %.100s", 6433 left->ob_type->tp_name, 6434 right->ob_type->tp_name); 6435 return -1; 6436} 6437 6438int 6439PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6440{ 6441 int i; 6442 Py_UNICODE *id; 6443 assert(PyUnicode_Check(uni)); 6444 id = PyUnicode_AS_UNICODE(uni); 6445 /* Compare Unicode string and source character set string */ 6446 for (i = 0; id[i] && str[i]; i++) 6447 if (id[i] != str[i]) 6448 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6449 if (id[i]) 6450 return 1; /* uni is longer */ 6451 if (str[i]) 6452 return -1; /* str is longer */ 6453 return 0; 6454} 6455 6456PyObject *PyUnicode_RichCompare(PyObject *left, 6457 PyObject *right, 6458 int op) 6459{ 6460 int result; 6461 6462 result = PyUnicode_Compare(left, right); 6463 if (result == -1 && PyErr_Occurred()) 6464 goto onError; 6465 6466 /* Convert the return value to a Boolean */ 6467 switch (op) { 6468 case Py_EQ: 6469 result = (result == 0); 6470 break; 6471 case Py_NE: 6472 result = (result != 0); 6473 break; 6474 case Py_LE: 6475 result = (result <= 0); 6476 break; 6477 case Py_GE: 6478 result = (result >= 0); 6479 break; 6480 case Py_LT: 6481 result = (result == -1); 6482 break; 6483 case Py_GT: 6484 result = (result == 1); 6485 break; 6486 } 6487 return PyBool_FromLong(result); 6488 6489 onError: 6490 6491 /* Standard case 6492 6493 Type errors mean that PyUnicode_FromObject() could not convert 6494 one of the arguments (usually the right hand side) to Unicode, 6495 ie. we can't handle the comparison request. 
However, it is 6496 possible that the other object knows a comparison method, which 6497 is why we return Py_NotImplemented to give the other object a 6498 chance. 6499 6500 */ 6501 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6502 PyErr_Clear(); 6503 Py_INCREF(Py_NotImplemented); 6504 return Py_NotImplemented; 6505 } 6506 if (op != Py_EQ && op != Py_NE) 6507 return NULL; 6508 6509 /* Equality comparison. 6510 6511 This is a special case: we silence any PyExc_UnicodeDecodeError 6512 and instead turn it into a PyErr_UnicodeWarning. 6513 6514 */ 6515 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6516 return NULL; 6517 PyErr_Clear(); 6518 if (PyErr_WarnEx(PyExc_UnicodeWarning, 6519 (op == Py_EQ) ? 6520 "equal comparison " 6521 "failed to convert both arguments to str - " 6522 "interpreting them as being unequal" 6523 : 6524 "Unicode unequal comparison " 6525 "failed to convert both arguments to str - " 6526 "interpreting them as being unequal", 6527 1) < 0) 6528 return NULL; 6529 result = (op == Py_NE); 6530 return PyBool_FromLong(result); 6531} 6532 6533int PyUnicode_Contains(PyObject *container, 6534 PyObject *element) 6535{ 6536 PyObject *str, *sub; 6537 int result; 6538 6539 /* Coerce the two arguments */ 6540 sub = PyUnicode_FromObject(element); 6541 if (!sub) { 6542 PyErr_Format(PyExc_TypeError, 6543 "'in <string>' requires string as left operand, not %s", 6544 element->ob_type->tp_name); 6545 return -1; 6546 } 6547 6548 str = PyUnicode_FromObject(container); 6549 if (!str) { 6550 Py_DECREF(sub); 6551 return -1; 6552 } 6553 6554 result = stringlib_contains_obj(str, sub); 6555 6556 Py_DECREF(str); 6557 Py_DECREF(sub); 6558 6559 return result; 6560} 6561 6562/* Concat to string or Unicode object giving a new Unicode object. 
*/ 6563 6564PyObject *PyUnicode_Concat(PyObject *left, 6565 PyObject *right) 6566{ 6567 PyUnicodeObject *u = NULL, *v = NULL, *w; 6568 6569 /* Coerce the two arguments */ 6570 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6571 if (u == NULL) 6572 goto onError; 6573 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6574 if (v == NULL) 6575 goto onError; 6576 6577 /* Shortcuts */ 6578 if (v == unicode_empty) { 6579 Py_DECREF(v); 6580 return (PyObject *)u; 6581 } 6582 if (u == unicode_empty) { 6583 Py_DECREF(u); 6584 return (PyObject *)v; 6585 } 6586 6587 /* Concat the two Unicode strings */ 6588 w = _PyUnicode_New(u->length + v->length); 6589 if (w == NULL) 6590 goto onError; 6591 Py_UNICODE_COPY(w->str, u->str, u->length); 6592 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6593 6594 Py_DECREF(u); 6595 Py_DECREF(v); 6596 return (PyObject *)w; 6597 6598onError: 6599 Py_XDECREF(u); 6600 Py_XDECREF(v); 6601 return NULL; 6602} 6603 6604void 6605PyUnicode_Append(PyObject **pleft, PyObject *right) 6606{ 6607 PyObject *new; 6608 if (*pleft == NULL) 6609 return; 6610 if (right == NULL || !PyUnicode_Check(*pleft)) { 6611 Py_DECREF(*pleft); 6612 *pleft = NULL; 6613 return; 6614 } 6615 new = PyUnicode_Concat(*pleft, right); 6616 Py_DECREF(*pleft); 6617 *pleft = new; 6618} 6619 6620void 6621PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6622{ 6623 PyUnicode_Append(pleft, right); 6624 Py_XDECREF(right); 6625} 6626 6627PyDoc_STRVAR(count__doc__, 6628"S.count(sub[, start[, end]]) -> int\n\ 6629\n\ 6630Return the number of non-overlapping occurrences of substring sub in\n\ 6631string S[start:end]. 
Optional arguments start and end are\n\ 6632interpreted as in slice notation."); 6633 6634static PyObject * 6635unicode_count(PyUnicodeObject *self, PyObject *args) 6636{ 6637 PyUnicodeObject *substring; 6638 Py_ssize_t start = 0; 6639 Py_ssize_t end = PY_SSIZE_T_MAX; 6640 PyObject *result; 6641 6642 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6643 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6644 return NULL; 6645 6646 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6647 (PyObject *)substring); 6648 if (substring == NULL) 6649 return NULL; 6650 6651 FIX_START_END(self); 6652 6653 result = PyLong_FromSsize_t( 6654 stringlib_count(self->str + start, end - start, 6655 substring->str, substring->length) 6656 ); 6657 6658 Py_DECREF(substring); 6659 6660 return result; 6661} 6662 6663PyDoc_STRVAR(encode__doc__, 6664"S.encode([encoding[, errors]]) -> bytes\n\ 6665\n\ 6666Encode S using the codec registered for encoding. encoding defaults\n\ 6667to the default encoding. errors may be given to set a different error\n\ 6668handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6669a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 6670'xmlcharrefreplace' as well as any other name registered with\n\ 6671codecs.register_error that can handle UnicodeEncodeErrors."); 6672 6673static PyObject * 6674unicode_encode(PyUnicodeObject *self, PyObject *args) 6675{ 6676 char *encoding = NULL; 6677 char *errors = NULL; 6678 PyObject *v; 6679 6680 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6681 return NULL; 6682 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 6683 if (v == NULL) 6684 goto onError; 6685 if (!PyBytes_Check(v)) { 6686 PyErr_Format(PyExc_TypeError, 6687 "encoder did not return a bytes object " 6688 "(type=%.400s)", 6689 Py_TYPE(v)->tp_name); 6690 Py_DECREF(v); 6691 return NULL; 6692 } 6693 return v; 6694 6695 onError: 6696 return NULL; 6697} 6698 6699PyDoc_STRVAR(expandtabs__doc__, 6700"S.expandtabs([tabsize]) -> str\n\ 6701\n\ 6702Return a copy of S where all tab characters are expanded using spaces.\n\ 6703If tabsize is not given, a tab size of 8 characters is assumed."); 6704 6705static PyObject* 6706unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6707{ 6708 Py_UNICODE *e; 6709 Py_UNICODE *p; 6710 Py_UNICODE *q; 6711 Py_UNICODE *qe; 6712 Py_ssize_t i, j, incr; 6713 PyUnicodeObject *u; 6714 int tabsize = 8; 6715 6716 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6717 return NULL; 6718 6719 /* First pass: determine size of output string */ 6720 i = 0; /* chars up to and including most recent \n or \r */ 6721 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6722 e = self->str + self->length; /* end of input */ 6723 for (p = self->str; p < e; p++) 6724 if (*p == '\t') { 6725 if (tabsize > 0) { 6726 incr = tabsize - (j % tabsize); /* cannot overflow */ 6727 if (j > PY_SSIZE_T_MAX - incr) 6728 goto overflow1; 6729 j += incr; 6730 } 6731 } 6732 else { 6733 if (j > PY_SSIZE_T_MAX - 1) 6734 goto overflow1; 6735 j++; 6736 if (*p == '\n' || *p == '\r') { 6737 if (i > 
PY_SSIZE_T_MAX - j) 6738 goto overflow1; 6739 i += j; 6740 j = 0; 6741 } 6742 } 6743 6744 if (i > PY_SSIZE_T_MAX - j) 6745 goto overflow1; 6746 6747 /* Second pass: create output string and fill it */ 6748 u = _PyUnicode_New(i + j); 6749 if (!u) 6750 return NULL; 6751 6752 j = 0; /* same as in first pass */ 6753 q = u->str; /* next output char */ 6754 qe = u->str + u->length; /* end of output */ 6755 6756 for (p = self->str; p < e; p++) 6757 if (*p == '\t') { 6758 if (tabsize > 0) { 6759 i = tabsize - (j % tabsize); 6760 j += i; 6761 while (i--) { 6762 if (q >= qe) 6763 goto overflow2; 6764 *q++ = ' '; 6765 } 6766 } 6767 } 6768 else { 6769 if (q >= qe) 6770 goto overflow2; 6771 *q++ = *p; 6772 j++; 6773 if (*p == '\n' || *p == '\r') 6774 j = 0; 6775 } 6776 6777 return (PyObject*) u; 6778 6779 overflow2: 6780 Py_DECREF(u); 6781 overflow1: 6782 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6783 return NULL; 6784} 6785 6786PyDoc_STRVAR(find__doc__, 6787"S.find(sub[, start[, end]]) -> int\n\ 6788\n\ 6789Return the lowest index in S where substring sub is found,\n\ 6790such that sub is contained within s[start:end]. 
Optional\n\ 6791arguments start and end are interpreted as in slice notation.\n\ 6792\n\ 6793Return -1 on failure."); 6794 6795static PyObject * 6796unicode_find(PyUnicodeObject *self, PyObject *args) 6797{ 6798 PyObject *substring; 6799 Py_ssize_t start; 6800 Py_ssize_t end; 6801 Py_ssize_t result; 6802 6803 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6804 return NULL; 6805 6806 result = stringlib_find_slice( 6807 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6808 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6809 start, end 6810 ); 6811 6812 Py_DECREF(substring); 6813 6814 return PyLong_FromSsize_t(result); 6815} 6816 6817static PyObject * 6818unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6819{ 6820 if (index < 0 || index >= self->length) { 6821 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6822 return NULL; 6823 } 6824 6825 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6826} 6827 6828/* Believe it or not, this produces the same value for ASCII strings 6829 as string_hash(). 
*/ 6830static long 6831unicode_hash(PyUnicodeObject *self) 6832{ 6833 Py_ssize_t len; 6834 Py_UNICODE *p; 6835 long x; 6836 6837 if (self->hash != -1) 6838 return self->hash; 6839 len = Py_SIZE(self); 6840 p = self->str; 6841 x = *p << 7; 6842 while (--len >= 0) 6843 x = (1000003*x) ^ *p++; 6844 x ^= Py_SIZE(self); 6845 if (x == -1) 6846 x = -2; 6847 self->hash = x; 6848 return x; 6849} 6850 6851PyDoc_STRVAR(index__doc__, 6852"S.index(sub[, start[, end]]) -> int\n\ 6853\n\ 6854Like S.find() but raise ValueError when the substring is not found."); 6855 6856static PyObject * 6857unicode_index(PyUnicodeObject *self, PyObject *args) 6858{ 6859 Py_ssize_t result; 6860 PyObject *substring; 6861 Py_ssize_t start; 6862 Py_ssize_t end; 6863 6864 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6865 return NULL; 6866 6867 result = stringlib_find_slice( 6868 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6869 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6870 start, end 6871 ); 6872 6873 Py_DECREF(substring); 6874 6875 if (result < 0) { 6876 PyErr_SetString(PyExc_ValueError, "substring not found"); 6877 return NULL; 6878 } 6879 6880 return PyLong_FromSsize_t(result); 6881} 6882 6883PyDoc_STRVAR(islower__doc__, 6884"S.islower() -> bool\n\ 6885\n\ 6886Return True if all cased characters in S are lowercase and there is\n\ 6887at least one cased character in S, False otherwise."); 6888 6889static PyObject* 6890unicode_islower(PyUnicodeObject *self) 6891{ 6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6893 register const Py_UNICODE *e; 6894 int cased; 6895 6896 /* Shortcut for single character strings */ 6897 if (PyUnicode_GET_SIZE(self) == 1) 6898 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6899 6900 /* Special case for empty strings */ 6901 if (PyUnicode_GET_SIZE(self) == 0) 6902 return PyBool_FromLong(0); 6903 6904 e = p + PyUnicode_GET_SIZE(self); 6905 cased = 0; 6906 for (; p < e; p++) { 6907 register const Py_UNICODE ch = 
*p; 6908 6909 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6910 return PyBool_FromLong(0); 6911 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6912 cased = 1; 6913 } 6914 return PyBool_FromLong(cased); 6915} 6916 6917PyDoc_STRVAR(isupper__doc__, 6918"S.isupper() -> bool\n\ 6919\n\ 6920Return True if all cased characters in S are uppercase and there is\n\ 6921at least one cased character in S, False otherwise."); 6922 6923static PyObject* 6924unicode_isupper(PyUnicodeObject *self) 6925{ 6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6927 register const Py_UNICODE *e; 6928 int cased; 6929 6930 /* Shortcut for single character strings */ 6931 if (PyUnicode_GET_SIZE(self) == 1) 6932 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6933 6934 /* Special case for empty strings */ 6935 if (PyUnicode_GET_SIZE(self) == 0) 6936 return PyBool_FromLong(0); 6937 6938 e = p + PyUnicode_GET_SIZE(self); 6939 cased = 0; 6940 for (; p < e; p++) { 6941 register const Py_UNICODE ch = *p; 6942 6943 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6944 return PyBool_FromLong(0); 6945 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6946 cased = 1; 6947 } 6948 return PyBool_FromLong(cased); 6949} 6950 6951PyDoc_STRVAR(istitle__doc__, 6952"S.istitle() -> bool\n\ 6953\n\ 6954Return True if S is a titlecased string and there is at least one\n\ 6955character in S, i.e. 
upper- and titlecase characters may only\n\ 6956follow uncased characters and lowercase characters only cased ones.\n\ 6957Return False otherwise."); 6958 6959static PyObject* 6960unicode_istitle(PyUnicodeObject *self) 6961{ 6962 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6963 register const Py_UNICODE *e; 6964 int cased, previous_is_cased; 6965 6966 /* Shortcut for single character strings */ 6967 if (PyUnicode_GET_SIZE(self) == 1) 6968 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6969 (Py_UNICODE_ISUPPER(*p) != 0)); 6970 6971 /* Special case for empty strings */ 6972 if (PyUnicode_GET_SIZE(self) == 0) 6973 return PyBool_FromLong(0); 6974 6975 e = p + PyUnicode_GET_SIZE(self); 6976 cased = 0; 6977 previous_is_cased = 0; 6978 for (; p < e; p++) { 6979 register const Py_UNICODE ch = *p; 6980 6981 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6982 if (previous_is_cased) 6983 return PyBool_FromLong(0); 6984 previous_is_cased = 1; 6985 cased = 1; 6986 } 6987 else if (Py_UNICODE_ISLOWER(ch)) { 6988 if (!previous_is_cased) 6989 return PyBool_FromLong(0); 6990 previous_is_cased = 1; 6991 cased = 1; 6992 } 6993 else 6994 previous_is_cased = 0; 6995 } 6996 return PyBool_FromLong(cased); 6997} 6998 6999PyDoc_STRVAR(isspace__doc__, 7000"S.isspace() -> bool\n\ 7001\n\ 7002Return True if all characters in S are whitespace\n\ 7003and there is at least one character in S, False otherwise."); 7004 7005static PyObject* 7006unicode_isspace(PyUnicodeObject *self) 7007{ 7008 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7009 register const Py_UNICODE *e; 7010 7011 /* Shortcut for single character strings */ 7012 if (PyUnicode_GET_SIZE(self) == 1 && 7013 Py_UNICODE_ISSPACE(*p)) 7014 return PyBool_FromLong(1); 7015 7016 /* Special case for empty strings */ 7017 if (PyUnicode_GET_SIZE(self) == 0) 7018 return PyBool_FromLong(0); 7019 7020 e = p + PyUnicode_GET_SIZE(self); 7021 for (; p < e; p++) { 7022 if (!Py_UNICODE_ISSPACE(*p)) 7023 
return PyBool_FromLong(0); 7024 } 7025 return PyBool_FromLong(1); 7026} 7027 7028PyDoc_STRVAR(isalpha__doc__, 7029"S.isalpha() -> bool\n\ 7030\n\ 7031Return True if all characters in S are alphabetic\n\ 7032and there is at least one character in S, False otherwise."); 7033 7034static PyObject* 7035unicode_isalpha(PyUnicodeObject *self) 7036{ 7037 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7038 register const Py_UNICODE *e; 7039 7040 /* Shortcut for single character strings */ 7041 if (PyUnicode_GET_SIZE(self) == 1 && 7042 Py_UNICODE_ISALPHA(*p)) 7043 return PyBool_FromLong(1); 7044 7045 /* Special case for empty strings */ 7046 if (PyUnicode_GET_SIZE(self) == 0) 7047 return PyBool_FromLong(0); 7048 7049 e = p + PyUnicode_GET_SIZE(self); 7050 for (; p < e; p++) { 7051 if (!Py_UNICODE_ISALPHA(*p)) 7052 return PyBool_FromLong(0); 7053 } 7054 return PyBool_FromLong(1); 7055} 7056 7057PyDoc_STRVAR(isalnum__doc__, 7058"S.isalnum() -> bool\n\ 7059\n\ 7060Return True if all characters in S are alphanumeric\n\ 7061and there is at least one character in S, False otherwise."); 7062 7063static PyObject* 7064unicode_isalnum(PyUnicodeObject *self) 7065{ 7066 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7067 register const Py_UNICODE *e; 7068 7069 /* Shortcut for single character strings */ 7070 if (PyUnicode_GET_SIZE(self) == 1 && 7071 Py_UNICODE_ISALNUM(*p)) 7072 return PyBool_FromLong(1); 7073 7074 /* Special case for empty strings */ 7075 if (PyUnicode_GET_SIZE(self) == 0) 7076 return PyBool_FromLong(0); 7077 7078 e = p + PyUnicode_GET_SIZE(self); 7079 for (; p < e; p++) { 7080 if (!Py_UNICODE_ISALNUM(*p)) 7081 return PyBool_FromLong(0); 7082 } 7083 return PyBool_FromLong(1); 7084} 7085 7086PyDoc_STRVAR(isdecimal__doc__, 7087"S.isdecimal() -> bool\n\ 7088\n\ 7089Return True if there are only decimal characters in S,\n\ 7090False otherwise."); 7091 7092static PyObject* 7093unicode_isdecimal(PyUnicodeObject *self) 7094{ 7095 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7096 register const Py_UNICODE *e; 7097 7098 /* Shortcut for single character strings */ 7099 if (PyUnicode_GET_SIZE(self) == 1 && 7100 Py_UNICODE_ISDECIMAL(*p)) 7101 return PyBool_FromLong(1); 7102 7103 /* Special case for empty strings */ 7104 if (PyUnicode_GET_SIZE(self) == 0) 7105 return PyBool_FromLong(0); 7106 7107 e = p + PyUnicode_GET_SIZE(self); 7108 for (; p < e; p++) { 7109 if (!Py_UNICODE_ISDECIMAL(*p)) 7110 return PyBool_FromLong(0); 7111 } 7112 return PyBool_FromLong(1); 7113} 7114 7115PyDoc_STRVAR(isdigit__doc__, 7116"S.isdigit() -> bool\n\ 7117\n\ 7118Return True if all characters in S are digits\n\ 7119and there is at least one character in S, False otherwise."); 7120 7121static PyObject* 7122unicode_isdigit(PyUnicodeObject *self) 7123{ 7124 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7125 register const Py_UNICODE *e; 7126 7127 /* Shortcut for single character strings */ 7128 if (PyUnicode_GET_SIZE(self) == 1 && 7129 Py_UNICODE_ISDIGIT(*p)) 7130 return PyBool_FromLong(1); 7131 7132 /* Special case for empty strings */ 7133 if (PyUnicode_GET_SIZE(self) == 0) 7134 return PyBool_FromLong(0); 7135 7136 e = p + PyUnicode_GET_SIZE(self); 7137 for (; p < e; p++) { 7138 if (!Py_UNICODE_ISDIGIT(*p)) 7139 return PyBool_FromLong(0); 7140 } 7141 return PyBool_FromLong(1); 7142} 7143 7144PyDoc_STRVAR(isnumeric__doc__, 7145"S.isnumeric() -> bool\n\ 7146\n\ 7147Return True if there are only numeric characters in S,\n\ 7148False otherwise."); 7149 7150static PyObject* 7151unicode_isnumeric(PyUnicodeObject *self) 7152{ 7153 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7154 register const Py_UNICODE *e; 7155 7156 /* Shortcut for single character strings */ 7157 if (PyUnicode_GET_SIZE(self) == 1 && 7158 Py_UNICODE_ISNUMERIC(*p)) 7159 return PyBool_FromLong(1); 7160 7161 /* Special case for empty strings */ 7162 if (PyUnicode_GET_SIZE(self) == 0) 7163 return PyBool_FromLong(0); 7164 7165 e = p + 
PyUnicode_GET_SIZE(self); 7166 for (; p < e; p++) { 7167 if (!Py_UNICODE_ISNUMERIC(*p)) 7168 return PyBool_FromLong(0); 7169 } 7170 return PyBool_FromLong(1); 7171} 7172 7173int 7174PyUnicode_IsIdentifier(PyObject *self) 7175{ 7176 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7177 register const Py_UNICODE *e; 7178 7179 /* Special case for empty strings */ 7180 if (PyUnicode_GET_SIZE(self) == 0) 7181 return 0; 7182 7183 /* PEP 3131 says that the first character must be in 7184 XID_Start and subsequent characters in XID_Continue, 7185 and for the ASCII range, the 2.x rules apply (i.e 7186 start with letters and underscore, continue with 7187 letters, digits, underscore). However, given the current 7188 definition of XID_Start and XID_Continue, it is sufficient 7189 to check just for these, except that _ must be allowed 7190 as starting an identifier. */ 7191 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7192 return 0; 7193 7194 e = p + PyUnicode_GET_SIZE(self); 7195 for (p++; p < e; p++) { 7196 if (!_PyUnicode_IsXidContinue(*p)) 7197 return 0; 7198 } 7199 return 1; 7200} 7201 7202PyDoc_STRVAR(isidentifier__doc__, 7203"S.isidentifier() -> bool\n\ 7204\n\ 7205Return True if S is a valid identifier according\n\ 7206to the language definition."); 7207 7208static PyObject* 7209unicode_isidentifier(PyObject *self) 7210{ 7211 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7212} 7213 7214PyDoc_STRVAR(isprintable__doc__, 7215"S.isprintable() -> bool\n\ 7216\n\ 7217Return True if all characters in S are considered\n\ 7218printable in repr() or S is empty, False otherwise."); 7219 7220static PyObject* 7221unicode_isprintable(PyObject *self) 7222{ 7223 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7224 register const Py_UNICODE *e; 7225 7226 /* Shortcut for single character strings */ 7227 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 7228 Py_RETURN_TRUE; 7229 } 7230 7231 e = p + 
PyUnicode_GET_SIZE(self); 7232 for (; p < e; p++) { 7233 if (!Py_UNICODE_ISPRINTABLE(*p)) { 7234 Py_RETURN_FALSE; 7235 } 7236 } 7237 Py_RETURN_TRUE; 7238} 7239 7240PyDoc_STRVAR(join__doc__, 7241"S.join(sequence) -> str\n\ 7242\n\ 7243Return a string which is the concatenation of the strings in the\n\ 7244sequence. The separator between elements is S."); 7245 7246static PyObject* 7247unicode_join(PyObject *self, PyObject *data) 7248{ 7249 return PyUnicode_Join(self, data); 7250} 7251 7252static Py_ssize_t 7253unicode_length(PyUnicodeObject *self) 7254{ 7255 return self->length; 7256} 7257 7258PyDoc_STRVAR(ljust__doc__, 7259"S.ljust(width[, fillchar]) -> str\n\ 7260\n\ 7261Return S left justified in a Unicode string of length width. Padding is\n\ 7262done using the specified fill character (default is a space)."); 7263 7264static PyObject * 7265unicode_ljust(PyUnicodeObject *self, PyObject *args) 7266{ 7267 Py_ssize_t width; 7268 Py_UNICODE fillchar = ' '; 7269 7270 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7271 return NULL; 7272 7273 if (self->length >= width && PyUnicode_CheckExact(self)) { 7274 Py_INCREF(self); 7275 return (PyObject*) self; 7276 } 7277 7278 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7279} 7280 7281PyDoc_STRVAR(lower__doc__, 7282"S.lower() -> str\n\ 7283\n\ 7284Return a copy of the string S converted to lowercase."); 7285 7286static PyObject* 7287unicode_lower(PyUnicodeObject *self) 7288{ 7289 return fixup(self, fixlower); 7290} 7291 7292#define LEFTSTRIP 0 7293#define RIGHTSTRIP 1 7294#define BOTHSTRIP 2 7295 7296/* Arrays indexed by above */ 7297static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7298 7299#define STRIPNAME(i) (stripformat[i]+3) 7300 7301/* externally visible for str.strip(unicode) */ 7302PyObject * 7303_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7304{ 7305 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7306 Py_ssize_t len = 
PyUnicode_GET_SIZE(self); 7307 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7308 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7309 Py_ssize_t i, j; 7310 7311 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7312 7313 i = 0; 7314 if (striptype != RIGHTSTRIP) { 7315 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7316 i++; 7317 } 7318 } 7319 7320 j = len; 7321 if (striptype != LEFTSTRIP) { 7322 do { 7323 j--; 7324 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7325 j++; 7326 } 7327 7328 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7329 Py_INCREF(self); 7330 return (PyObject*)self; 7331 } 7332 else 7333 return PyUnicode_FromUnicode(s+i, j-i); 7334} 7335 7336 7337static PyObject * 7338do_strip(PyUnicodeObject *self, int striptype) 7339{ 7340 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7341 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7342 7343 i = 0; 7344 if (striptype != RIGHTSTRIP) { 7345 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7346 i++; 7347 } 7348 } 7349 7350 j = len; 7351 if (striptype != LEFTSTRIP) { 7352 do { 7353 j--; 7354 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7355 j++; 7356 } 7357 7358 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7359 Py_INCREF(self); 7360 return (PyObject*)self; 7361 } 7362 else 7363 return PyUnicode_FromUnicode(s+i, j-i); 7364} 7365 7366 7367static PyObject * 7368do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7369{ 7370 PyObject *sep = NULL; 7371 7372 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7373 return NULL; 7374 7375 if (sep != NULL && sep != Py_None) { 7376 if (PyUnicode_Check(sep)) 7377 return _PyUnicode_XStrip(self, striptype, sep); 7378 else { 7379 PyErr_Format(PyExc_TypeError, 7380 "%s arg must be None or str", 7381 STRIPNAME(striptype)); 7382 return NULL; 7383 } 7384 } 7385 7386 return do_strip(self, striptype); 7387} 7388 7389 7390PyDoc_STRVAR(strip__doc__, 7391"S.strip([chars]) -> str\n\ 7392\n\ 7393Return a 
copy of the string S with leading and trailing\n\ 7394whitespace removed.\n\ 7395If chars is given and not None, remove characters in chars instead."); 7396 7397static PyObject * 7398unicode_strip(PyUnicodeObject *self, PyObject *args) 7399{ 7400 if (PyTuple_GET_SIZE(args) == 0) 7401 return do_strip(self, BOTHSTRIP); /* Common case */ 7402 else 7403 return do_argstrip(self, BOTHSTRIP, args); 7404} 7405 7406 7407PyDoc_STRVAR(lstrip__doc__, 7408"S.lstrip([chars]) -> str\n\ 7409\n\ 7410Return a copy of the string S with leading whitespace removed.\n\ 7411If chars is given and not None, remove characters in chars instead."); 7412 7413static PyObject * 7414unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7415{ 7416 if (PyTuple_GET_SIZE(args) == 0) 7417 return do_strip(self, LEFTSTRIP); /* Common case */ 7418 else 7419 return do_argstrip(self, LEFTSTRIP, args); 7420} 7421 7422 7423PyDoc_STRVAR(rstrip__doc__, 7424"S.rstrip([chars]) -> str\n\ 7425\n\ 7426Return a copy of the string S with trailing whitespace removed.\n\ 7427If chars is given and not None, remove characters in chars instead."); 7428 7429static PyObject * 7430unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7431{ 7432 if (PyTuple_GET_SIZE(args) == 0) 7433 return do_strip(self, RIGHTSTRIP); /* Common case */ 7434 else 7435 return do_argstrip(self, RIGHTSTRIP, args); 7436} 7437 7438 7439static PyObject* 7440unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7441{ 7442 PyUnicodeObject *u; 7443 Py_UNICODE *p; 7444 Py_ssize_t nchars; 7445 size_t nbytes; 7446 7447 if (len < 0) 7448 len = 0; 7449 7450 if (len == 1 && PyUnicode_CheckExact(str)) { 7451 /* no repeat, return original string */ 7452 Py_INCREF(str); 7453 return (PyObject*) str; 7454 } 7455 7456 /* ensure # of chars needed doesn't overflow int and # of bytes 7457 * needed doesn't overflow size_t 7458 */ 7459 nchars = len * str->length; 7460 if (len && nchars / len != str->length) { 7461 PyErr_SetString(PyExc_OverflowError, 7462 "repeated 
string is too long"); 7463 return NULL; 7464 } 7465 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7466 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7467 PyErr_SetString(PyExc_OverflowError, 7468 "repeated string is too long"); 7469 return NULL; 7470 } 7471 u = _PyUnicode_New(nchars); 7472 if (!u) 7473 return NULL; 7474 7475 p = u->str; 7476 7477 if (str->length == 1 && len > 0) { 7478 Py_UNICODE_FILL(p, str->str[0], len); 7479 } else { 7480 Py_ssize_t done = 0; /* number of characters copied this far */ 7481 if (done < nchars) { 7482 Py_UNICODE_COPY(p, str->str, str->length); 7483 done = str->length; 7484 } 7485 while (done < nchars) { 7486 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 7487 Py_UNICODE_COPY(p+done, p, n); 7488 done += n; 7489 } 7490 } 7491 7492 return (PyObject*) u; 7493} 7494 7495PyObject *PyUnicode_Replace(PyObject *obj, 7496 PyObject *subobj, 7497 PyObject *replobj, 7498 Py_ssize_t maxcount) 7499{ 7500 PyObject *self; 7501 PyObject *str1; 7502 PyObject *str2; 7503 PyObject *result; 7504 7505 self = PyUnicode_FromObject(obj); 7506 if (self == NULL) 7507 return NULL; 7508 str1 = PyUnicode_FromObject(subobj); 7509 if (str1 == NULL) { 7510 Py_DECREF(self); 7511 return NULL; 7512 } 7513 str2 = PyUnicode_FromObject(replobj); 7514 if (str2 == NULL) { 7515 Py_DECREF(self); 7516 Py_DECREF(str1); 7517 return NULL; 7518 } 7519 result = replace((PyUnicodeObject *)self, 7520 (PyUnicodeObject *)str1, 7521 (PyUnicodeObject *)str2, 7522 maxcount); 7523 Py_DECREF(self); 7524 Py_DECREF(str1); 7525 Py_DECREF(str2); 7526 return result; 7527} 7528 7529PyDoc_STRVAR(replace__doc__, 7530"S.replace (old, new[, count]) -> str\n\ 7531\n\ 7532Return a copy of S with all occurrences of substring\n\ 7533old replaced by new. 
If the optional argument count is\n\ 7534given, only the first count occurrences are replaced."); 7535 7536static PyObject* 7537unicode_replace(PyUnicodeObject *self, PyObject *args) 7538{ 7539 PyUnicodeObject *str1; 7540 PyUnicodeObject *str2; 7541 Py_ssize_t maxcount = -1; 7542 PyObject *result; 7543 7544 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7545 return NULL; 7546 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7547 if (str1 == NULL) 7548 return NULL; 7549 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7550 if (str2 == NULL) { 7551 Py_DECREF(str1); 7552 return NULL; 7553 } 7554 7555 result = replace(self, str1, str2, maxcount); 7556 7557 Py_DECREF(str1); 7558 Py_DECREF(str2); 7559 return result; 7560} 7561 7562static 7563PyObject *unicode_repr(PyObject *unicode) 7564{ 7565 PyObject *repr; 7566 Py_UNICODE *p; 7567 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7568 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7569 7570 /* XXX(nnorwitz): rather than over-allocating, it would be 7571 better to choose a different scheme. Perhaps scan the 7572 first N-chars of the string and allocate based on that size. 7573 */ 7574 /* Initial allocation is based on the longest-possible unichr 7575 escape. 7576 7577 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7578 unichr, so in this case it's the longest unichr escape. In 7579 narrow (UTF-16) builds this is five chars per source unichr 7580 since there are two unichrs in the surrogate pair, so in narrow 7581 (UTF-16) builds it's not the longest unichr escape. 7582 7583 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7584 so in the narrow (UTF-16) build case it's the longest unichr 7585 escape. 
7586 */ 7587 7588 repr = PyUnicode_FromUnicode(NULL, 7589 2 /* quotes */ 7590#ifdef Py_UNICODE_WIDE 7591 + 10*size 7592#else 7593 + 6*size 7594#endif 7595 + 1); 7596 if (repr == NULL) 7597 return NULL; 7598 7599 p = PyUnicode_AS_UNICODE(repr); 7600 7601 /* Add quote */ 7602 *p++ = (findchar(s, size, '\'') && 7603 !findchar(s, size, '"')) ? '"' : '\''; 7604 while (size-- > 0) { 7605 Py_UNICODE ch = *s++; 7606 7607 /* Escape quotes and backslashes */ 7608 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7609 *p++ = '\\'; 7610 *p++ = ch; 7611 continue; 7612 } 7613 7614 /* Map special whitespace to '\t', \n', '\r' */ 7615 if (ch == '\t') { 7616 *p++ = '\\'; 7617 *p++ = 't'; 7618 } 7619 else if (ch == '\n') { 7620 *p++ = '\\'; 7621 *p++ = 'n'; 7622 } 7623 else if (ch == '\r') { 7624 *p++ = '\\'; 7625 *p++ = 'r'; 7626 } 7627 7628 /* Map non-printable US ASCII to '\xhh' */ 7629 else if (ch < ' ' || ch == 0x7F) { 7630 *p++ = '\\'; 7631 *p++ = 'x'; 7632 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7633 *p++ = hexdigits[ch & 0x000F]; 7634 } 7635 7636 /* Copy ASCII characters as-is */ 7637 else if (ch < 0x7F) { 7638 *p++ = ch; 7639 } 7640 7641 /* Non-ASCII characters */ 7642 else { 7643 Py_UCS4 ucs = ch; 7644 7645#ifndef Py_UNICODE_WIDE 7646 Py_UNICODE ch2 = 0; 7647 /* Get code point from surrogate pair */ 7648 if (size > 0) { 7649 ch2 = *s; 7650 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 7651 && ch2 <= 0xDFFF) { 7652 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 7653 + 0x00010000; 7654 s++; 7655 size--; 7656 } 7657 } 7658#endif 7659 /* Map Unicode whitespace and control characters 7660 (categories Z* and C* except ASCII space) 7661 */ 7662 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 7663 /* Map 8-bit characters to '\xhh' */ 7664 if (ucs <= 0xff) { 7665 *p++ = '\\'; 7666 *p++ = 'x'; 7667 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7668 *p++ = hexdigits[ch & 0x000F]; 7669 } 7670 /* Map 21-bit characters to '\U00xxxxxx' */ 7671 else if (ucs >= 0x10000) { 7672 *p++ = '\\'; 7673 
*p++ = 'U'; 7674 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7675 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7676 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7677 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7678 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7679 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7680 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7681 *p++ = hexdigits[ucs & 0x0000000F]; 7682 } 7683 /* Map 16-bit characters to '\uxxxx' */ 7684 else { 7685 *p++ = '\\'; 7686 *p++ = 'u'; 7687 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 7688 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 7689 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 7690 *p++ = hexdigits[ucs & 0x000F]; 7691 } 7692 } 7693 /* Copy characters as-is */ 7694 else { 7695 *p++ = ch; 7696#ifndef Py_UNICODE_WIDE 7697 if (ucs >= 0x10000) 7698 *p++ = ch2; 7699#endif 7700 } 7701 } 7702 } 7703 /* Add quote */ 7704 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7705 7706 *p = '\0'; 7707 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7708 return repr; 7709} 7710 7711PyDoc_STRVAR(rfind__doc__, 7712"S.rfind(sub[, start[, end]]) -> int\n\ 7713\n\ 7714Return the highest index in S where substring sub is found,\n\ 7715such that sub is contained within s[start:end]. 
Optional\n\ 7716arguments start and end are interpreted as in slice notation.\n\ 7717\n\ 7718Return -1 on failure."); 7719 7720static PyObject * 7721unicode_rfind(PyUnicodeObject *self, PyObject *args) 7722{ 7723 PyObject *substring; 7724 Py_ssize_t start; 7725 Py_ssize_t end; 7726 Py_ssize_t result; 7727 7728 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7729 return NULL; 7730 7731 result = stringlib_rfind_slice( 7732 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7733 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7734 start, end 7735 ); 7736 7737 Py_DECREF(substring); 7738 7739 return PyLong_FromSsize_t(result); 7740} 7741 7742PyDoc_STRVAR(rindex__doc__, 7743"S.rindex(sub[, start[, end]]) -> int\n\ 7744\n\ 7745Like S.rfind() but raise ValueError when the substring is not found."); 7746 7747static PyObject * 7748unicode_rindex(PyUnicodeObject *self, PyObject *args) 7749{ 7750 PyObject *substring; 7751 Py_ssize_t start; 7752 Py_ssize_t end; 7753 Py_ssize_t result; 7754 7755 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7756 return NULL; 7757 7758 result = stringlib_rfind_slice( 7759 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7760 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7761 start, end 7762 ); 7763 7764 Py_DECREF(substring); 7765 7766 if (result < 0) { 7767 PyErr_SetString(PyExc_ValueError, "substring not found"); 7768 return NULL; 7769 } 7770 return PyLong_FromSsize_t(result); 7771} 7772 7773PyDoc_STRVAR(rjust__doc__, 7774"S.rjust(width[, fillchar]) -> str\n\ 7775\n\ 7776Return S right justified in a string of length width. 
Padding is\n\ 7777done using the specified fill character (default is a space)."); 7778 7779static PyObject * 7780unicode_rjust(PyUnicodeObject *self, PyObject *args) 7781{ 7782 Py_ssize_t width; 7783 Py_UNICODE fillchar = ' '; 7784 7785 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7786 return NULL; 7787 7788 if (self->length >= width && PyUnicode_CheckExact(self)) { 7789 Py_INCREF(self); 7790 return (PyObject*) self; 7791 } 7792 7793 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7794} 7795 7796PyObject *PyUnicode_Split(PyObject *s, 7797 PyObject *sep, 7798 Py_ssize_t maxsplit) 7799{ 7800 PyObject *result; 7801 7802 s = PyUnicode_FromObject(s); 7803 if (s == NULL) 7804 return NULL; 7805 if (sep != NULL) { 7806 sep = PyUnicode_FromObject(sep); 7807 if (sep == NULL) { 7808 Py_DECREF(s); 7809 return NULL; 7810 } 7811 } 7812 7813 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7814 7815 Py_DECREF(s); 7816 Py_XDECREF(sep); 7817 return result; 7818} 7819 7820PyDoc_STRVAR(split__doc__, 7821"S.split([sep[, maxsplit]]) -> list of strings\n\ 7822\n\ 7823Return a list of the words in S, using sep as the\n\ 7824delimiter string. If maxsplit is given, at most maxsplit\n\ 7825splits are done. 
If sep is not specified or is None, any\n\ 7826whitespace string is a separator and empty strings are\n\ 7827removed from the result."); 7828 7829static PyObject* 7830unicode_split(PyUnicodeObject *self, PyObject *args) 7831{ 7832 PyObject *substring = Py_None; 7833 Py_ssize_t maxcount = -1; 7834 7835 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7836 return NULL; 7837 7838 if (substring == Py_None) 7839 return split(self, NULL, maxcount); 7840 else if (PyUnicode_Check(substring)) 7841 return split(self, (PyUnicodeObject *)substring, maxcount); 7842 else 7843 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7844} 7845 7846PyObject * 7847PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7848{ 7849 PyObject* str_obj; 7850 PyObject* sep_obj; 7851 PyObject* out; 7852 7853 str_obj = PyUnicode_FromObject(str_in); 7854 if (!str_obj) 7855 return NULL; 7856 sep_obj = PyUnicode_FromObject(sep_in); 7857 if (!sep_obj) { 7858 Py_DECREF(str_obj); 7859 return NULL; 7860 } 7861 7862 out = stringlib_partition( 7863 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7864 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7865 ); 7866 7867 Py_DECREF(sep_obj); 7868 Py_DECREF(str_obj); 7869 7870 return out; 7871} 7872 7873 7874PyObject * 7875PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7876{ 7877 PyObject* str_obj; 7878 PyObject* sep_obj; 7879 PyObject* out; 7880 7881 str_obj = PyUnicode_FromObject(str_in); 7882 if (!str_obj) 7883 return NULL; 7884 sep_obj = PyUnicode_FromObject(sep_in); 7885 if (!sep_obj) { 7886 Py_DECREF(str_obj); 7887 return NULL; 7888 } 7889 7890 out = stringlib_rpartition( 7891 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7892 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7893 ); 7894 7895 Py_DECREF(sep_obj); 7896 Py_DECREF(str_obj); 7897 7898 return out; 7899} 7900 7901PyDoc_STRVAR(partition__doc__, 7902"S.partition(sep) -> (head, 
sep, tail)\n\ 7903\n\ 7904Search for the separator sep in S, and return the part before it,\n\ 7905the separator itself, and the part after it. If the separator is not\n\ 7906found, returns S and two empty strings."); 7907 7908static PyObject* 7909unicode_partition(PyUnicodeObject *self, PyObject *separator) 7910{ 7911 return PyUnicode_Partition((PyObject *)self, separator); 7912} 7913 7914PyDoc_STRVAR(rpartition__doc__, 7915"S.rpartition(sep) -> (tail, sep, head)\n\ 7916\n\ 7917Search for the separator sep in S, starting at the end of S, and return\n\ 7918the part before it, the separator itself, and the part after it. If the\n\ 7919separator is not found, returns two empty strings and S."); 7920 7921static PyObject* 7922unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7923{ 7924 return PyUnicode_RPartition((PyObject *)self, separator); 7925} 7926 7927PyObject *PyUnicode_RSplit(PyObject *s, 7928 PyObject *sep, 7929 Py_ssize_t maxsplit) 7930{ 7931 PyObject *result; 7932 7933 s = PyUnicode_FromObject(s); 7934 if (s == NULL) 7935 return NULL; 7936 if (sep != NULL) { 7937 sep = PyUnicode_FromObject(sep); 7938 if (sep == NULL) { 7939 Py_DECREF(s); 7940 return NULL; 7941 } 7942 } 7943 7944 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7945 7946 Py_DECREF(s); 7947 Py_XDECREF(sep); 7948 return result; 7949} 7950 7951PyDoc_STRVAR(rsplit__doc__, 7952"S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 7953\n\ 7954Return a list of the words in S, using sep as the\n\ 7955delimiter string, starting at the end of the string and\n\ 7956working to the front. If maxsplit is given, at most maxsplit\n\ 7957splits are done. 
If sep is not specified, any whitespace string\n\ 7958is a separator."); 7959 7960static PyObject* 7961unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7962{ 7963 PyObject *substring = Py_None; 7964 Py_ssize_t maxcount = -1; 7965 7966 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7967 return NULL; 7968 7969 if (substring == Py_None) 7970 return rsplit(self, NULL, maxcount); 7971 else if (PyUnicode_Check(substring)) 7972 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7973 else 7974 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7975} 7976 7977PyDoc_STRVAR(splitlines__doc__, 7978"S.splitlines([keepends]]) -> list of strings\n\ 7979\n\ 7980Return a list of the lines in S, breaking at line boundaries.\n\ 7981Line breaks are not included in the resulting list unless keepends\n\ 7982is given and true."); 7983 7984static PyObject* 7985unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7986{ 7987 int keepends = 0; 7988 7989 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7990 return NULL; 7991 7992 return PyUnicode_Splitlines((PyObject *)self, keepends); 7993} 7994 7995static 7996PyObject *unicode_str(PyObject *self) 7997{ 7998 if (PyUnicode_CheckExact(self)) { 7999 Py_INCREF(self); 8000 return self; 8001 } else 8002 /* Subtype -- return genuine unicode string with the same value. 
*/ 8003 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8004 PyUnicode_GET_SIZE(self)); 8005} 8006 8007PyDoc_STRVAR(swapcase__doc__, 8008"S.swapcase() -> str\n\ 8009\n\ 8010Return a copy of S with uppercase characters converted to lowercase\n\ 8011and vice versa."); 8012 8013static PyObject* 8014unicode_swapcase(PyUnicodeObject *self) 8015{ 8016 return fixup(self, fixswapcase); 8017} 8018 8019PyDoc_STRVAR(maketrans__doc__, 8020"str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8021\n\ 8022Return a translation table usable for str.translate().\n\ 8023If there is only one argument, it must be a dictionary mapping Unicode\n\ 8024ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8025Character keys will be then converted to ordinals.\n\ 8026If there are two arguments, they must be strings of equal length, and\n\ 8027in the resulting dictionary, each character in x will be mapped to the\n\ 8028character at the same position in y. If there is a third argument, it\n\ 8029must be a string, whose characters will be mapped to None in the result."); 8030 8031static PyObject* 8032unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8033{ 8034 PyObject *x, *y = NULL, *z = NULL; 8035 PyObject *new = NULL, *key, *value; 8036 Py_ssize_t i = 0; 8037 int res; 8038 8039 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8040 return NULL; 8041 new = PyDict_New(); 8042 if (!new) 8043 return NULL; 8044 if (y != NULL) { 8045 /* x must be a string too, of equal length */ 8046 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8047 if (!PyUnicode_Check(x)) { 8048 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8049 "be a string if there is a second argument"); 8050 goto err; 8051 } 8052 if (PyUnicode_GET_SIZE(x) != ylen) { 8053 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8054 "arguments must have equal length"); 8055 goto err; 8056 } 8057 /* create entries for translating chars in x to those in y */ 8058 for (i = 0; 
i < PyUnicode_GET_SIZE(x); i++) { 8059 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8060 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8061 if (!key || !value) 8062 goto err; 8063 res = PyDict_SetItem(new, key, value); 8064 Py_DECREF(key); 8065 Py_DECREF(value); 8066 if (res < 0) 8067 goto err; 8068 } 8069 /* create entries for deleting chars in z */ 8070 if (z != NULL) { 8071 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8072 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8073 if (!key) 8074 goto err; 8075 res = PyDict_SetItem(new, key, Py_None); 8076 Py_DECREF(key); 8077 if (res < 0) 8078 goto err; 8079 } 8080 } 8081 } else { 8082 /* x must be a dict */ 8083 if (!PyDict_Check(x)) { 8084 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8085 "to maketrans it must be a dict"); 8086 goto err; 8087 } 8088 /* copy entries into the new dict, converting string keys to int keys */ 8089 while (PyDict_Next(x, &i, &key, &value)) { 8090 if (PyUnicode_Check(key)) { 8091 /* convert string keys to integer keys */ 8092 PyObject *newkey; 8093 if (PyUnicode_GET_SIZE(key) != 1) { 8094 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8095 "table must be of length 1"); 8096 goto err; 8097 } 8098 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8099 if (!newkey) 8100 goto err; 8101 res = PyDict_SetItem(new, newkey, value); 8102 Py_DECREF(newkey); 8103 if (res < 0) 8104 goto err; 8105 } else if (PyLong_Check(key)) { 8106 /* just keep integer keys */ 8107 if (PyDict_SetItem(new, key, value) < 0) 8108 goto err; 8109 } else { 8110 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8111 "be strings or integers"); 8112 goto err; 8113 } 8114 } 8115 } 8116 return new; 8117 err: 8118 Py_DECREF(new); 8119 return NULL; 8120} 8121 8122PyDoc_STRVAR(translate__doc__, 8123"S.translate(table) -> str\n\ 8124\n\ 8125Return a copy of the string S, where all characters have been mapped\n\ 8126through the given translation table, which 
must be a mapping of\n\ 8127Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8128Unmapped characters are left untouched. Characters mapped to None\n\ 8129are deleted."); 8130 8131static PyObject* 8132unicode_translate(PyUnicodeObject *self, PyObject *table) 8133{ 8134 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8135} 8136 8137PyDoc_STRVAR(upper__doc__, 8138"S.upper() -> str\n\ 8139\n\ 8140Return a copy of S converted to uppercase."); 8141 8142static PyObject* 8143unicode_upper(PyUnicodeObject *self) 8144{ 8145 return fixup(self, fixupper); 8146} 8147 8148PyDoc_STRVAR(zfill__doc__, 8149"S.zfill(width) -> str\n\ 8150\n\ 8151Pad a numeric string x with zeros on the left, to fill a field\n\ 8152of the specified width. The string x is never truncated."); 8153 8154static PyObject * 8155unicode_zfill(PyUnicodeObject *self, PyObject *args) 8156{ 8157 Py_ssize_t fill; 8158 PyUnicodeObject *u; 8159 8160 Py_ssize_t width; 8161 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8162 return NULL; 8163 8164 if (self->length >= width) { 8165 if (PyUnicode_CheckExact(self)) { 8166 Py_INCREF(self); 8167 return (PyObject*) self; 8168 } 8169 else 8170 return PyUnicode_FromUnicode( 8171 PyUnicode_AS_UNICODE(self), 8172 PyUnicode_GET_SIZE(self) 8173 ); 8174 } 8175 8176 fill = width - self->length; 8177 8178 u = pad(self, fill, 0, '0'); 8179 8180 if (u == NULL) 8181 return NULL; 8182 8183 if (u->str[fill] == '+' || u->str[fill] == '-') { 8184 /* move sign to beginning of string */ 8185 u->str[0] = u->str[fill]; 8186 u->str[fill] = '0'; 8187 } 8188 8189 return (PyObject*) u; 8190} 8191 8192#if 0 8193static PyObject* 8194unicode_freelistsize(PyUnicodeObject *self) 8195{ 8196 return PyLong_FromLong(numfree); 8197} 8198#endif 8199 8200PyDoc_STRVAR(startswith__doc__, 8201"S.startswith(prefix[, start[, end]]) -> bool\n\ 8202\n\ 8203Return True if S starts with the specified prefix, False otherwise.\n\ 8204With optional start, test S beginning at that 
position.\n\ 8205With optional end, stop comparing S at that position.\n\ 8206prefix can also be a tuple of strings to try."); 8207 8208static PyObject * 8209unicode_startswith(PyUnicodeObject *self, 8210 PyObject *args) 8211{ 8212 PyObject *subobj; 8213 PyUnicodeObject *substring; 8214 Py_ssize_t start = 0; 8215 Py_ssize_t end = PY_SSIZE_T_MAX; 8216 int result; 8217 8218 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8219 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8220 return NULL; 8221 if (PyTuple_Check(subobj)) { 8222 Py_ssize_t i; 8223 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8224 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8225 PyTuple_GET_ITEM(subobj, i)); 8226 if (substring == NULL) 8227 return NULL; 8228 result = tailmatch(self, substring, start, end, -1); 8229 Py_DECREF(substring); 8230 if (result) { 8231 Py_RETURN_TRUE; 8232 } 8233 } 8234 /* nothing matched */ 8235 Py_RETURN_FALSE; 8236 } 8237 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8238 if (substring == NULL) 8239 return NULL; 8240 result = tailmatch(self, substring, start, end, -1); 8241 Py_DECREF(substring); 8242 return PyBool_FromLong(result); 8243} 8244 8245 8246PyDoc_STRVAR(endswith__doc__, 8247"S.endswith(suffix[, start[, end]]) -> bool\n\ 8248\n\ 8249Return True if S ends with the specified suffix, False otherwise.\n\ 8250With optional start, test S beginning at that position.\n\ 8251With optional end, stop comparing S at that position.\n\ 8252suffix can also be a tuple of strings to try."); 8253 8254static PyObject * 8255unicode_endswith(PyUnicodeObject *self, 8256 PyObject *args) 8257{ 8258 PyObject *subobj; 8259 PyUnicodeObject *substring; 8260 Py_ssize_t start = 0; 8261 Py_ssize_t end = PY_SSIZE_T_MAX; 8262 int result; 8263 8264 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8265 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8266 return NULL; 8267 if (PyTuple_Check(subobj)) { 8268 Py_ssize_t i; 8269 for (i = 0; i < 
PyTuple_GET_SIZE(subobj); i++) { 8270 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8271 PyTuple_GET_ITEM(subobj, i)); 8272 if (substring == NULL) 8273 return NULL; 8274 result = tailmatch(self, substring, start, end, +1); 8275 Py_DECREF(substring); 8276 if (result) { 8277 Py_RETURN_TRUE; 8278 } 8279 } 8280 Py_RETURN_FALSE; 8281 } 8282 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8283 if (substring == NULL) 8284 return NULL; 8285 8286 result = tailmatch(self, substring, start, end, +1); 8287 Py_DECREF(substring); 8288 return PyBool_FromLong(result); 8289} 8290 8291#include "stringlib/string_format.h" 8292 8293PyDoc_STRVAR(format__doc__, 8294"S.format(*args, **kwargs) -> str\n\ 8295\n\ 8296"); 8297 8298static PyObject * 8299unicode__format__(PyObject* self, PyObject* args) 8300{ 8301 PyObject *format_spec; 8302 8303 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 8304 return NULL; 8305 8306 return _PyUnicode_FormatAdvanced(self, 8307 PyUnicode_AS_UNICODE(format_spec), 8308 PyUnicode_GET_SIZE(format_spec)); 8309} 8310 8311PyDoc_STRVAR(p_format__doc__, 8312"S.__format__(format_spec) -> str\n\ 8313\n\ 8314"); 8315 8316static PyObject * 8317unicode__sizeof__(PyUnicodeObject *v) 8318{ 8319 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 8320 sizeof(Py_UNICODE) * (v->length + 1)); 8321} 8322 8323PyDoc_STRVAR(sizeof__doc__, 8324"S.__sizeof__() -> size of S in memory, in bytes"); 8325 8326static PyObject * 8327unicode_getnewargs(PyUnicodeObject *v) 8328{ 8329 return Py_BuildValue("(u#)", v->str, v->length); 8330} 8331 8332 8333static PyMethodDef unicode_methods[] = { 8334 8335 /* Order is according to common usage: often used methods should 8336 appear first, since lookup is done sequentially. 
*/ 8337 8338 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 8339 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8340 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8341 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8342 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8343 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8344 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8345 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8346 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8347 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8348 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8349 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8350 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8351 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8352 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8353 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8354 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8355 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8356 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8357 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8358 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8359 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8360 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8361 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8362 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8363 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8364 
{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8365 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8366 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8367 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8368 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8369 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8370 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8371 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8372 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8373 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8374 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8375 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8376 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 8377 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8378 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8379 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 8380 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8381 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8382 {"maketrans", (PyCFunction) unicode_maketrans, 8383 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8384 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 8385#if 0 8386 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8387#endif 8388 8389#if 0 8390 /* This one is just used for debugging the implementation. 
*/ 8391 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8392#endif 8393 8394 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8395 {NULL, NULL} 8396}; 8397 8398static PyObject * 8399unicode_mod(PyObject *v, PyObject *w) 8400{ 8401 if (!PyUnicode_Check(v)) { 8402 Py_INCREF(Py_NotImplemented); 8403 return Py_NotImplemented; 8404 } 8405 return PyUnicode_Format(v, w); 8406} 8407 8408static PyNumberMethods unicode_as_number = { 8409 0, /*nb_add*/ 8410 0, /*nb_subtract*/ 8411 0, /*nb_multiply*/ 8412 unicode_mod, /*nb_remainder*/ 8413}; 8414 8415static PySequenceMethods unicode_as_sequence = { 8416 (lenfunc) unicode_length, /* sq_length */ 8417 PyUnicode_Concat, /* sq_concat */ 8418 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8419 (ssizeargfunc) unicode_getitem, /* sq_item */ 8420 0, /* sq_slice */ 8421 0, /* sq_ass_item */ 8422 0, /* sq_ass_slice */ 8423 PyUnicode_Contains, /* sq_contains */ 8424}; 8425 8426static PyObject* 8427unicode_subscript(PyUnicodeObject* self, PyObject* item) 8428{ 8429 if (PyIndex_Check(item)) { 8430 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8431 if (i == -1 && PyErr_Occurred()) 8432 return NULL; 8433 if (i < 0) 8434 i += PyUnicode_GET_SIZE(self); 8435 return unicode_getitem(self, i); 8436 } else if (PySlice_Check(item)) { 8437 Py_ssize_t start, stop, step, slicelength, cur, i; 8438 Py_UNICODE* source_buf; 8439 Py_UNICODE* result_buf; 8440 PyObject* result; 8441 8442 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8443 &start, &stop, &step, &slicelength) < 0) { 8444 return NULL; 8445 } 8446 8447 if (slicelength <= 0) { 8448 return PyUnicode_FromUnicode(NULL, 0); 8449 } else if (start == 0 && step == 1 && slicelength == self->length && 8450 PyUnicode_CheckExact(self)) { 8451 Py_INCREF(self); 8452 return (PyObject *)self; 8453 } else if (step == 1) { 8454 return PyUnicode_FromUnicode(self->str + start, slicelength); 8455 } else { 8456 source_buf = 
PyUnicode_AS_UNICODE((PyObject*)self); 8457 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8458 sizeof(Py_UNICODE)); 8459 8460 if (result_buf == NULL) 8461 return PyErr_NoMemory(); 8462 8463 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8464 result_buf[i] = source_buf[cur]; 8465 } 8466 8467 result = PyUnicode_FromUnicode(result_buf, slicelength); 8468 PyObject_FREE(result_buf); 8469 return result; 8470 } 8471 } else { 8472 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8473 return NULL; 8474 } 8475} 8476 8477static PyMappingMethods unicode_as_mapping = { 8478 (lenfunc)unicode_length, /* mp_length */ 8479 (binaryfunc)unicode_subscript, /* mp_subscript */ 8480 (objobjargproc)0, /* mp_ass_subscript */ 8481}; 8482 8483 8484/* Helpers for PyUnicode_Format() */ 8485 8486static PyObject * 8487getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8488{ 8489 Py_ssize_t argidx = *p_argidx; 8490 if (argidx < arglen) { 8491 (*p_argidx)++; 8492 if (arglen < 0) 8493 return args; 8494 else 8495 return PyTuple_GetItem(args, argidx); 8496 } 8497 PyErr_SetString(PyExc_TypeError, 8498 "not enough arguments for format string"); 8499 return NULL; 8500} 8501 8502static Py_ssize_t 8503strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8504{ 8505 register Py_ssize_t i; 8506 Py_ssize_t len = strlen(charbuffer); 8507 for (i = len - 1; i >= 0; i--) 8508 buffer[i] = (Py_UNICODE) charbuffer[i]; 8509 8510 return len; 8511} 8512 8513static int 8514doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8515{ 8516 Py_ssize_t result; 8517 8518 PyOS_ascii_formatd((char *)buffer, len, format, x); 8519 result = strtounicode(buffer, (char *)buffer); 8520 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8521} 8522 8523#if 0 8524static int 8525longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8526{ 8527 Py_ssize_t result; 8528 8529 PyOS_snprintf((char *)buffer, len, format, x); 8530 result = 
strtounicode(buffer, (char *)buffer); 8531 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8532} 8533#endif 8534 8535/* XXX To save some code duplication, formatfloat/long/int could have been 8536 shared with stringobject.c, converting from 8-bit to Unicode after the 8537 formatting is done. */ 8538 8539static int 8540formatfloat(Py_UNICODE *buf, 8541 size_t buflen, 8542 int flags, 8543 int prec, 8544 int type, 8545 PyObject *v) 8546{ 8547 /* fmt = '%#.' + `prec` + `type` 8548 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8549 char fmt[20]; 8550 double x; 8551 8552 x = PyFloat_AsDouble(v); 8553 if (x == -1.0 && PyErr_Occurred()) 8554 return -1; 8555 if (prec < 0) 8556 prec = 6; 8557 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8558 type = 'g'; 8559 /* Worst case length calc to ensure no buffer overrun: 8560 8561 'g' formats: 8562 fmt = %#.<prec>g 8563 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8564 for any double rep.) 8565 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8566 8567 'f' formats: 8568 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8569 len = 1 + 50 + 1 + prec = 52 + prec 8570 8571 If prec=0 the effective precision is 1 (the leading digit is 8572 always given), therefore increase the length by one. 8573 8574 */ 8575 if (((type == 'g' || type == 'G') && 8576 buflen <= (size_t)10 + (size_t)prec) || 8577 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8578 PyErr_SetString(PyExc_OverflowError, 8579 "formatted float is too long (precision too large?)"); 8580 return -1; 8581 } 8582 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8583 (flags&F_ALT) ? "#" : "", 8584 prec, type); 8585 return doubletounicode(buf, buflen, fmt, x); 8586} 8587 8588static PyObject* 8589formatlong(PyObject *val, int flags, int prec, int type) 8590{ 8591 char *buf; 8592 int len; 8593 PyObject *str; /* temporary string object. 
*/ 8594 PyObject *result; 8595 8596 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 8597 if (!str) 8598 return NULL; 8599 result = PyUnicode_FromStringAndSize(buf, len); 8600 Py_DECREF(str); 8601 return result; 8602} 8603 8604#if 0 8605static int 8606formatint(Py_UNICODE *buf, 8607 size_t buflen, 8608 int flags, 8609 int prec, 8610 int type, 8611 PyObject *v) 8612{ 8613 /* fmt = '%#.' + `prec` + 'l' + `type` 8614 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8615 * + 1 + 1 8616 * = 24 8617 */ 8618 char fmt[64]; /* plenty big enough! */ 8619 char *sign; 8620 long x; 8621 8622 x = PyLong_AsLong(v); 8623 if (x == -1 && PyErr_Occurred()) 8624 return -1; 8625 if (x < 0 && type == 'u') { 8626 type = 'd'; 8627 } 8628 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8629 sign = "-"; 8630 else 8631 sign = ""; 8632 if (prec < 0) 8633 prec = 1; 8634 8635 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8636 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8637 */ 8638 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8639 PyErr_SetString(PyExc_OverflowError, 8640 "formatted integer is too long (precision too large?)"); 8641 return -1; 8642 } 8643 8644 if ((flags & F_ALT) && 8645 (type == 'x' || type == 'X' || type == 'o')) { 8646 /* When converting under %#o, %#x or %#X, there are a number 8647 * of issues that cause pain: 8648 * - for %#o, we want a different base marker than C 8649 * - when 0 is being converted, the C standard leaves off 8650 * the '0x' or '0X', which is inconsistent with other 8651 * %#x/%#X conversions and inconsistent with Python's 8652 * hex() function 8653 * - there are platforms that violate the standard and 8654 * convert 0 with the '0x' or '0X' 8655 * (Metrowerks, Compaq Tru64) 8656 * - there are platforms that give '0x' when converting 8657 * under %#X, but convert 0 in accordance with the 8658 * standard (OS/2 EMX) 8659 * 8660 * We can achieve the desired 
consistency by inserting our 8661 * own '0x' or '0X' prefix, and substituting %x/%X in place 8662 * of %#x/%#X. 8663 * 8664 * Note that this is the same approach as used in 8665 * formatint() in stringobject.c 8666 */ 8667 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8668 sign, type, prec, type); 8669 } 8670 else { 8671 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8672 sign, (flags&F_ALT) ? "#" : "", 8673 prec, type); 8674 } 8675 if (sign[0]) 8676 return longtounicode(buf, buflen, fmt, -x); 8677 else 8678 return longtounicode(buf, buflen, fmt, x); 8679} 8680#endif 8681 8682static int 8683formatchar(Py_UNICODE *buf, 8684 size_t buflen, 8685 PyObject *v) 8686{ 8687 /* presume that the buffer is at least 3 characters long */ 8688 if (PyUnicode_Check(v)) { 8689 if (PyUnicode_GET_SIZE(v) == 1) { 8690 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8691 buf[1] = '\0'; 8692 return 1; 8693 } 8694#ifndef Py_UNICODE_WIDE 8695 if (PyUnicode_GET_SIZE(v) == 2) { 8696 /* Decode a valid surrogate pair */ 8697 int c0 = PyUnicode_AS_UNICODE(v)[0]; 8698 int c1 = PyUnicode_AS_UNICODE(v)[1]; 8699 if (0xD800 <= c0 && c0 <= 0xDBFF && 8700 0xDC00 <= c1 && c1 <= 0xDFFF) { 8701 buf[0] = c0; 8702 buf[1] = c1; 8703 buf[2] = '\0'; 8704 return 2; 8705 } 8706 } 8707#endif 8708 goto onError; 8709 } 8710 else { 8711 /* Integer input truncated to a character */ 8712 long x; 8713 x = PyLong_AsLong(v); 8714 if (x == -1 && PyErr_Occurred()) 8715 goto onError; 8716 8717 if (x < 0 || x > 0x10ffff) { 8718 PyErr_SetString(PyExc_OverflowError, 8719 "%c arg not in range(0x110000)"); 8720 return -1; 8721 } 8722 8723#ifndef Py_UNICODE_WIDE 8724 if (x > 0xffff) { 8725 x -= 0x10000; 8726 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 8727 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 8728 return 2; 8729 } 8730#endif 8731 buf[0] = (Py_UNICODE) x; 8732 buf[1] = '\0'; 8733 return 1; 8734 } 8735 8736 onError: 8737 PyErr_SetString(PyExc_TypeError, 8738 "%c requires int or char"); 8739 return -1; 8740} 8741 8742/* 
fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8743 8744 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8745 chars are formatted. XXX This is a magic number. Each formatting 8746 routine does bounds checking to ensure no overflow, but a better 8747 solution may be to malloc a buffer of appropriate size for each 8748 format. For now, the current solution is sufficient. 8749*/ 8750#define FORMATBUFLEN (size_t)120 8751 8752PyObject *PyUnicode_Format(PyObject *format, 8753 PyObject *args) 8754{ 8755 Py_UNICODE *fmt, *res; 8756 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8757 int args_owned = 0; 8758 PyUnicodeObject *result = NULL; 8759 PyObject *dict = NULL; 8760 PyObject *uformat; 8761 8762 if (format == NULL || args == NULL) { 8763 PyErr_BadInternalCall(); 8764 return NULL; 8765 } 8766 uformat = PyUnicode_FromObject(format); 8767 if (uformat == NULL) 8768 return NULL; 8769 fmt = PyUnicode_AS_UNICODE(uformat); 8770 fmtcnt = PyUnicode_GET_SIZE(uformat); 8771 8772 reslen = rescnt = fmtcnt + 100; 8773 result = _PyUnicode_New(reslen); 8774 if (result == NULL) 8775 goto onError; 8776 res = PyUnicode_AS_UNICODE(result); 8777 8778 if (PyTuple_Check(args)) { 8779 arglen = PyTuple_Size(args); 8780 argidx = 0; 8781 } 8782 else { 8783 arglen = -1; 8784 argidx = -2; 8785 } 8786 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 8787 !PyUnicode_Check(args)) 8788 dict = args; 8789 8790 while (--fmtcnt >= 0) { 8791 if (*fmt != '%') { 8792 if (--rescnt < 0) { 8793 rescnt = fmtcnt + 100; 8794 reslen += rescnt; 8795 if (_PyUnicode_Resize(&result, reslen) < 0) 8796 goto onError; 8797 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8798 --rescnt; 8799 } 8800 *res++ = *fmt++; 8801 } 8802 else { 8803 /* Got a format specifier */ 8804 int flags = 0; 8805 Py_ssize_t width = -1; 8806 int prec = -1; 8807 Py_UNICODE c = '\0'; 8808 Py_UNICODE fill; 8809 int isnumok; 8810 PyObject *v = NULL; 8811 PyObject *temp = NULL; 8812 
Py_UNICODE *pbuf; 8813 Py_UNICODE sign; 8814 Py_ssize_t len; 8815 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8816 8817 fmt++; 8818 if (*fmt == '(') { 8819 Py_UNICODE *keystart; 8820 Py_ssize_t keylen; 8821 PyObject *key; 8822 int pcount = 1; 8823 8824 if (dict == NULL) { 8825 PyErr_SetString(PyExc_TypeError, 8826 "format requires a mapping"); 8827 goto onError; 8828 } 8829 ++fmt; 8830 --fmtcnt; 8831 keystart = fmt; 8832 /* Skip over balanced parentheses */ 8833 while (pcount > 0 && --fmtcnt >= 0) { 8834 if (*fmt == ')') 8835 --pcount; 8836 else if (*fmt == '(') 8837 ++pcount; 8838 fmt++; 8839 } 8840 keylen = fmt - keystart - 1; 8841 if (fmtcnt < 0 || pcount > 0) { 8842 PyErr_SetString(PyExc_ValueError, 8843 "incomplete format key"); 8844 goto onError; 8845 } 8846#if 0 8847 /* keys are converted to strings using UTF-8 and 8848 then looked up since Python uses strings to hold 8849 variables names etc. in its namespaces and we 8850 wouldn't want to break common idioms. 
*/ 8851 key = PyUnicode_EncodeUTF8(keystart, 8852 keylen, 8853 NULL); 8854#else 8855 key = PyUnicode_FromUnicode(keystart, keylen); 8856#endif 8857 if (key == NULL) 8858 goto onError; 8859 if (args_owned) { 8860 Py_DECREF(args); 8861 args_owned = 0; 8862 } 8863 args = PyObject_GetItem(dict, key); 8864 Py_DECREF(key); 8865 if (args == NULL) { 8866 goto onError; 8867 } 8868 args_owned = 1; 8869 arglen = -1; 8870 argidx = -2; 8871 } 8872 while (--fmtcnt >= 0) { 8873 switch (c = *fmt++) { 8874 case '-': flags |= F_LJUST; continue; 8875 case '+': flags |= F_SIGN; continue; 8876 case ' ': flags |= F_BLANK; continue; 8877 case '#': flags |= F_ALT; continue; 8878 case '0': flags |= F_ZERO; continue; 8879 } 8880 break; 8881 } 8882 if (c == '*') { 8883 v = getnextarg(args, arglen, &argidx); 8884 if (v == NULL) 8885 goto onError; 8886 if (!PyLong_Check(v)) { 8887 PyErr_SetString(PyExc_TypeError, 8888 "* wants int"); 8889 goto onError; 8890 } 8891 width = PyLong_AsLong(v); 8892 if (width == -1 && PyErr_Occurred()) 8893 goto onError; 8894 if (width < 0) { 8895 flags |= F_LJUST; 8896 width = -width; 8897 } 8898 if (--fmtcnt >= 0) 8899 c = *fmt++; 8900 } 8901 else if (c >= '0' && c <= '9') { 8902 width = c - '0'; 8903 while (--fmtcnt >= 0) { 8904 c = *fmt++; 8905 if (c < '0' || c > '9') 8906 break; 8907 if ((width*10) / 10 != width) { 8908 PyErr_SetString(PyExc_ValueError, 8909 "width too big"); 8910 goto onError; 8911 } 8912 width = width*10 + (c - '0'); 8913 } 8914 } 8915 if (c == '.') { 8916 prec = 0; 8917 if (--fmtcnt >= 0) 8918 c = *fmt++; 8919 if (c == '*') { 8920 v = getnextarg(args, arglen, &argidx); 8921 if (v == NULL) 8922 goto onError; 8923 if (!PyLong_Check(v)) { 8924 PyErr_SetString(PyExc_TypeError, 8925 "* wants int"); 8926 goto onError; 8927 } 8928 prec = PyLong_AsLong(v); 8929 if (prec == -1 && PyErr_Occurred()) 8930 goto onError; 8931 if (prec < 0) 8932 prec = 0; 8933 if (--fmtcnt >= 0) 8934 c = *fmt++; 8935 } 8936 else if (c >= '0' && c <= '9') { 8937 prec = c - 
'0'; 8938 while (--fmtcnt >= 0) { 8939 c = Py_CHARMASK(*fmt++); 8940 if (c < '0' || c > '9') 8941 break; 8942 if ((prec*10) / 10 != prec) { 8943 PyErr_SetString(PyExc_ValueError, 8944 "prec too big"); 8945 goto onError; 8946 } 8947 prec = prec*10 + (c - '0'); 8948 } 8949 } 8950 } /* prec */ 8951 if (fmtcnt >= 0) { 8952 if (c == 'h' || c == 'l' || c == 'L') { 8953 if (--fmtcnt >= 0) 8954 c = *fmt++; 8955 } 8956 } 8957 if (fmtcnt < 0) { 8958 PyErr_SetString(PyExc_ValueError, 8959 "incomplete format"); 8960 goto onError; 8961 } 8962 if (c != '%') { 8963 v = getnextarg(args, arglen, &argidx); 8964 if (v == NULL) 8965 goto onError; 8966 } 8967 sign = 0; 8968 fill = ' '; 8969 switch (c) { 8970 8971 case '%': 8972 pbuf = formatbuf; 8973 /* presume that buffer length is at least 1 */ 8974 pbuf[0] = '%'; 8975 len = 1; 8976 break; 8977 8978 case 's': 8979 case 'r': 8980 case 'a': 8981 if (PyUnicode_Check(v) && c == 's') { 8982 temp = v; 8983 Py_INCREF(temp); 8984 } 8985 else { 8986 if (c == 's') 8987 temp = PyObject_Str(v); 8988 else if (c == 'r') 8989 temp = PyObject_Repr(v); 8990 else 8991 temp = PyObject_ASCII(v); 8992 if (temp == NULL) 8993 goto onError; 8994 if (PyUnicode_Check(temp)) 8995 /* nothing to do */; 8996 else { 8997 Py_DECREF(temp); 8998 PyErr_SetString(PyExc_TypeError, 8999 "%s argument has non-string str()"); 9000 goto onError; 9001 } 9002 } 9003 pbuf = PyUnicode_AS_UNICODE(temp); 9004 len = PyUnicode_GET_SIZE(temp); 9005 if (prec >= 0 && len > prec) 9006 len = prec; 9007 break; 9008 9009 case 'i': 9010 case 'd': 9011 case 'u': 9012 case 'o': 9013 case 'x': 9014 case 'X': 9015 if (c == 'i') 9016 c = 'd'; 9017 isnumok = 0; 9018 if (PyNumber_Check(v)) { 9019 PyObject *iobj=NULL; 9020 9021 if (PyLong_Check(v)) { 9022 iobj = v; 9023 Py_INCREF(iobj); 9024 } 9025 else { 9026 iobj = PyNumber_Long(v); 9027 } 9028 if (iobj!=NULL) { 9029 if (PyLong_Check(iobj)) { 9030 isnumok = 1; 9031 temp = formatlong(iobj, flags, prec, c); 9032 Py_DECREF(iobj); 9033 if (!temp) 
9034 goto onError; 9035 pbuf = PyUnicode_AS_UNICODE(temp); 9036 len = PyUnicode_GET_SIZE(temp); 9037 sign = 1; 9038 } 9039 else { 9040 Py_DECREF(iobj); 9041 } 9042 } 9043 } 9044 if (!isnumok) { 9045 PyErr_Format(PyExc_TypeError, 9046 "%%%c format: a number is required, " 9047 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9048 goto onError; 9049 } 9050 if (flags & F_ZERO) 9051 fill = '0'; 9052 break; 9053 9054 case 'e': 9055 case 'E': 9056 case 'f': 9057 case 'F': 9058 case 'g': 9059 case 'G': 9060 if (c == 'F') 9061 c = 'f'; 9062 pbuf = formatbuf; 9063 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 9064 flags, prec, c, v); 9065 if (len < 0) 9066 goto onError; 9067 sign = 1; 9068 if (flags & F_ZERO) 9069 fill = '0'; 9070 break; 9071 9072 case 'c': 9073 pbuf = formatbuf; 9074 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9075 if (len < 0) 9076 goto onError; 9077 break; 9078 9079 default: 9080 PyErr_Format(PyExc_ValueError, 9081 "unsupported format character '%c' (0x%x) " 9082 "at index %zd", 9083 (31<=c && c<=126) ? 
(char)c : '?', 9084 (int)c, 9085 (Py_ssize_t)(fmt - 1 - 9086 PyUnicode_AS_UNICODE(uformat))); 9087 goto onError; 9088 } 9089 if (sign) { 9090 if (*pbuf == '-' || *pbuf == '+') { 9091 sign = *pbuf++; 9092 len--; 9093 } 9094 else if (flags & F_SIGN) 9095 sign = '+'; 9096 else if (flags & F_BLANK) 9097 sign = ' '; 9098 else 9099 sign = 0; 9100 } 9101 if (width < len) 9102 width = len; 9103 if (rescnt - (sign != 0) < width) { 9104 reslen -= rescnt; 9105 rescnt = width + fmtcnt + 100; 9106 reslen += rescnt; 9107 if (reslen < 0) { 9108 Py_XDECREF(temp); 9109 PyErr_NoMemory(); 9110 goto onError; 9111 } 9112 if (_PyUnicode_Resize(&result, reslen) < 0) { 9113 Py_XDECREF(temp); 9114 goto onError; 9115 } 9116 res = PyUnicode_AS_UNICODE(result) 9117 + reslen - rescnt; 9118 } 9119 if (sign) { 9120 if (fill != ' ') 9121 *res++ = sign; 9122 rescnt--; 9123 if (width > len) 9124 width--; 9125 } 9126 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9127 assert(pbuf[0] == '0'); 9128 assert(pbuf[1] == c); 9129 if (fill != ' ') { 9130 *res++ = *pbuf++; 9131 *res++ = *pbuf++; 9132 } 9133 rescnt -= 2; 9134 width -= 2; 9135 if (width < 0) 9136 width = 0; 9137 len -= 2; 9138 } 9139 if (width > len && !(flags & F_LJUST)) { 9140 do { 9141 --rescnt; 9142 *res++ = fill; 9143 } while (--width > len); 9144 } 9145 if (fill == ' ') { 9146 if (sign) 9147 *res++ = sign; 9148 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9149 assert(pbuf[0] == '0'); 9150 assert(pbuf[1] == c); 9151 *res++ = *pbuf++; 9152 *res++ = *pbuf++; 9153 } 9154 } 9155 Py_UNICODE_COPY(res, pbuf, len); 9156 res += len; 9157 rescnt -= len; 9158 while (--width >= len) { 9159 --rescnt; 9160 *res++ = ' '; 9161 } 9162 if (dict && (argidx < arglen) && c != '%') { 9163 PyErr_SetString(PyExc_TypeError, 9164 "not all arguments converted during string formatting"); 9165 Py_XDECREF(temp); 9166 goto onError; 9167 } 9168 Py_XDECREF(temp); 9169 } /* '%' */ 9170 } /* until end */ 9171 if (argidx < arglen && !dict) { 
9172 PyErr_SetString(PyExc_TypeError, 9173 "not all arguments converted during string formatting"); 9174 goto onError; 9175 } 9176 9177 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9178 goto onError; 9179 if (args_owned) { 9180 Py_DECREF(args); 9181 } 9182 Py_DECREF(uformat); 9183 return (PyObject *)result; 9184 9185 onError: 9186 Py_XDECREF(result); 9187 Py_DECREF(uformat); 9188 if (args_owned) { 9189 Py_DECREF(args); 9190 } 9191 return NULL; 9192} 9193 9194static PyObject * 9195unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9196 9197static PyObject * 9198unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9199{ 9200 PyObject *x = NULL; 9201 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9202 char *encoding = NULL; 9203 char *errors = NULL; 9204 9205 if (type != &PyUnicode_Type) 9206 return unicode_subtype_new(type, args, kwds); 9207 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9208 kwlist, &x, &encoding, &errors)) 9209 return NULL; 9210 if (x == NULL) 9211 return (PyObject *)_PyUnicode_New(0); 9212 if (encoding == NULL && errors == NULL) 9213 return PyObject_Str(x); 9214 else 9215 return PyUnicode_FromEncodedObject(x, encoding, errors); 9216} 9217 9218static PyObject * 9219unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9220{ 9221 PyUnicodeObject *tmp, *pnew; 9222 Py_ssize_t n; 9223 9224 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9225 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9226 if (tmp == NULL) 9227 return NULL; 9228 assert(PyUnicode_Check(tmp)); 9229 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9230 if (pnew == NULL) { 9231 Py_DECREF(tmp); 9232 return NULL; 9233 } 9234 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9235 if (pnew->str == NULL) { 9236 _Py_ForgetReference((PyObject *)pnew); 9237 PyObject_Del(pnew); 9238 Py_DECREF(tmp); 9239 return PyErr_NoMemory(); 9240 } 9241 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9242 pnew->length = n; 9243 pnew->hash = tmp->hash; 9244 Py_DECREF(tmp); 9245 return (PyObject *)pnew; 9246} 9247 9248PyDoc_STRVAR(unicode_doc, 9249"str(string[, encoding[, errors]]) -> str\n\ 9250\n\ 9251Create a new string object from the given encoded string.\n\ 9252encoding defaults to the current default string encoding.\n\ 9253errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9254 9255static PyObject *unicode_iter(PyObject *seq); 9256 9257PyTypeObject PyUnicode_Type = { 9258 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9259 "str", /* tp_name */ 9260 sizeof(PyUnicodeObject), /* tp_size */ 9261 0, /* tp_itemsize */ 9262 /* Slots */ 9263 (destructor)unicode_dealloc, /* tp_dealloc */ 9264 0, /* tp_print */ 9265 0, /* tp_getattr */ 9266 0, /* tp_setattr */ 9267 0, /* tp_compare */ 9268 unicode_repr, /* tp_repr */ 9269 &unicode_as_number, /* tp_as_number */ 9270 &unicode_as_sequence, /* tp_as_sequence */ 9271 &unicode_as_mapping, /* tp_as_mapping */ 9272 (hashfunc) unicode_hash, /* tp_hash*/ 9273 0, /* tp_call*/ 9274 (reprfunc) unicode_str, /* tp_str */ 9275 PyObject_GenericGetAttr, /* tp_getattro */ 9276 0, /* tp_setattro */ 9277 0, /* tp_as_buffer */ 9278 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9279 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9280 unicode_doc, /* tp_doc */ 9281 0, /* tp_traverse */ 9282 0, /* tp_clear */ 9283 PyUnicode_RichCompare, /* tp_richcompare */ 9284 0, /* tp_weaklistoffset */ 9285 unicode_iter, /* tp_iter */ 9286 0, /* tp_iternext */ 9287 unicode_methods, /* tp_methods */ 9288 0, /* tp_members */ 9289 0, /* tp_getset */ 9290 &PyBaseObject_Type, /* tp_base */ 9291 0, /* tp_dict */ 9292 0, /* tp_descr_get */ 9293 0, /* tp_descr_set */ 9294 0, /* tp_dictoffset */ 9295 0, /* tp_init */ 9296 0, /* tp_alloc */ 9297 unicode_new, /* tp_new */ 9298 PyObject_Del, /* tp_free */ 9299}; 9300 9301/* Initialize the Unicode implementation */ 9302 9303void _PyUnicode_Init(void) 9304{ 9305 
int i; 9306 9307 /* XXX - move this array to unicodectype.c ? */ 9308 Py_UNICODE linebreak[] = { 9309 0x000A, /* LINE FEED */ 9310 0x000D, /* CARRIAGE RETURN */ 9311 0x001C, /* FILE SEPARATOR */ 9312 0x001D, /* GROUP SEPARATOR */ 9313 0x001E, /* RECORD SEPARATOR */ 9314 0x0085, /* NEXT LINE */ 9315 0x2028, /* LINE SEPARATOR */ 9316 0x2029, /* PARAGRAPH SEPARATOR */ 9317 }; 9318 9319 /* Init the implementation */ 9320 free_list = NULL; 9321 numfree = 0; 9322 unicode_empty = _PyUnicode_New(0); 9323 if (!unicode_empty) 9324 return; 9325 9326 for (i = 0; i < 256; i++) 9327 unicode_latin1[i] = NULL; 9328 if (PyType_Ready(&PyUnicode_Type) < 0) 9329 Py_FatalError("Can't initialize 'unicode'"); 9330 9331 /* initialize the linebreak bloom filter */ 9332 bloom_linebreak = make_bloom_mask( 9333 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9334 ); 9335 9336 PyType_Ready(&EncodingMapType); 9337} 9338 9339/* Finalize the Unicode implementation */ 9340 9341int 9342PyUnicode_ClearFreeList(void) 9343{ 9344 int freelist_size = numfree; 9345 PyUnicodeObject *u; 9346 9347 for (u = free_list; u != NULL;) { 9348 PyUnicodeObject *v = u; 9349 u = *(PyUnicodeObject **)u; 9350 if (v->str) 9351 PyObject_DEL(v->str); 9352 Py_XDECREF(v->defenc); 9353 PyObject_Del(v); 9354 numfree--; 9355 } 9356 free_list = NULL; 9357 assert(numfree == 0); 9358 return freelist_size; 9359} 9360 9361void 9362_PyUnicode_Fini(void) 9363{ 9364 int i; 9365 9366 Py_XDECREF(unicode_empty); 9367 unicode_empty = NULL; 9368 9369 for (i = 0; i < 256; i++) { 9370 if (unicode_latin1[i]) { 9371 Py_DECREF(unicode_latin1[i]); 9372 unicode_latin1[i] = NULL; 9373 } 9374 } 9375 (void)PyUnicode_ClearFreeList(); 9376} 9377 9378void 9379PyUnicode_InternInPlace(PyObject **p) 9380{ 9381 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9382 PyObject *t; 9383 if (s == NULL || !PyUnicode_Check(s)) 9384 Py_FatalError( 9385 "PyUnicode_InternInPlace: unicode strings only please!"); 9386 /* If it's a subclass, we don't really 
know what putting 9387 it in the interned dict might do. */ 9388 if (!PyUnicode_CheckExact(s)) 9389 return; 9390 if (PyUnicode_CHECK_INTERNED(s)) 9391 return; 9392 if (interned == NULL) { 9393 interned = PyDict_New(); 9394 if (interned == NULL) { 9395 PyErr_Clear(); /* Don't leave an exception */ 9396 return; 9397 } 9398 } 9399 /* It might be that the GetItem call fails even 9400 though the key is present in the dictionary, 9401 namely when this happens during a stack overflow. */ 9402 Py_ALLOW_RECURSION 9403 t = PyDict_GetItem(interned, (PyObject *)s); 9404 Py_END_ALLOW_RECURSION 9405 9406 if (t) { 9407 Py_INCREF(t); 9408 Py_DECREF(*p); 9409 *p = t; 9410 return; 9411 } 9412 9413 PyThreadState_GET()->recursion_critical = 1; 9414 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9415 PyErr_Clear(); 9416 PyThreadState_GET()->recursion_critical = 0; 9417 return; 9418 } 9419 PyThreadState_GET()->recursion_critical = 0; 9420 /* The two references in interned are not counted by refcnt. 
9421 The deallocator will take care of this */ 9422 Py_REFCNT(s) -= 2; 9423 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9424} 9425 9426void 9427PyUnicode_InternImmortal(PyObject **p) 9428{ 9429 PyUnicode_InternInPlace(p); 9430 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9431 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9432 Py_INCREF(*p); 9433 } 9434} 9435 9436PyObject * 9437PyUnicode_InternFromString(const char *cp) 9438{ 9439 PyObject *s = PyUnicode_FromString(cp); 9440 if (s == NULL) 9441 return NULL; 9442 PyUnicode_InternInPlace(&s); 9443 return s; 9444} 9445 9446void _Py_ReleaseInternedUnicodeStrings(void) 9447{ 9448 PyObject *keys; 9449 PyUnicodeObject *s; 9450 Py_ssize_t i, n; 9451 Py_ssize_t immortal_size = 0, mortal_size = 0; 9452 9453 if (interned == NULL || !PyDict_Check(interned)) 9454 return; 9455 keys = PyDict_Keys(interned); 9456 if (keys == NULL || !PyList_Check(keys)) { 9457 PyErr_Clear(); 9458 return; 9459 } 9460 9461 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9462 detector, interned unicode strings are not forcibly deallocated; 9463 rather, we give them their stolen references back, and then clear 9464 and DECREF the interned dict. 
*/ 9465 9466 n = PyList_GET_SIZE(keys); 9467 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9468 n); 9469 for (i = 0; i < n; i++) { 9470 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9471 switch (s->state) { 9472 case SSTATE_NOT_INTERNED: 9473 /* XXX Shouldn't happen */ 9474 break; 9475 case SSTATE_INTERNED_IMMORTAL: 9476 Py_REFCNT(s) += 1; 9477 immortal_size += s->length; 9478 break; 9479 case SSTATE_INTERNED_MORTAL: 9480 Py_REFCNT(s) += 2; 9481 mortal_size += s->length; 9482 break; 9483 default: 9484 Py_FatalError("Inconsistent interned string state."); 9485 } 9486 s->state = SSTATE_NOT_INTERNED; 9487 } 9488 fprintf(stderr, "total size of all interned strings: " 9489 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9490 "mortal/immortal\n", mortal_size, immortal_size); 9491 Py_DECREF(keys); 9492 PyDict_Clear(interned); 9493 Py_DECREF(interned); 9494 interned = NULL; 9495} 9496 9497 9498/********************* Unicode Iterator **************************/ 9499 9500typedef struct { 9501 PyObject_HEAD 9502 Py_ssize_t it_index; 9503 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9504} unicodeiterobject; 9505 9506static void 9507unicodeiter_dealloc(unicodeiterobject *it) 9508{ 9509 _PyObject_GC_UNTRACK(it); 9510 Py_XDECREF(it->it_seq); 9511 PyObject_GC_Del(it); 9512} 9513 9514static int 9515unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9516{ 9517 Py_VISIT(it->it_seq); 9518 return 0; 9519} 9520 9521static PyObject * 9522unicodeiter_next(unicodeiterobject *it) 9523{ 9524 PyUnicodeObject *seq; 9525 PyObject *item; 9526 9527 assert(it != NULL); 9528 seq = it->it_seq; 9529 if (seq == NULL) 9530 return NULL; 9531 assert(PyUnicode_Check(seq)); 9532 9533 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9534 item = PyUnicode_FromUnicode( 9535 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9536 if (item != NULL) 9537 ++it->it_index; 9538 return item; 9539 } 9540 9541 Py_DECREF(seq); 9542 it->it_seq = NULL; 
9543 return NULL; 9544} 9545 9546static PyObject * 9547unicodeiter_len(unicodeiterobject *it) 9548{ 9549 Py_ssize_t len = 0; 9550 if (it->it_seq) 9551 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9552 return PyLong_FromSsize_t(len); 9553} 9554 9555PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9556 9557static PyMethodDef unicodeiter_methods[] = { 9558 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9559 length_hint_doc}, 9560 {NULL, NULL} /* sentinel */ 9561}; 9562 9563PyTypeObject PyUnicodeIter_Type = { 9564 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9565 "str_iterator", /* tp_name */ 9566 sizeof(unicodeiterobject), /* tp_basicsize */ 9567 0, /* tp_itemsize */ 9568 /* methods */ 9569 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9570 0, /* tp_print */ 9571 0, /* tp_getattr */ 9572 0, /* tp_setattr */ 9573 0, /* tp_compare */ 9574 0, /* tp_repr */ 9575 0, /* tp_as_number */ 9576 0, /* tp_as_sequence */ 9577 0, /* tp_as_mapping */ 9578 0, /* tp_hash */ 9579 0, /* tp_call */ 9580 0, /* tp_str */ 9581 PyObject_GenericGetAttr, /* tp_getattro */ 9582 0, /* tp_setattro */ 9583 0, /* tp_as_buffer */ 9584 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9585 0, /* tp_doc */ 9586 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9587 0, /* tp_clear */ 9588 0, /* tp_richcompare */ 9589 0, /* tp_weaklistoffset */ 9590 PyObject_SelfIter, /* tp_iter */ 9591 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9592 unicodeiter_methods, /* tp_methods */ 9593 0, 9594}; 9595 9596static PyObject * 9597unicode_iter(PyObject *seq) 9598{ 9599 unicodeiterobject *it; 9600 9601 if (!PyUnicode_Check(seq)) { 9602 PyErr_BadInternalCall(); 9603 return NULL; 9604 } 9605 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9606 if (it == NULL) 9607 return NULL; 9608 it->it_index = 0; 9609 Py_INCREF(seq); 9610 it->it_seq = (PyUnicodeObject *)seq; 9611 _PyObject_GC_TRACK(it); 9612 return (PyObject *)it; 9613} 9614 
9615size_t 9616Py_UNICODE_strlen(const Py_UNICODE *u) 9617{ 9618 int res = 0; 9619 while(*u++) 9620 res++; 9621 return res; 9622} 9623 9624Py_UNICODE* 9625Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9626{ 9627 Py_UNICODE *u = s1; 9628 while ((*u++ = *s2++)); 9629 return s1; 9630} 9631 9632Py_UNICODE* 9633Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9634{ 9635 Py_UNICODE *u = s1; 9636 while ((*u++ = *s2++)) 9637 if (n-- == 0) 9638 break; 9639 return s1; 9640} 9641 9642int 9643Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9644{ 9645 while (*s1 && *s2 && *s1 == *s2) 9646 s1++, s2++; 9647 if (*s1 && *s2) 9648 return (*s1 < *s2) ? -1 : +1; 9649 if (*s1) 9650 return 1; 9651 if (*s2) 9652 return -1; 9653 return 0; 9654} 9655 9656Py_UNICODE* 9657Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9658{ 9659 const Py_UNICODE *p; 9660 for (p = s; *p; p++) 9661 if (*p == c) 9662 return (Py_UNICODE*)p; 9663 return NULL; 9664} 9665 9666 9667#ifdef __cplusplus 9668} 9669#endif 9670 9671 9672/* 9673Local variables: 9674c-basic-offset: 4 9675indent-tabs-mode: nil 9676End: 9677*/ 9678