unicodeobject.c revision c28e1fa71f61278256887d257e4e7e24b0e7e7ce
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Default encoding to use and assume when NULL is passed as encoding 118 parameter; it is fixed to "utf-8". Always use the 119 PyUnicode_GetDefaultEncoding() API to access this global. 120 121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the 122 hard coded default! 123*/ 124static const char unicode_default_encoding[] = "utf-8"; 125 126/* Fast detection of the most frequent whitespace characters */ 127const unsigned char _Py_ascii_whitespace[] = { 128 0, 0, 0, 0, 0, 0, 0, 0, 129// case 0x0009: /* HORIZONTAL TABULATION */ 130// case 0x000A: /* LINE FEED */ 131// case 0x000B: /* VERTICAL TABULATION */ 132// case 0x000C: /* FORM FEED */ 133// case 0x000D: /* CARRIAGE RETURN */ 134 0, 1, 1, 1, 1, 1, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136// case 0x001C: /* FILE SEPARATOR */ 137// case 0x001D: /* GROUP SEPARATOR */ 138// case 0x001E: /* RECORD SEPARATOR */ 139// case 0x001F: /* UNIT SEPARATOR */ 140 0, 0, 0, 0, 1, 1, 1, 1, 141// case 0x0020: /* SPACE */ 142 1, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0, 146 147 0, 0, 0, 0, 0, 0, 0, 0, 148 0, 0, 0, 0, 0, 0, 0, 0, 149 0, 0, 0, 0, 0, 0, 0, 0, 150 0, 0, 0, 0, 0, 0, 0, 0, 151 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0 155}; 156 157/* Same for linebreaks */ 158static unsigned char ascii_linebreak[] = { 159 0, 0, 0, 0, 0, 0, 0, 0, 160// 0x000A, /* LINE FEED */ 161// 0x000D, /* CARRIAGE RETURN */ 162 0, 0, 1, 0, 0, 1, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164// 0x001C, /* FILE SEPARATOR */ 165// 0x001D, /* GROUP SEPARATOR */ 166// 0x001E, /* RECORD SEPARATOR */ 167 0, 0, 0, 0, 1, 1, 1, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0 181}; 182 183 184Py_UNICODE 185PyUnicode_GetMax(void) 186{ 187#ifdef Py_UNICODE_WIDE 188 return 0x10FFFF; 189#else 190 /* This is actually an illegal character, so it should 191 not be passed to unichr. */ 192 return 0xFFFF; 193#endif 194} 195 196/* --- Bloom Filters ----------------------------------------------------- */ 197 198/* stuff to implement simple "bloom filters" for Unicode characters. 199 to keep things simple, we use a single bitmask, using the least 5 200 bits from each unicode characters as the bit index. */ 201 202/* the linebreak mask is set up by Unicode_Init below */ 203 204#define BLOOM_MASK unsigned long 205 206static BLOOM_MASK bloom_linebreak; 207 208#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 209 210#define BLOOM_LINEBREAK(ch) \ 211 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 213 214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 215{ 216 /* calculate simple bloom-style bitmask for a given unicode string */ 217 218 long mask; 219 Py_ssize_t i; 220 221 mask = 0; 222 for (i = 0; i < len; i++) 223 mask |= (1 << (ptr[i] & 0x1F)); 224 225 return mask; 226} 227 228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 229{ 230 Py_ssize_t i; 231 232 for (i = 0; i < setlen; i++) 233 if (set[i] == chr) 234 return 1; 235 236 return 0; 237} 238 239#define BLOOM_MEMBER(mask, chr, set, setlen)\ 240 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 241 242/* --- Unicode Object ----------------------------------------------------- */ 243 244static 245int unicode_resize(register PyUnicodeObject *unicode, 246 Py_ssize_t length) 247{ 248 void *oldstr; 249 250 /* Shortcut if there's nothing much to do. */ 251 if (unicode->length == length) 252 goto reset; 253 254 /* Resizing shared object (unicode_empty or single character 255 objects) in-place is not allowed. Use PyUnicode_Resize() 256 instead ! */ 257 258 if (unicode == unicode_empty || 259 (unicode->length == 1 && 260 unicode->str[0] < 256U && 261 unicode_latin1[unicode->str[0]] == unicode)) { 262 PyErr_SetString(PyExc_SystemError, 263 "can't resize shared unicode objects"); 264 return -1; 265 } 266 267 /* We allocate one more byte to make sure the string is Ux0000 terminated. 268 The overallocation is also used by fastsearch, which assumes that it's 269 safe to look at str[length] (without making any assumptions about what 270 it contains). */ 271 272 oldstr = unicode->str; 273 unicode->str = PyObject_REALLOC(unicode->str, 274 sizeof(Py_UNICODE) * (length + 1)); 275 if (!unicode->str) { 276 unicode->str = (Py_UNICODE *)oldstr; 277 PyErr_NoMemory(); 278 return -1; 279 } 280 unicode->str[length] = 0; 281 unicode->length = length; 282 283 reset: 284 /* Reset the object caches */ 285 if (unicode->defenc) { 286 Py_DECREF(unicode->defenc); 287 unicode->defenc = NULL; 288 } 289 unicode->hash = -1; 290 291 return 0; 292} 293 294/* We allocate one more byte to make sure the string is 295 Ux0000 terminated; some code (e.g. new_identifier) 296 relies on that. 297 298 XXX This allocator could further be enhanced by assuring that the 299 free list never reduces its size below 1. 300 301*/ 302 303static 304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 305{ 306 register PyUnicodeObject *unicode; 307 308 /* Optimization for empty strings */ 309 if (length == 0 && unicode_empty != NULL) { 310 Py_INCREF(unicode_empty); 311 return unicode_empty; 312 } 313 314 /* Unicode freelist & memory allocation */ 315 if (free_list) { 316 unicode = free_list; 317 free_list = *(PyUnicodeObject **)unicode; 318 numfree--; 319 if (unicode->str) { 320 /* Keep-Alive optimization: we only upsize the buffer, 321 never downsize it. */ 322 if ((unicode->length < length) && 323 unicode_resize(unicode, length) < 0) { 324 PyObject_DEL(unicode->str); 325 goto onError; 326 } 327 } 328 else { 329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 331 } 332 PyObject_INIT(unicode, &PyUnicode_Type); 333 } 334 else { 335 size_t new_size; 336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 337 if (unicode == NULL) 338 return NULL; 339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 341 } 342 343 if (!unicode->str) { 344 PyErr_NoMemory(); 345 goto onError; 346 } 347 /* Initialize the first element to guard against cases where 348 * the caller fails before initializing str -- unicode_resize() 349 * reads str[0], and the Keep-Alive optimization can keep memory 350 * allocated for str alive across a call to unicode_dealloc(unicode). 351 * We don't want unicode_resize to read uninitialized memory in 352 * that case. 353 */ 354 unicode->str[0] = 0; 355 unicode->str[length] = 0; 356 unicode->length = length; 357 unicode->hash = -1; 358 unicode->state = 0; 359 unicode->defenc = NULL; 360 return unicode; 361 362 onError: 363 _Py_ForgetReference((PyObject *)unicode); 364 PyObject_Del(unicode); 365 return NULL; 366} 367 368static 369void unicode_dealloc(register PyUnicodeObject *unicode) 370{ 371 switch (PyUnicode_CHECK_INTERNED(unicode)) { 372 case SSTATE_NOT_INTERNED: 373 break; 374 375 case SSTATE_INTERNED_MORTAL: 376 /* revive dead object temporarily for DelItem */ 377 Py_REFCNT(unicode) = 3; 378 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 379 Py_FatalError( 380 "deletion of interned unicode string failed"); 381 break; 382 383 case SSTATE_INTERNED_IMMORTAL: 384 Py_FatalError("Immortal interned unicode string died."); 385 386 default: 387 Py_FatalError("Inconsistent interned unicode string state."); 388 } 389 390 if (PyUnicode_CheckExact(unicode) && 391 numfree < PyUnicode_MAXFREELIST) { 392 /* Keep-Alive optimization */ 393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 394 PyObject_DEL(unicode->str); 395 unicode->str = NULL; 396 unicode->length = 0; 397 } 398 if (unicode->defenc) { 399 Py_DECREF(unicode->defenc); 400 unicode->defenc = NULL; 401 } 402 /* Add to free list */ 403 *(PyUnicodeObject **)unicode = free_list; 404 free_list = unicode; 405 numfree++; 406 } 407 else { 408 PyObject_DEL(unicode->str); 409 Py_XDECREF(unicode->defenc); 410 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 411 } 412} 413 414int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 415{ 416 register PyUnicodeObject *v; 417 418 /* Argument checks */ 419 if (unicode == NULL) { 420 PyErr_BadInternalCall(); 421 return -1; 422 } 423 v = (PyUnicodeObject *)*unicode; 424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 425 PyErr_BadInternalCall(); 426 return -1; 427 } 428 429 /* Resizing unicode_empty and single character objects is not 430 possible since these are being shared. We simply return a fresh 431 copy with the same Unicode content. */ 432 if (v->length != length && 433 (v == unicode_empty || v->length == 1)) { 434 PyUnicodeObject *w = _PyUnicode_New(length); 435 if (w == NULL) 436 return -1; 437 Py_UNICODE_COPY(w->str, v->str, 438 length < v->length ? length : v->length); 439 Py_DECREF(*unicode); 440 *unicode = (PyObject *)w; 441 return 0; 442 } 443 444 /* Note that we don't have to modify *unicode for unshared Unicode 445 objects, since we can modify them in-place. */ 446 return unicode_resize(v, length); 447} 448 449/* Internal API for use in unicodeobject.c only ! */ 450#define _PyUnicode_Resize(unicodevar, length) \ 451 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 452 453PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 454 Py_ssize_t size) 455{ 456 PyUnicodeObject *unicode; 457 458 /* If the Unicode data is known at construction time, we can apply 459 some optimizations which share commonly used objects. */ 460 if (u != NULL) { 461 462 /* Optimization for empty strings */ 463 if (size == 0 && unicode_empty != NULL) { 464 Py_INCREF(unicode_empty); 465 return (PyObject *)unicode_empty; 466 } 467 468 /* Single character Unicode objects in the Latin-1 range are 469 shared when using this constructor */ 470 if (size == 1 && *u < 256) { 471 unicode = unicode_latin1[*u]; 472 if (!unicode) { 473 unicode = _PyUnicode_New(1); 474 if (!unicode) 475 return NULL; 476 unicode->str[0] = *u; 477 unicode_latin1[*u] = unicode; 478 } 479 Py_INCREF(unicode); 480 return (PyObject *)unicode; 481 } 482 } 483 484 unicode = _PyUnicode_New(size); 485 if (!unicode) 486 return NULL; 487 488 /* Copy the Unicode data into the new object */ 489 if (u != NULL) 490 Py_UNICODE_COPY(unicode->str, u, size); 491 492 return (PyObject *)unicode; 493} 494 495PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 496{ 497 PyUnicodeObject *unicode; 498 499 if (size < 0) { 500 PyErr_SetString(PyExc_SystemError, 501 "Negative size passed to PyUnicode_FromStringAndSize"); 502 return NULL; 503 } 504 505 /* If the Unicode data is known at construction time, we can apply 506 some optimizations which share commonly used objects. 507 Also, this means the input must be UTF-8, so fall back to the 508 UTF-8 decoder at the end. */ 509 if (u != NULL) { 510 511 /* Optimization for empty strings */ 512 if (size == 0 && unicode_empty != NULL) { 513 Py_INCREF(unicode_empty); 514 return (PyObject *)unicode_empty; 515 } 516 517 /* Single characters are shared when using this constructor. 518 Restrict to ASCII, since the input must be UTF-8. */ 519 if (size == 1 && Py_CHARMASK(*u) < 128) { 520 unicode = unicode_latin1[Py_CHARMASK(*u)]; 521 if (!unicode) { 522 unicode = _PyUnicode_New(1); 523 if (!unicode) 524 return NULL; 525 unicode->str[0] = Py_CHARMASK(*u); 526 unicode_latin1[Py_CHARMASK(*u)] = unicode; 527 } 528 Py_INCREF(unicode); 529 return (PyObject *)unicode; 530 } 531 532 return PyUnicode_DecodeUTF8(u, size, NULL); 533 } 534 535 unicode = _PyUnicode_New(size); 536 if (!unicode) 537 return NULL; 538 539 return (PyObject *)unicode; 540} 541 542PyObject *PyUnicode_FromString(const char *u) 543{ 544 size_t size = strlen(u); 545 if (size > PY_SSIZE_T_MAX) { 546 PyErr_SetString(PyExc_OverflowError, "input too long"); 547 return NULL; 548 } 549 550 return PyUnicode_FromStringAndSize(u, size); 551} 552 553#ifdef HAVE_WCHAR_H 554 555PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 556 Py_ssize_t size) 557{ 558 PyUnicodeObject *unicode; 559 560 if (w == NULL) { 561 if (size == 0) 562 return PyUnicode_FromStringAndSize(NULL, 0); 563 PyErr_BadInternalCall(); 564 return NULL; 565 } 566 567 if (size == -1) { 568 size = wcslen(w); 569 } 570 571 unicode = _PyUnicode_New(size); 572 if (!unicode) 573 return NULL; 574 575 /* Copy the wchar_t data into the new object */ 576#ifdef HAVE_USABLE_WCHAR_T 577 memcpy(unicode->str, w, size * sizeof(wchar_t)); 578#else 579 { 580 register Py_UNICODE *u; 581 register Py_ssize_t i; 582 u = PyUnicode_AS_UNICODE(unicode); 583 for (i = size; i > 0; i--) 584 *u++ = *w++; 585 } 586#endif 587 588 return (PyObject *)unicode; 589} 590 591static void 592makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 593{ 594 *fmt++ = '%'; 595 if (width) { 596 if (zeropad) 597 *fmt++ = '0'; 598 fmt += sprintf(fmt, "%d", width); 599 } 600 if (precision) 601 fmt += sprintf(fmt, ".%d", precision); 602 if (longflag) 603 *fmt++ = 'l'; 604 else if (size_tflag) { 605 char *f = PY_FORMAT_SIZE_T; 606 while (*f) 607 *fmt++ = *f++; 608 } 609 *fmt++ = c; 610 *fmt = '\0'; 611} 612 613#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 614 615PyObject * 616PyUnicode_FromFormatV(const char *format, va_list vargs) 617{ 618 va_list count; 619 Py_ssize_t callcount = 0; 620 PyObject **callresults = NULL; 621 PyObject **callresult = NULL; 622 Py_ssize_t n = 0; 623 int width = 0; 624 int precision = 0; 625 int zeropad; 626 const char* f; 627 Py_UNICODE *s; 628 PyObject *string; 629 /* used by sprintf */ 630 char buffer[21]; 631 /* use abuffer instead of buffer, if we need more space 632 * (which can happen if there's a format specifier with width). */ 633 char *abuffer = NULL; 634 char *realbuffer; 635 Py_ssize_t abuffersize = 0; 636 char fmt[60]; /* should be enough for %0width.precisionld */ 637 const char *copy; 638 639#ifdef VA_LIST_IS_ARRAY 640 Py_MEMCPY(count, vargs, sizeof(va_list)); 641#else 642#ifdef __va_copy 643 __va_copy(count, vargs); 644#else 645 count = vargs; 646#endif 647#endif 648 /* step 1: count the number of %S/%R format specifications 649 * (we call PyObject_Str()/PyObject_Repr() for these objects 650 * once during step 3 and put the result in an array) */ 651 for (f = format; *f; f++) { 652 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) 653 ++callcount; 654 } 655 /* step 2: allocate memory for the results of 656 * PyObject_Str()/PyObject_Repr() calls */ 657 if (callcount) { 658 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 659 if (!callresults) { 660 PyErr_NoMemory(); 661 return NULL; 662 } 663 callresult = callresults; 664 } 665 /* step 3: figure out how large a buffer we need */ 666 for (f = format; *f; f++) { 667 if (*f == '%') { 668 const char* p = f; 669 width = 0; 670 while (ISDIGIT((unsigned)*f)) 671 width = (width*10) + *f++ - '0'; 672 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 673 ; 674 675 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 676 * they don't affect the amount of space we reserve. 677 */ 678 if ((*f == 'l' || *f == 'z') && 679 (f[1] == 'd' || f[1] == 'u')) 680 ++f; 681 682 switch (*f) { 683 case 'c': 684 (void)va_arg(count, int); 685 /* fall through... */ 686 case '%': 687 n++; 688 break; 689 case 'd': case 'u': case 'i': case 'x': 690 (void) va_arg(count, int); 691 /* 20 bytes is enough to hold a 64-bit 692 integer. Decimal takes the most space. 693 This isn't enough for octal. 694 If a width is specified we need more 695 (which we allocate later). */ 696 if (width < 20) 697 width = 20; 698 n += width; 699 if (abuffersize < width) 700 abuffersize = width; 701 break; 702 case 's': 703 { 704 /* UTF-8 */ 705 unsigned char*s; 706 s = va_arg(count, unsigned char*); 707 while (*s) { 708 if (*s < 128) { 709 n++; s++; 710 } else if (*s < 0xc0) { 711 /* invalid UTF-8 */ 712 n++; s++; 713 } else if (*s < 0xc0) { 714 n++; 715 s++; if(!*s)break; 716 s++; 717 } else if (*s < 0xe0) { 718 n++; 719 s++; if(!*s)break; 720 s++; if(!*s)break; 721 s++; 722 } else { 723 #ifdef Py_UNICODE_WIDE 724 n++; 725 #else 726 n+=2; 727 #endif 728 s++; if(!*s)break; 729 s++; if(!*s)break; 730 s++; if(!*s)break; 731 s++; 732 } 733 } 734 break; 735 } 736 case 'U': 737 { 738 PyObject *obj = va_arg(count, PyObject *); 739 assert(obj && PyUnicode_Check(obj)); 740 n += PyUnicode_GET_SIZE(obj); 741 break; 742 } 743 case 'V': 744 { 745 PyObject *obj = va_arg(count, PyObject *); 746 const char *str = va_arg(count, const char *); 747 assert(obj || str); 748 assert(!obj || PyUnicode_Check(obj)); 749 if (obj) 750 n += PyUnicode_GET_SIZE(obj); 751 else 752 n += strlen(str); 753 break; 754 } 755 case 'S': 756 { 757 PyObject *obj = va_arg(count, PyObject *); 758 PyObject *str; 759 assert(obj); 760 str = PyObject_Str(obj); 761 if (!str) 762 goto fail; 763 n += PyUnicode_GET_SIZE(str); 764 /* Remember the str and switch to the next slot */ 765 *callresult++ = str; 766 break; 767 } 768 case 'R': 769 { 770 PyObject *obj = va_arg(count, PyObject *); 771 PyObject *repr; 772 assert(obj); 773 repr = PyObject_Repr(obj); 774 if (!repr) 775 goto fail; 776 n += PyUnicode_GET_SIZE(repr); 777 /* Remember the repr and switch to the next slot */ 778 *callresult++ = repr; 779 break; 780 } 781 case 'p': 782 (void) va_arg(count, int); 783 /* maximum 64-bit pointer representation: 784 * 0xffffffffffffffff 785 * so 19 characters is enough. 786 * XXX I count 18 -- what's the extra for? 787 */ 788 n += 19; 789 break; 790 default: 791 /* if we stumble upon an unknown 792 formatting code, copy the rest of 793 the format string to the output 794 string. (we cannot just skip the 795 code, since there's no way to know 796 what's in the argument list) */ 797 n += strlen(p); 798 goto expand; 799 } 800 } else 801 n++; 802 } 803 expand: 804 if (abuffersize > 20) { 805 abuffer = PyObject_Malloc(abuffersize); 806 if (!abuffer) { 807 PyErr_NoMemory(); 808 goto fail; 809 } 810 realbuffer = abuffer; 811 } 812 else 813 realbuffer = buffer; 814 /* step 4: fill the buffer */ 815 /* Since we've analyzed how much space we need for the worst case, 816 we don't have to resize the string. 817 There can be no errors beyond this point. */ 818 string = PyUnicode_FromUnicode(NULL, n); 819 if (!string) 820 goto fail; 821 822 s = PyUnicode_AS_UNICODE(string); 823 callresult = callresults; 824 825 for (f = format; *f; f++) { 826 if (*f == '%') { 827 const char* p = f++; 828 int longflag = 0; 829 int size_tflag = 0; 830 zeropad = (*f == '0'); 831 /* parse the width.precision part */ 832 width = 0; 833 while (ISDIGIT((unsigned)*f)) 834 width = (width*10) + *f++ - '0'; 835 precision = 0; 836 if (*f == '.') { 837 f++; 838 while (ISDIGIT((unsigned)*f)) 839 precision = (precision*10) + *f++ - '0'; 840 } 841 /* handle the long flag, but only for %ld and %lu. 842 others can be added when necessary. */ 843 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 844 longflag = 1; 845 ++f; 846 } 847 /* handle the size_t flag. */ 848 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 849 size_tflag = 1; 850 ++f; 851 } 852 853 switch (*f) { 854 case 'c': 855 *s++ = va_arg(vargs, int); 856 break; 857 case 'd': 858 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 859 if (longflag) 860 sprintf(realbuffer, fmt, va_arg(vargs, long)); 861 else if (size_tflag) 862 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 863 else 864 sprintf(realbuffer, fmt, va_arg(vargs, int)); 865 appendstring(realbuffer); 866 break; 867 case 'u': 868 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 869 if (longflag) 870 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 871 else if (size_tflag) 872 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 873 else 874 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 875 appendstring(realbuffer); 876 break; 877 case 'i': 878 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 879 sprintf(realbuffer, fmt, va_arg(vargs, int)); 880 appendstring(realbuffer); 881 break; 882 case 'x': 883 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 884 sprintf(realbuffer, fmt, va_arg(vargs, int)); 885 appendstring(realbuffer); 886 break; 887 case 's': 888 { 889 /* Parameter must be UTF-8 encoded. 890 In case of encoding errors, use 891 the replacement character. */ 892 PyObject *u; 893 p = va_arg(vargs, char*); 894 u = PyUnicode_DecodeUTF8(p, strlen(p), 895 "replace"); 896 if (!u) 897 goto fail; 898 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), 899 PyUnicode_GET_SIZE(u)); 900 s += PyUnicode_GET_SIZE(u); 901 Py_DECREF(u); 902 break; 903 } 904 case 'U': 905 { 906 PyObject *obj = va_arg(vargs, PyObject *); 907 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 909 s += size; 910 break; 911 } 912 case 'V': 913 { 914 PyObject *obj = va_arg(vargs, PyObject *); 915 const char *str = va_arg(vargs, const char *); 916 if (obj) { 917 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 918 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 919 s += size; 920 } else { 921 appendstring(str); 922 } 923 break; 924 } 925 case 'S': 926 case 'R': 927 { 928 Py_UNICODE *ucopy; 929 Py_ssize_t usize; 930 Py_ssize_t upos; 931 /* unused, since we already have the result */ 932 (void) va_arg(vargs, PyObject *); 933 ucopy = PyUnicode_AS_UNICODE(*callresult); 934 usize = PyUnicode_GET_SIZE(*callresult); 935 for (upos = 0; upos<usize;) 936 *s++ = ucopy[upos++]; 937 /* We're done with the unicode()/repr() => forget it */ 938 Py_DECREF(*callresult); 939 /* switch to next unicode()/repr() result */ 940 ++callresult; 941 break; 942 } 943 case 'p': 944 sprintf(buffer, "%p", va_arg(vargs, void*)); 945 /* %p is ill-defined: ensure leading 0x. */ 946 if (buffer[1] == 'X') 947 buffer[1] = 'x'; 948 else if (buffer[1] != 'x') { 949 memmove(buffer+2, buffer, strlen(buffer)+1); 950 buffer[0] = '0'; 951 buffer[1] = 'x'; 952 } 953 appendstring(buffer); 954 break; 955 case '%': 956 *s++ = '%'; 957 break; 958 default: 959 appendstring(p); 960 goto end; 961 } 962 } else 963 *s++ = *f; 964 } 965 966 end: 967 if (callresults) 968 PyObject_Free(callresults); 969 if (abuffer) 970 PyObject_Free(abuffer); 971 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 972 return string; 973 fail: 974 if (callresults) { 975 PyObject **callresult2 = callresults; 976 while (callresult2 < callresult) { 977 Py_DECREF(*callresult2); 978 ++callresult2; 979 } 980 PyObject_Free(callresults); 981 } 982 if (abuffer) 983 PyObject_Free(abuffer); 984 return NULL; 985} 986 987#undef appendstring 988 989PyObject * 990PyUnicode_FromFormat(const char *format, ...) 991{ 992 PyObject* ret; 993 va_list vargs; 994 995#ifdef HAVE_STDARG_PROTOTYPES 996 va_start(vargs, format); 997#else 998 va_start(vargs); 999#endif 1000 ret = PyUnicode_FromFormatV(format, vargs); 1001 va_end(vargs); 1002 return ret; 1003} 1004 1005Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1006 wchar_t *w, 1007 Py_ssize_t size) 1008{ 1009 if (unicode == NULL) { 1010 PyErr_BadInternalCall(); 1011 return -1; 1012 } 1013 1014 /* If possible, try to copy the 0-termination as well */ 1015 if (size > PyUnicode_GET_SIZE(unicode)) 1016 size = PyUnicode_GET_SIZE(unicode) + 1; 1017 1018#ifdef HAVE_USABLE_WCHAR_T 1019 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1020#else 1021 { 1022 register Py_UNICODE *u; 1023 register Py_ssize_t i; 1024 u = PyUnicode_AS_UNICODE(unicode); 1025 for (i = size; i > 0; i--) 1026 *w++ = *u++; 1027 } 1028#endif 1029 1030 if (size > PyUnicode_GET_SIZE(unicode)) 1031 return PyUnicode_GET_SIZE(unicode); 1032 else 1033 return size; 1034} 1035 1036#endif 1037 1038PyObject *PyUnicode_FromOrdinal(int ordinal) 1039{ 1040 Py_UNICODE s[2]; 1041 1042 if (ordinal < 0 || ordinal > 0x10ffff) { 1043 PyErr_SetString(PyExc_ValueError, 1044 "chr() arg not in range(0x110000)"); 1045 return NULL; 1046 } 1047 1048#ifndef Py_UNICODE_WIDE 1049 if (ordinal > 0xffff) { 1050 ordinal -= 0x10000; 1051 s[0] = 0xD800 | (ordinal >> 10); 1052 s[1] = 0xDC00 | (ordinal & 0x3FF); 1053 return PyUnicode_FromUnicode(s, 2); 1054 } 1055#endif 1056 1057 s[0] = (Py_UNICODE)ordinal; 1058 return PyUnicode_FromUnicode(s, 1); 1059} 1060 1061PyObject *PyUnicode_FromObject(register PyObject *obj) 1062{ 1063 /* XXX Perhaps we should make this API an alias of 1064 PyObject_Str() instead ?! */ 1065 if (PyUnicode_CheckExact(obj)) { 1066 Py_INCREF(obj); 1067 return obj; 1068 } 1069 if (PyUnicode_Check(obj)) { 1070 /* For a Unicode subtype that's not a Unicode object, 1071 return a true Unicode object with the same data. */ 1072 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1073 PyUnicode_GET_SIZE(obj)); 1074 } 1075 PyErr_Format(PyExc_TypeError, 1076 "Can't convert '%.100s' object to str implicitly", 1077 Py_TYPE(obj)->tp_name); 1078 return NULL; 1079} 1080 1081PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1082 const char *encoding, 1083 const char *errors) 1084{ 1085 const char *s = NULL; 1086 Py_ssize_t len; 1087 PyObject *v; 1088 1089 if (obj == NULL) { 1090 PyErr_BadInternalCall(); 1091 return NULL; 1092 } 1093 1094 if (PyUnicode_Check(obj)) { 1095 PyErr_SetString(PyExc_TypeError, 1096 "decoding Unicode is not supported"); 1097 return NULL; 1098 } 1099 1100 /* Coerce object */ 1101 if (PyBytes_Check(obj)) { 1102 s = PyBytes_AS_STRING(obj); 1103 len = PyBytes_GET_SIZE(obj); 1104 } 1105 else if (PyByteArray_Check(obj)) { 1106 s = PyByteArray_AS_STRING(obj); 1107 len = PyByteArray_GET_SIZE(obj); 1108 } 1109 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1110 /* Overwrite the error message with something more useful in 1111 case of a TypeError. */ 1112 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1113 PyErr_Format(PyExc_TypeError, 1114 "coercing to Unicode: need string or buffer, " 1115 "%.80s found", 1116 Py_TYPE(obj)->tp_name); 1117 goto onError; 1118 } 1119 1120 /* Convert to Unicode */ 1121 if (len == 0) { 1122 Py_INCREF(unicode_empty); 1123 v = (PyObject *)unicode_empty; 1124 } 1125 else 1126 v = PyUnicode_Decode(s, len, encoding, errors); 1127 1128 return v; 1129 1130 onError: 1131 return NULL; 1132} 1133 1134PyObject *PyUnicode_Decode(const char *s, 1135 Py_ssize_t size, 1136 const char *encoding, 1137 const char *errors) 1138{ 1139 PyObject *buffer = NULL, *unicode; 1140 Py_buffer info; 1141 char lower[20]; /* Enough for any encoding name we recognize */ 1142 char *l; 1143 const char *e; 1144 1145 if (encoding == NULL) 1146 encoding = PyUnicode_GetDefaultEncoding(); 1147 1148 /* Convert encoding to lower case and replace '_' with '-' in order to 1149 catch e.g. UTF_8 */ 1150 e = encoding; 1151 l = lower; 1152 while (*e && l < &lower[(sizeof lower) - 2]) { 1153 if (ISUPPER(*e)) { 1154 *l++ = TOLOWER(*e++); 1155 } 1156 else if (*e == '_') { 1157 *l++ = '-'; 1158 e++; 1159 } 1160 else { 1161 *l++ = *e++; 1162 } 1163 } 1164 *l = '\0'; 1165 1166 /* Shortcuts for common default encodings */ 1167 if (strcmp(lower, "utf-8") == 0) 1168 return PyUnicode_DecodeUTF8(s, size, errors); 1169 else if ((strcmp(lower, "latin-1") == 0) || 1170 (strcmp(lower, "iso-8859-1") == 0)) 1171 return PyUnicode_DecodeLatin1(s, size, errors); 1172#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1173 else if (strcmp(lower, "mbcs") == 0) 1174 return PyUnicode_DecodeMBCS(s, size, errors); 1175#endif 1176 else if (strcmp(lower, "ascii") == 0) 1177 return PyUnicode_DecodeASCII(s, size, errors); 1178 else if (strcmp(lower, "utf-16") == 0) 1179 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1180 else if (strcmp(lower, "utf-32") == 0) 1181 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1182 1183 /* Decode via the codec registry */ 1184 buffer = NULL; 1185 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0) 1186 goto onError; 1187 buffer = PyMemoryView_FromMemory(&info); 1188 if (buffer == NULL) 1189 goto onError; 1190 unicode = PyCodec_Decode(buffer, encoding, errors); 1191 if (unicode == NULL) 1192 goto onError; 1193 if (!PyUnicode_Check(unicode)) { 1194 PyErr_Format(PyExc_TypeError, 1195 "decoder did not return a unicode object (type=%.400s)", 1196 Py_TYPE(unicode)->tp_name); 1197 Py_DECREF(unicode); 1198 goto onError; 1199 } 1200 Py_DECREF(buffer); 1201 return unicode; 1202 1203 onError: 1204 Py_XDECREF(buffer); 1205 return NULL; 1206} 1207 1208PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1209 const char *encoding, 1210 const char *errors) 1211{ 1212 PyObject *v; 1213 1214 if (!PyUnicode_Check(unicode)) { 1215 PyErr_BadArgument(); 1216 goto onError; 1217 } 1218 1219 if (encoding == NULL) 1220 encoding = PyUnicode_GetDefaultEncoding(); 1221 1222 /* Decode via the codec registry */ 1223 v = PyCodec_Decode(unicode, encoding, errors); 1224 if (v == NULL) 1225 goto onError; 1226 return v; 1227 1228 onError: 1229 return NULL; 1230} 1231 1232PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1233 const char *encoding, 1234 const char *errors) 1235{ 1236 PyObject *v; 1237 1238 if (!PyUnicode_Check(unicode)) { 1239 PyErr_BadArgument(); 1240 goto onError; 1241 } 1242 1243 if (encoding == NULL) 1244 encoding = PyUnicode_GetDefaultEncoding(); 1245 1246 /* Decode via the codec registry */ 1247 v = PyCodec_Decode(unicode, encoding, errors); 1248 if (v == NULL) 1249 goto onError; 1250 if (!PyUnicode_Check(v)) { 1251 PyErr_Format(PyExc_TypeError, 1252 "decoder did not return a unicode object (type=%.400s)", 1253 Py_TYPE(v)->tp_name); 1254 Py_DECREF(v); 1255 goto onError; 1256 } 1257 return v; 1258 1259 onError: 1260 return NULL; 1261} 1262 1263PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1264 Py_ssize_t size, 1265 const char *encoding, 1266 const char *errors) 1267{ 1268 PyObject *v, *unicode; 1269 1270 unicode = PyUnicode_FromUnicode(s, size); 1271 if (unicode == NULL) 1272 return NULL; 1273 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1274 Py_DECREF(unicode); 1275 return v; 1276} 1277 1278PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1279 const char *encoding, 1280 const char *errors) 1281{ 1282 PyObject *v; 1283 1284 if (!PyUnicode_Check(unicode)) { 1285 PyErr_BadArgument(); 1286 goto onError; 1287 } 1288 1289 if (encoding == NULL) 1290 encoding = PyUnicode_GetDefaultEncoding(); 1291 1292 /* Encode via the codec registry */ 1293 v = PyCodec_Encode(unicode, encoding, errors); 1294 if (v == NULL) 1295 goto onError; 1296 return v; 1297 1298 onError: 1299 return NULL; 1300} 1301 1302PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1303 const char *encoding, 1304 const char *errors) 1305{ 1306 PyObject *v; 1307 1308 if (!PyUnicode_Check(unicode)) { 1309 PyErr_BadArgument(); 1310 goto onError; 1311 } 1312 1313 if (encoding == NULL) 1314 encoding = PyUnicode_GetDefaultEncoding(); 1315 1316 /* Shortcuts for common default encodings */ 1317 if (errors == NULL) { 1318 if (strcmp(encoding, "utf-8") == 0) 1319 return PyUnicode_AsUTF8String(unicode); 1320 else if (strcmp(encoding, "latin-1") == 0) 1321 return PyUnicode_AsLatin1String(unicode); 1322#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1323 else if (strcmp(encoding, "mbcs") == 0) 1324 return PyUnicode_AsMBCSString(unicode); 1325#endif 1326 else if (strcmp(encoding, "ascii") == 0) 1327 return PyUnicode_AsASCIIString(unicode); 1328 } 1329 1330 /* Encode via the codec registry */ 1331 v = PyCodec_Encode(unicode, encoding, errors); 1332 if (v == NULL) 1333 goto onError; 1334 if (PyByteArray_Check(v)) { 1335 char msg[100]; 1336 PyOS_snprintf(msg, sizeof(msg), 1337 "encoder %s returned buffer instead of bytes", 1338 encoding); 1339 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { 1340 v = NULL; 1341 goto onError; 1342 } 1343 v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1344 } 1345 else if (!PyBytes_Check(v)) { 1346 PyErr_Format(PyExc_TypeError, 1347 "encoder did not return a bytes object (type=%.400s)", 1348 Py_TYPE(v)->tp_name); 1349 v = NULL; 1350 } 1351 return v; 1352 1353 onError: 1354 return NULL; 1355} 1356 1357PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1358 const char *encoding, 1359 const char *errors) 1360{ 1361 PyObject *v; 1362 1363 if (!PyUnicode_Check(unicode)) { 1364 PyErr_BadArgument(); 1365 goto onError; 1366 } 1367 1368 if (encoding == NULL) 1369 encoding = PyUnicode_GetDefaultEncoding(); 1370 1371 /* Encode via the codec registry */ 1372 v = PyCodec_Encode(unicode, encoding, errors); 1373 if (v == NULL) 1374 goto onError; 1375 if (!PyUnicode_Check(v)) { 1376 PyErr_Format(PyExc_TypeError, 1377 "encoder did not return an unicode object (type=%.400s)", 1378 Py_TYPE(v)->tp_name); 1379 Py_DECREF(v); 1380 goto onError; 1381 } 1382 return v; 1383 1384 onError: 1385 return NULL; 1386} 1387 1388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1389 const char *errors) 1390{ 1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1392 if (v) 1393 return v; 1394 if (errors != NULL) 1395 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1396 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1397 PyUnicode_GET_SIZE(unicode), 1398 NULL); 1399 if (!v) 1400 return NULL; 1401 ((PyUnicodeObject *)unicode)->defenc = v; 1402 return v; 1403} 1404 1405PyObject* 1406PyUnicode_DecodeFSDefault(const char *s) { 1407 Py_ssize_t size = (Py_ssize_t)strlen(s); 1408 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1409} 1410 1411PyObject* 1412PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1413{ 1414 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1415 can be undefined. If it is case, decode using UTF-8. The following assumes 1416 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1417 bootstrapping process where the codecs aren't ready yet. 1418 */ 1419 if (Py_FileSystemDefaultEncoding) { 1420#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1421 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1422 return PyUnicode_DecodeMBCS(s, size, "replace"); 1423 } 1424#elif defined(__APPLE__) 1425 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1426 return PyUnicode_DecodeUTF8(s, size, "replace"); 1427 } 1428#endif 1429 return PyUnicode_Decode(s, size, 1430 Py_FileSystemDefaultEncoding, 1431 "replace"); 1432 } 1433 else { 1434 return PyUnicode_DecodeUTF8(s, size, "replace"); 1435 } 1436} 1437 1438char* 1439PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1440{ 1441 PyObject *bytes; 1442 if (!PyUnicode_Check(unicode)) { 1443 PyErr_BadArgument(); 1444 return NULL; 1445 } 1446 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1447 if (bytes == NULL) 1448 return NULL; 1449 if (psize != NULL) 1450 *psize = PyBytes_GET_SIZE(bytes); 1451 return PyBytes_AS_STRING(bytes); 1452} 1453 1454char* 1455PyUnicode_AsString(PyObject *unicode) 1456{ 1457 return PyUnicode_AsStringAndSize(unicode, NULL); 1458} 1459 1460Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1461{ 1462 if (!PyUnicode_Check(unicode)) { 1463 PyErr_BadArgument(); 1464 goto onError; 1465 } 1466 return PyUnicode_AS_UNICODE(unicode); 1467 1468 onError: 1469 return NULL; 1470} 1471 1472Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1473{ 1474 if (!PyUnicode_Check(unicode)) { 1475 PyErr_BadArgument(); 1476 goto onError; 1477 } 1478 return PyUnicode_GET_SIZE(unicode); 1479 1480 onError: 1481 return -1; 1482} 1483 1484const char *PyUnicode_GetDefaultEncoding(void) 1485{ 1486 return unicode_default_encoding; 1487} 1488 1489int PyUnicode_SetDefaultEncoding(const char *encoding) 1490{ 1491 if (strcmp(encoding, unicode_default_encoding) != 0) { 1492 PyErr_Format(PyExc_ValueError, 1493 "Can only set default encoding to %s", 1494 unicode_default_encoding); 1495 return -1; 1496 } 1497 return 0; 1498} 1499 1500/* error handling callback helper: 1501 build arguments, call the callback and check the arguments, 1502 if no exception occurred, copy the replacement to the output 1503 and adjust various state variables. 1504 return 0 on success, -1 on error 1505*/ 1506 1507static 1508int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1509 const char *encoding, const char *reason, 1510 const char **input, const char **inend, Py_ssize_t *startinpos, 1511 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1512 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1513{ 1514 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1515 1516 PyObject *restuple = NULL; 1517 PyObject *repunicode = NULL; 1518 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1519 Py_ssize_t insize; 1520 Py_ssize_t requiredsize; 1521 Py_ssize_t newpos; 1522 Py_UNICODE *repptr; 1523 PyObject *inputobj = NULL; 1524 Py_ssize_t repsize; 1525 int res = -1; 1526 1527 if (*errorHandler == NULL) { 1528 *errorHandler = PyCodec_LookupError(errors); 1529 if (*errorHandler == NULL) 1530 goto onError; 1531 } 1532 1533 if (*exceptionObject == NULL) { 1534 *exceptionObject = PyUnicodeDecodeError_Create( 1535 encoding, *input, *inend-*input, *startinpos, *endinpos, reason); 1536 if (*exceptionObject == NULL) 1537 goto onError; 1538 } 1539 else { 1540 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1541 goto onError; 1542 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1543 goto onError; 1544 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1545 goto onError; 1546 } 1547 1548 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1549 if (restuple == NULL) 1550 goto onError; 1551 if (!PyTuple_Check(restuple)) { 1552 PyErr_Format(PyExc_TypeError, &argparse[4]); 1553 goto onError; 1554 } 1555 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1556 goto onError; 1557 1558 /* Copy back the bytes variables, which might have been modified by the 1559 callback */ 1560 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1561 if (!inputobj) 1562 goto onError; 1563 if (!PyBytes_Check(inputobj)) { 1564 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1565 } 1566 *input = PyBytes_AS_STRING(inputobj); 1567 insize = PyBytes_GET_SIZE(inputobj); 1568 *inend = *input + insize; 1569 /* we can DECREF safely, as the exception has another reference, 1570 so the object won't go away. */ 1571 Py_DECREF(inputobj); 1572 1573 if (newpos<0) 1574 newpos = insize+newpos; 1575 if (newpos<0 || newpos>insize) { 1576 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1577 goto onError; 1578 } 1579 1580 /* need more space? (at least enough for what we 1581 have+the replacement+the rest of the string (starting 1582 at the new input position), so we won't have to check space 1583 when there are no errors in the rest of the string) */ 1584 repptr = PyUnicode_AS_UNICODE(repunicode); 1585 repsize = PyUnicode_GET_SIZE(repunicode); 1586 requiredsize = *outpos + repsize + insize-newpos; 1587 if (requiredsize > outsize) { 1588 if (requiredsize<2*outsize) 1589 requiredsize = 2*outsize; 1590 if (PyUnicode_Resize(output, requiredsize) < 0) 1591 goto onError; 1592 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1593 } 1594 *endinpos = newpos; 1595 *inptr = *input + newpos; 1596 Py_UNICODE_COPY(*outptr, repptr, repsize); 1597 *outptr += repsize; 1598 *outpos += repsize; 1599 1600 /* we made it! */ 1601 res = 0; 1602 1603 onError: 1604 Py_XDECREF(restuple); 1605 return res; 1606} 1607 1608/* --- UTF-7 Codec -------------------------------------------------------- */ 1609 1610/* see RFC2152 for details */ 1611 1612static 1613char utf7_special[128] = { 1614 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1615 encoded: 1616 0 - not special 1617 1 - special 1618 2 - whitespace (optional) 1619 3 - RFC2152 Set O (optional) */ 1620 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1621 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1622 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1624 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1626 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1628 1629}; 1630 1631/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1632 warnings about the comparison always being false; since 1633 utf7_special[0] is 1, we can safely make that one comparison 1634 true */ 1635 1636#define SPECIAL(c, encodeO, encodeWS) \ 1637 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1638 (encodeWS && (utf7_special[(c)] == 2)) || \ 1639 (encodeO && (utf7_special[(c)] == 3))) 1640 1641#define B64(n) \ 1642 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1643#define B64CHAR(c) \ 1644 (ISALNUM(c) || (c) == '+' || (c) == '/') 1645#define UB64(c) \ 1646 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1647 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 1648 1649#define ENCODE(out, ch, bits) \ 1650 while (bits >= 6) { \ 1651 *out++ = B64(ch >> (bits-6)); \ 1652 bits -= 6; \ 1653 } 1654 1655#define DECODE(out, ch, bits, surrogate) \ 1656 while (bits >= 16) { \ 1657 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1658 bits -= 16; \ 1659 if (surrogate) { \ 1660 /* We have already generated an error for the high surrogate \ 1661 so let's not bother seeing if the low surrogate is correct or not */ \ 1662 surrogate = 0; \ 1663 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1664 /* This is a surrogate pair. Unfortunately we can't represent \ 1665 it in a 16-bit character */ \ 1666 surrogate = 1; \ 1667 errmsg = "code pairs are not supported"; \ 1668 goto utf7Error; \ 1669 } else { \ 1670 *out++ = outCh; \ 1671 } \ 1672 } 1673 1674PyObject *PyUnicode_DecodeUTF7(const char *s, 1675 Py_ssize_t size, 1676 const char *errors) 1677{ 1678 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1679} 1680 1681PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1682 Py_ssize_t size, 1683 const char *errors, 1684 Py_ssize_t *consumed) 1685{ 1686 const char *starts = s; 1687 Py_ssize_t startinpos; 1688 Py_ssize_t endinpos; 1689 Py_ssize_t outpos; 1690 const char *e; 1691 PyUnicodeObject *unicode; 1692 Py_UNICODE *p; 1693 const char *errmsg = ""; 1694 int inShift = 0; 1695 unsigned int bitsleft = 0; 1696 unsigned long charsleft = 0; 1697 int surrogate = 0; 1698 PyObject *errorHandler = NULL; 1699 PyObject *exc = NULL; 1700 1701 unicode = _PyUnicode_New(size); 1702 if (!unicode) 1703 return NULL; 1704 if (size == 0) { 1705 if (consumed) 1706 *consumed = 0; 1707 return (PyObject *)unicode; 1708 } 1709 1710 p = unicode->str; 1711 e = s + size; 1712 1713 while (s < e) { 1714 Py_UNICODE ch; 1715 restart: 1716 ch = *s; 1717 1718 if (inShift) { 1719 if ((ch == '-') || !B64CHAR(ch)) { 1720 inShift = 0; 1721 s++; 1722 1723 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1724 if (bitsleft >= 6) { 1725 /* The shift sequence has a partial character in it. If 1726 bitsleft < 6 then we could just classify it as padding 1727 but that is not the case here */ 1728 1729 errmsg = "partial character in shift sequence"; 1730 goto utf7Error; 1731 } 1732 /* According to RFC2152 the remaining bits should be zero. We 1733 choose to signal an error/insert a replacement character 1734 here so indicate the potential of a misencoded character. */ 1735 1736 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1737 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1738 errmsg = "non-zero padding bits in shift sequence"; 1739 goto utf7Error; 1740 } 1741 1742 if (ch == '-') { 1743 if ((s < e) && (*(s) == '-')) { 1744 *p++ = '-'; 1745 inShift = 1; 1746 } 1747 } else if (SPECIAL(ch,0,0)) { 1748 errmsg = "unexpected special character"; 1749 goto utf7Error; 1750 } else { 1751 *p++ = ch; 1752 } 1753 } else { 1754 charsleft = (charsleft << 6) | UB64(ch); 1755 bitsleft += 6; 1756 s++; 1757 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1758 } 1759 } 1760 else if ( ch == '+' ) { 1761 startinpos = s-starts; 1762 s++; 1763 if (s < e && *s == '-') { 1764 s++; 1765 *p++ = '+'; 1766 } else 1767 { 1768 inShift = 1; 1769 bitsleft = 0; 1770 } 1771 } 1772 else if (SPECIAL(ch,0,0)) { 1773 startinpos = s-starts; 1774 errmsg = "unexpected special character"; 1775 s++; 1776 goto utf7Error; 1777 } 1778 else { 1779 *p++ = ch; 1780 s++; 1781 } 1782 continue; 1783 utf7Error: 1784 outpos = p-PyUnicode_AS_UNICODE(unicode); 1785 endinpos = s-starts; 1786 if (unicode_decode_call_errorhandler( 1787 errors, &errorHandler, 1788 "utf7", errmsg, 1789 &starts, &e, &startinpos, &endinpos, &exc, &s, 1790 (PyObject **)&unicode, &outpos, &p)) 1791 goto onError; 1792 } 1793 1794 if (inShift && !consumed) { 1795 outpos = p-PyUnicode_AS_UNICODE(unicode); 1796 endinpos = size; 1797 if (unicode_decode_call_errorhandler( 1798 errors, &errorHandler, 1799 "utf7", "unterminated shift sequence", 1800 &starts, &e, &startinpos, &endinpos, &exc, &s, 1801 (PyObject **)&unicode, &outpos, &p)) 1802 goto onError; 1803 if (s < e) 1804 goto restart; 1805 } 1806 if (consumed) { 1807 if(inShift) 1808 *consumed = startinpos; 1809 else 1810 *consumed = s-starts; 1811 } 1812 1813 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1814 goto onError; 1815 1816 Py_XDECREF(errorHandler); 1817 Py_XDECREF(exc); 1818 return (PyObject *)unicode; 1819 1820onError: 1821 Py_XDECREF(errorHandler); 1822 Py_XDECREF(exc); 1823 Py_DECREF(unicode); 1824 return NULL; 1825} 1826 1827 1828PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1829 Py_ssize_t size, 1830 int encodeSetO, 1831 int encodeWhiteSpace, 1832 const char *errors) 1833{ 1834 PyObject *v, *result; 1835 /* It might be possible to tighten this worst case */ 1836 Py_ssize_t cbAllocated = 5 * size; 1837 int inShift = 0; 1838 Py_ssize_t i = 0; 1839 unsigned int bitsleft = 0; 1840 unsigned long charsleft = 0; 1841 char * out; 1842 char * start; 1843 1844 if (size == 0) 1845 return PyBytes_FromStringAndSize(NULL, 0); 1846 1847 v = PyByteArray_FromStringAndSize(NULL, cbAllocated); 1848 if (v == NULL) 1849 return NULL; 1850 1851 start = out = PyByteArray_AS_STRING(v); 1852 for (;i < size; ++i) { 1853 Py_UNICODE ch = s[i]; 1854 1855 if (!inShift) { 1856 if (ch == '+') { 1857 *out++ = '+'; 1858 *out++ = '-'; 1859 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1860 charsleft = ch; 1861 bitsleft = 16; 1862 *out++ = '+'; 1863 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1864 inShift = bitsleft > 0; 1865 } else { 1866 *out++ = (char) ch; 1867 } 1868 } else { 1869 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1870 *out++ = B64(charsleft << (6-bitsleft)); 1871 charsleft = 0; 1872 bitsleft = 0; 1873 /* Characters not in the BASE64 set implicitly unshift the sequence 1874 so no '-' is required, except if the character is itself a '-' */ 1875 if (B64CHAR(ch) || ch == '-') { 1876 *out++ = '-'; 1877 } 1878 inShift = 0; 1879 *out++ = (char) ch; 1880 } else { 1881 bitsleft += 16; 1882 charsleft = (charsleft << 16) | ch; 1883 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1884 1885 /* If the next character is special then we dont' need to terminate 1886 the shift sequence. If the next character is not a BASE64 character 1887 or '-' then the shift sequence will be terminated implicitly and we 1888 don't have to insert a '-'. */ 1889 1890 if (bitsleft == 0) { 1891 if (i + 1 < size) { 1892 Py_UNICODE ch2 = s[i+1]; 1893 1894 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1895 1896 } else if (B64CHAR(ch2) || ch2 == '-') { 1897 *out++ = '-'; 1898 inShift = 0; 1899 } else { 1900 inShift = 0; 1901 } 1902 1903 } 1904 else { 1905 *out++ = '-'; 1906 inShift = 0; 1907 } 1908 } 1909 } 1910 } 1911 } 1912 if (bitsleft) { 1913 *out++= B64(charsleft << (6-bitsleft) ); 1914 *out++ = '-'; 1915 } 1916 1917 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start); 1918 Py_DECREF(v); 1919 return result; 1920} 1921 1922#undef SPECIAL 1923#undef B64 1924#undef B64CHAR 1925#undef UB64 1926#undef ENCODE 1927#undef DECODE 1928 1929/* --- UTF-8 Codec -------------------------------------------------------- */ 1930 1931static 1932char utf8_code_length[256] = { 1933 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1934 illegal prefix. see RFC 2279 for details */ 1935 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1936 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1937 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1938 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1939 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1940 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1941 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1942 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1943 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1944 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1945 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1946 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1947 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1948 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1949 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1950 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1951}; 1952 1953PyObject *PyUnicode_DecodeUTF8(const char *s, 1954 Py_ssize_t size, 1955 const char *errors) 1956{ 1957 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1958} 1959 1960PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1961 Py_ssize_t size, 1962 const char *errors, 1963 Py_ssize_t *consumed) 1964{ 1965 const char *starts = s; 1966 int n; 1967 Py_ssize_t startinpos; 1968 Py_ssize_t endinpos; 1969 Py_ssize_t outpos; 1970 const char *e; 1971 PyUnicodeObject *unicode; 1972 Py_UNICODE *p; 1973 const char *errmsg = ""; 1974 PyObject *errorHandler = NULL; 1975 PyObject *exc = NULL; 1976 1977 /* Note: size will always be longer than the resulting Unicode 1978 character count */ 1979 unicode = _PyUnicode_New(size); 1980 if (!unicode) 1981 return NULL; 1982 if (size == 0) { 1983 if (consumed) 1984 *consumed = 0; 1985 return (PyObject *)unicode; 1986 } 1987 1988 /* Unpack UTF-8 encoded data */ 1989 p = unicode->str; 1990 e = s + size; 1991 1992 while (s < e) { 1993 Py_UCS4 ch = (unsigned char)*s; 1994 1995 if (ch < 0x80) { 1996 *p++ = (Py_UNICODE)ch; 1997 s++; 1998 continue; 1999 } 2000 2001 n = utf8_code_length[ch]; 2002 2003 if (s + n > e) { 2004 if (consumed) 2005 break; 2006 else { 2007 errmsg = "unexpected end of data"; 2008 startinpos = s-starts; 2009 endinpos = size; 2010 goto utf8Error; 2011 } 2012 } 2013 2014 switch (n) { 2015 2016 case 0: 2017 errmsg = "unexpected code byte"; 2018 startinpos = s-starts; 2019 endinpos = startinpos+1; 2020 goto utf8Error; 2021 2022 case 1: 2023 errmsg = "internal error"; 2024 startinpos = s-starts; 2025 endinpos = startinpos+1; 2026 goto utf8Error; 2027 2028 case 2: 2029 if ((s[1] & 0xc0) != 0x80) { 2030 errmsg = "invalid data"; 2031 startinpos = s-starts; 2032 endinpos = startinpos+2; 2033 goto utf8Error; 2034 } 2035 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2036 if (ch < 0x80) { 2037 startinpos = s-starts; 2038 endinpos = startinpos+2; 2039 errmsg = "illegal encoding"; 2040 goto utf8Error; 2041 } 2042 else 2043 *p++ = (Py_UNICODE)ch; 2044 break; 2045 2046 case 3: 2047 if ((s[1] & 0xc0) != 0x80 || 2048 (s[2] & 0xc0) != 0x80) { 2049 errmsg = "invalid data"; 2050 startinpos = s-starts; 2051 endinpos = startinpos+3; 2052 goto utf8Error; 2053 } 2054 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2055 if (ch < 0x0800) { 2056 /* Note: UTF-8 encodings of surrogates are considered 2057 legal UTF-8 sequences; 2058 2059 XXX For wide builds (UCS-4) we should probably try 2060 to recombine the surrogates into a single code 2061 unit. 2062 */ 2063 errmsg = "illegal encoding"; 2064 startinpos = s-starts; 2065 endinpos = startinpos+3; 2066 goto utf8Error; 2067 } 2068 else 2069 *p++ = (Py_UNICODE)ch; 2070 break; 2071 2072 case 4: 2073 if ((s[1] & 0xc0) != 0x80 || 2074 (s[2] & 0xc0) != 0x80 || 2075 (s[3] & 0xc0) != 0x80) { 2076 errmsg = "invalid data"; 2077 startinpos = s-starts; 2078 endinpos = startinpos+4; 2079 goto utf8Error; 2080 } 2081 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2082 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2083 /* validate and convert to UTF-16 */ 2084 if ((ch < 0x10000) /* minimum value allowed for 4 2085 byte encoding */ 2086 || (ch > 0x10ffff)) /* maximum value allowed for 2087 UTF-16 */ 2088 { 2089 errmsg = "illegal encoding"; 2090 startinpos = s-starts; 2091 endinpos = startinpos+4; 2092 goto utf8Error; 2093 } 2094#ifdef Py_UNICODE_WIDE 2095 *p++ = (Py_UNICODE)ch; 2096#else 2097 /* compute and append the two surrogates: */ 2098 2099 /* translate from 10000..10FFFF to 0..FFFF */ 2100 ch -= 0x10000; 2101 2102 /* high surrogate = top 10 bits added to D800 */ 2103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2104 2105 /* low surrogate = bottom 10 bits added to DC00 */ 2106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2107#endif 2108 break; 2109 2110 default: 2111 /* Other sizes are only needed for UCS-4 */ 2112 errmsg = "unsupported Unicode code range"; 2113 startinpos = s-starts; 2114 endinpos = startinpos+n; 2115 goto utf8Error; 2116 } 2117 s += n; 2118 continue; 2119 2120 utf8Error: 2121 outpos = p-PyUnicode_AS_UNICODE(unicode); 2122 if (unicode_decode_call_errorhandler( 2123 errors, &errorHandler, 2124 "utf8", errmsg, 2125 &starts, &e, &startinpos, &endinpos, &exc, &s, 2126 (PyObject **)&unicode, &outpos, &p)) 2127 goto onError; 2128 } 2129 if (consumed) 2130 *consumed = s-starts; 2131 2132 /* Adjust length */ 2133 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2134 goto onError; 2135 2136 Py_XDECREF(errorHandler); 2137 Py_XDECREF(exc); 2138 return (PyObject *)unicode; 2139 2140onError: 2141 Py_XDECREF(errorHandler); 2142 Py_XDECREF(exc); 2143 Py_DECREF(unicode); 2144 return NULL; 2145} 2146 2147/* Allocation strategy: if the string is short, convert into a stack buffer 2148 and allocate exactly as much space needed at the end. Else allocate the 2149 maximum possible needed (4 result bytes per Unicode character), and return 2150 the excess memory at the end. 2151*/ 2152PyObject * 2153PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2154 Py_ssize_t size, 2155 const char *errors) 2156{ 2157#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2158 2159 Py_ssize_t i; /* index into s of next input byte */ 2160 PyObject *result; /* result string object */ 2161 char *p; /* next free byte in output buffer */ 2162 Py_ssize_t nallocated; /* number of result bytes allocated */ 2163 Py_ssize_t nneeded; /* number of result bytes needed */ 2164 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2165 2166 assert(s != NULL); 2167 assert(size >= 0); 2168 2169 if (size <= MAX_SHORT_UNICHARS) { 2170 /* Write into the stack buffer; nallocated can't overflow. 2171 * At the end, we'll allocate exactly as much heap space as it 2172 * turns out we need. 2173 */ 2174 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2175 result = NULL; /* will allocate after we're done */ 2176 p = stackbuf; 2177 } 2178 else { 2179 /* Overallocate on the heap, and give the excess back at the end. */ 2180 nallocated = size * 4; 2181 if (nallocated / 4 != size) /* overflow! */ 2182 return PyErr_NoMemory(); 2183 result = PyBytes_FromStringAndSize(NULL, nallocated); 2184 if (result == NULL) 2185 return NULL; 2186 p = PyBytes_AS_STRING(result); 2187 } 2188 2189 for (i = 0; i < size;) { 2190 Py_UCS4 ch = s[i++]; 2191 2192 if (ch < 0x80) 2193 /* Encode ASCII */ 2194 *p++ = (char) ch; 2195 2196 else if (ch < 0x0800) { 2197 /* Encode Latin-1 */ 2198 *p++ = (char)(0xc0 | (ch >> 6)); 2199 *p++ = (char)(0x80 | (ch & 0x3f)); 2200 } 2201 else { 2202 /* Encode UCS2 Unicode ordinals */ 2203 if (ch < 0x10000) { 2204 /* Special case: check for high surrogate */ 2205 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2206 Py_UCS4 ch2 = s[i]; 2207 /* Check for low surrogate and combine the two to 2208 form a UCS4 value */ 2209 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2210 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2211 i++; 2212 goto encodeUCS4; 2213 } 2214 /* Fall through: handles isolated high surrogates */ 2215 } 2216 *p++ = (char)(0xe0 | (ch >> 12)); 2217 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2218 *p++ = (char)(0x80 | (ch & 0x3f)); 2219 continue; 2220 } 2221encodeUCS4: 2222 /* Encode UCS4 Unicode ordinals */ 2223 *p++ = (char)(0xf0 | (ch >> 18)); 2224 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2225 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2226 *p++ = (char)(0x80 | (ch & 0x3f)); 2227 } 2228 } 2229 2230 if (result == NULL) { 2231 /* This was stack allocated. */ 2232 nneeded = p - stackbuf; 2233 assert(nneeded <= nallocated); 2234 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2235 } 2236 else { 2237 /* Cut back to size actually needed. */ 2238 nneeded = p - PyBytes_AS_STRING(result); 2239 assert(nneeded <= nallocated); 2240 _PyBytes_Resize(&result, nneeded); 2241 } 2242 return result; 2243 2244#undef MAX_SHORT_UNICHARS 2245} 2246 2247PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2248{ 2249 if (!PyUnicode_Check(unicode)) { 2250 PyErr_BadArgument(); 2251 return NULL; 2252 } 2253 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2254 PyUnicode_GET_SIZE(unicode), 2255 NULL); 2256} 2257 2258/* --- UTF-32 Codec ------------------------------------------------------- */ 2259 2260PyObject * 2261PyUnicode_DecodeUTF32(const char *s, 2262 Py_ssize_t size, 2263 const char *errors, 2264 int *byteorder) 2265{ 2266 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2267} 2268 2269PyObject * 2270PyUnicode_DecodeUTF32Stateful(const char *s, 2271 Py_ssize_t size, 2272 const char *errors, 2273 int *byteorder, 2274 Py_ssize_t *consumed) 2275{ 2276 const char *starts = s; 2277 Py_ssize_t startinpos; 2278 Py_ssize_t endinpos; 2279 Py_ssize_t outpos; 2280 PyUnicodeObject *unicode; 2281 Py_UNICODE *p; 2282#ifndef Py_UNICODE_WIDE 2283 int i, pairs; 2284#else 2285 const int pairs = 0; 2286#endif 2287 const unsigned char *q, *e; 2288 int bo = 0; /* assume native ordering by default */ 2289 const char *errmsg = ""; 2290 /* Offsets from q for retrieving bytes in the right order. */ 2291#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2292 int iorder[] = {0, 1, 2, 3}; 2293#else 2294 int iorder[] = {3, 2, 1, 0}; 2295#endif 2296 PyObject *errorHandler = NULL; 2297 PyObject *exc = NULL; 2298 /* On narrow builds we split characters outside the BMP into two 2299 codepoints => count how much extra space we need. */ 2300#ifndef Py_UNICODE_WIDE 2301 for (i = pairs = 0; i < size/4; i++) 2302 if (((Py_UCS4 *)s)[i] >= 0x10000) 2303 pairs++; 2304#endif 2305 2306 /* This might be one to much, because of a BOM */ 2307 unicode = _PyUnicode_New((size+3)/4+pairs); 2308 if (!unicode) 2309 return NULL; 2310 if (size == 0) 2311 return (PyObject *)unicode; 2312 2313 /* Unpack UTF-32 encoded data */ 2314 p = unicode->str; 2315 q = (unsigned char *)s; 2316 e = q + size; 2317 2318 if (byteorder) 2319 bo = *byteorder; 2320 2321 /* Check for BOM marks (U+FEFF) in the input and adjust current 2322 byte order setting accordingly. In native mode, the leading BOM 2323 mark is skipped, in all other modes, it is copied to the output 2324 stream as-is (giving a ZWNBSP character). */ 2325 if (bo == 0) { 2326 if (size >= 4) { 2327 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2328 (q[iorder[1]] << 8) | q[iorder[0]]; 2329#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2330 if (bom == 0x0000FEFF) { 2331 q += 4; 2332 bo = -1; 2333 } 2334 else if (bom == 0xFFFE0000) { 2335 q += 4; 2336 bo = 1; 2337 } 2338#else 2339 if (bom == 0x0000FEFF) { 2340 q += 4; 2341 bo = 1; 2342 } 2343 else if (bom == 0xFFFE0000) { 2344 q += 4; 2345 bo = -1; 2346 } 2347#endif 2348 } 2349 } 2350 2351 if (bo == -1) { 2352 /* force LE */ 2353 iorder[0] = 0; 2354 iorder[1] = 1; 2355 iorder[2] = 2; 2356 iorder[3] = 3; 2357 } 2358 else if (bo == 1) { 2359 /* force BE */ 2360 iorder[0] = 3; 2361 iorder[1] = 2; 2362 iorder[2] = 1; 2363 iorder[3] = 0; 2364 } 2365 2366 while (q < e) { 2367 Py_UCS4 ch; 2368 /* remaining bytes at the end? (size should be divisible by 4) */ 2369 if (e-q<4) { 2370 if (consumed) 2371 break; 2372 errmsg = "truncated data"; 2373 startinpos = ((const char *)q)-starts; 2374 endinpos = ((const char *)e)-starts; 2375 goto utf32Error; 2376 /* The remaining input chars are ignored if the callback 2377 chooses to skip the input */ 2378 } 2379 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2380 (q[iorder[1]] << 8) | q[iorder[0]]; 2381 2382 if (ch >= 0x110000) 2383 { 2384 errmsg = "codepoint not in range(0x110000)"; 2385 startinpos = ((const char *)q)-starts; 2386 endinpos = startinpos+4; 2387 goto utf32Error; 2388 } 2389#ifndef Py_UNICODE_WIDE 2390 if (ch >= 0x10000) 2391 { 2392 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2393 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2394 } 2395 else 2396#endif 2397 *p++ = ch; 2398 q += 4; 2399 continue; 2400 utf32Error: 2401 outpos = p-PyUnicode_AS_UNICODE(unicode); 2402 if (unicode_decode_call_errorhandler( 2403 errors, &errorHandler, 2404 "utf32", errmsg, 2405 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2406 (PyObject **)&unicode, &outpos, &p)) 2407 goto onError; 2408 } 2409 2410 if (byteorder) 2411 *byteorder = bo; 2412 2413 if (consumed) 2414 *consumed = (const char *)q-starts; 2415 2416 /* Adjust length */ 2417 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2418 goto onError; 2419 2420 Py_XDECREF(errorHandler); 2421 Py_XDECREF(exc); 2422 return (PyObject *)unicode; 2423 2424onError: 2425 Py_DECREF(unicode); 2426 Py_XDECREF(errorHandler); 2427 Py_XDECREF(exc); 2428 return NULL; 2429} 2430 2431PyObject * 2432PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2433 Py_ssize_t size, 2434 const char *errors, 2435 int byteorder) 2436{ 2437 PyObject *v, *result; 2438 unsigned char *p; 2439#ifndef Py_UNICODE_WIDE 2440 int i, pairs; 2441#else 2442 const int pairs = 0; 2443#endif 2444 /* Offsets from p for storing byte pairs in the right order. */ 2445#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2446 int iorder[] = {0, 1, 2, 3}; 2447#else 2448 int iorder[] = {3, 2, 1, 0}; 2449#endif 2450 2451#define STORECHAR(CH) \ 2452 do { \ 2453 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2454 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2455 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2456 p[iorder[0]] = (CH) & 0xff; \ 2457 p += 4; \ 2458 } while(0) 2459 2460 /* In narrow builds we can output surrogate pairs as one codepoint, 2461 so we need less space. */ 2462#ifndef Py_UNICODE_WIDE 2463 for (i = pairs = 0; i < size-1; i++) 2464 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2465 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2466 pairs++; 2467#endif 2468 v = PyByteArray_FromStringAndSize(NULL, 2469 4 * (size - pairs + (byteorder == 0))); 2470 if (v == NULL) 2471 return NULL; 2472 2473 p = (unsigned char *)PyByteArray_AS_STRING(v); 2474 if (byteorder == 0) 2475 STORECHAR(0xFEFF); 2476 if (size == 0) 2477 goto done; 2478 2479 if (byteorder == -1) { 2480 /* force LE */ 2481 iorder[0] = 0; 2482 iorder[1] = 1; 2483 iorder[2] = 2; 2484 iorder[3] = 3; 2485 } 2486 else if (byteorder == 1) { 2487 /* force BE */ 2488 iorder[0] = 3; 2489 iorder[1] = 2; 2490 iorder[2] = 1; 2491 iorder[3] = 0; 2492 } 2493 2494 while (size-- > 0) { 2495 Py_UCS4 ch = *s++; 2496#ifndef Py_UNICODE_WIDE 2497 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2498 Py_UCS4 ch2 = *s; 2499 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2500 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2501 s++; 2502 size--; 2503 } 2504 } 2505#endif 2506 STORECHAR(ch); 2507 } 2508 2509 done: 2510 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2511 Py_DECREF(v); 2512 return result; 2513#undef STORECHAR 2514} 2515 2516PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2517{ 2518 if (!PyUnicode_Check(unicode)) { 2519 PyErr_BadArgument(); 2520 return NULL; 2521 } 2522 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2523 PyUnicode_GET_SIZE(unicode), 2524 NULL, 2525 0); 2526} 2527 2528/* --- UTF-16 Codec ------------------------------------------------------- */ 2529 2530PyObject * 2531PyUnicode_DecodeUTF16(const char *s, 2532 Py_ssize_t size, 2533 const char *errors, 2534 int *byteorder) 2535{ 2536 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2537} 2538 2539PyObject * 2540PyUnicode_DecodeUTF16Stateful(const char *s, 2541 Py_ssize_t size, 2542 const char *errors, 2543 int *byteorder, 2544 Py_ssize_t *consumed) 2545{ 2546 const char *starts = s; 2547 Py_ssize_t startinpos; 2548 Py_ssize_t endinpos; 2549 Py_ssize_t outpos; 2550 PyUnicodeObject *unicode; 2551 Py_UNICODE *p; 2552 const unsigned char *q, *e; 2553 int bo = 0; /* assume native ordering by default */ 2554 const char *errmsg = ""; 2555 /* Offsets from q for retrieving byte pairs in the right order. */ 2556#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2557 int ihi = 1, ilo = 0; 2558#else 2559 int ihi = 0, ilo = 1; 2560#endif 2561 PyObject *errorHandler = NULL; 2562 PyObject *exc = NULL; 2563 2564 /* Note: size will always be longer than the resulting Unicode 2565 character count */ 2566 unicode = _PyUnicode_New(size); 2567 if (!unicode) 2568 return NULL; 2569 if (size == 0) 2570 return (PyObject *)unicode; 2571 2572 /* Unpack UTF-16 encoded data */ 2573 p = unicode->str; 2574 q = (unsigned char *)s; 2575 e = q + size; 2576 2577 if (byteorder) 2578 bo = *byteorder; 2579 2580 /* Check for BOM marks (U+FEFF) in the input and adjust current 2581 byte order setting accordingly. In native mode, the leading BOM 2582 mark is skipped, in all other modes, it is copied to the output 2583 stream as-is (giving a ZWNBSP character). */ 2584 if (bo == 0) { 2585 if (size >= 2) { 2586 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2587#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2588 if (bom == 0xFEFF) { 2589 q += 2; 2590 bo = -1; 2591 } 2592 else if (bom == 0xFFFE) { 2593 q += 2; 2594 bo = 1; 2595 } 2596#else 2597 if (bom == 0xFEFF) { 2598 q += 2; 2599 bo = 1; 2600 } 2601 else if (bom == 0xFFFE) { 2602 q += 2; 2603 bo = -1; 2604 } 2605#endif 2606 } 2607 } 2608 2609 if (bo == -1) { 2610 /* force LE */ 2611 ihi = 1; 2612 ilo = 0; 2613 } 2614 else if (bo == 1) { 2615 /* force BE */ 2616 ihi = 0; 2617 ilo = 1; 2618 } 2619 2620 while (q < e) { 2621 Py_UNICODE ch; 2622 /* remaining bytes at the end? (size should be even) */ 2623 if (e-q<2) { 2624 if (consumed) 2625 break; 2626 errmsg = "truncated data"; 2627 startinpos = ((const char *)q)-starts; 2628 endinpos = ((const char *)e)-starts; 2629 goto utf16Error; 2630 /* The remaining input chars are ignored if the callback 2631 chooses to skip the input */ 2632 } 2633 ch = (q[ihi] << 8) | q[ilo]; 2634 2635 q += 2; 2636 2637 if (ch < 0xD800 || ch > 0xDFFF) { 2638 *p++ = ch; 2639 continue; 2640 } 2641 2642 /* UTF-16 code pair: */ 2643 if (q >= e) { 2644 errmsg = "unexpected end of data"; 2645 startinpos = (((const char *)q)-2)-starts; 2646 endinpos = ((const char *)e)-starts; 2647 goto utf16Error; 2648 } 2649 if (0xD800 <= ch && ch <= 0xDBFF) { 2650 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2651 q += 2; 2652 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2653#ifndef Py_UNICODE_WIDE 2654 *p++ = ch; 2655 *p++ = ch2; 2656#else 2657 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2658#endif 2659 continue; 2660 } 2661 else { 2662 errmsg = "illegal UTF-16 surrogate"; 2663 startinpos = (((const char *)q)-4)-starts; 2664 endinpos = startinpos+2; 2665 goto utf16Error; 2666 } 2667 2668 } 2669 errmsg = "illegal encoding"; 2670 startinpos = (((const char *)q)-2)-starts; 2671 endinpos = startinpos+2; 2672 /* Fall through to report the error */ 2673 2674 utf16Error: 2675 outpos = p-PyUnicode_AS_UNICODE(unicode); 2676 if (unicode_decode_call_errorhandler( 2677 errors, &errorHandler, 2678 "utf16", errmsg, 2679 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2680 (PyObject **)&unicode, &outpos, &p)) 2681 goto onError; 2682 } 2683 2684 if (byteorder) 2685 *byteorder = bo; 2686 2687 if (consumed) 2688 *consumed = (const char *)q-starts; 2689 2690 /* Adjust length */ 2691 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2692 goto onError; 2693 2694 Py_XDECREF(errorHandler); 2695 Py_XDECREF(exc); 2696 return (PyObject *)unicode; 2697 2698onError: 2699 Py_DECREF(unicode); 2700 Py_XDECREF(errorHandler); 2701 Py_XDECREF(exc); 2702 return NULL; 2703} 2704 2705PyObject * 2706PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2707 Py_ssize_t size, 2708 const char *errors, 2709 int byteorder) 2710{ 2711 PyObject *v, *result; 2712 unsigned char *p; 2713#ifdef Py_UNICODE_WIDE 2714 int i, pairs; 2715#else 2716 const int pairs = 0; 2717#endif 2718 /* Offsets from p for storing byte pairs in the right order. */ 2719#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2720 int ihi = 1, ilo = 0; 2721#else 2722 int ihi = 0, ilo = 1; 2723#endif 2724 2725#define STORECHAR(CH) \ 2726 do { \ 2727 p[ihi] = ((CH) >> 8) & 0xff; \ 2728 p[ilo] = (CH) & 0xff; \ 2729 p += 2; \ 2730 } while(0) 2731 2732#ifdef Py_UNICODE_WIDE 2733 for (i = pairs = 0; i < size; i++) 2734 if (s[i] >= 0x10000) 2735 pairs++; 2736#endif 2737 v = PyByteArray_FromStringAndSize(NULL, 2738 2 * (size + pairs + (byteorder == 0))); 2739 if (v == NULL) 2740 return NULL; 2741 2742 p = (unsigned char *)PyByteArray_AS_STRING(v); 2743 if (byteorder == 0) 2744 STORECHAR(0xFEFF); 2745 if (size == 0) 2746 goto done; 2747 2748 if (byteorder == -1) { 2749 /* force LE */ 2750 ihi = 1; 2751 ilo = 0; 2752 } 2753 else if (byteorder == 1) { 2754 /* force BE */ 2755 ihi = 0; 2756 ilo = 1; 2757 } 2758 2759 while (size-- > 0) { 2760 Py_UNICODE ch = *s++; 2761 Py_UNICODE ch2 = 0; 2762#ifdef Py_UNICODE_WIDE 2763 if (ch >= 0x10000) { 2764 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2765 ch = 0xD800 | ((ch-0x10000) >> 10); 2766 } 2767#endif 2768 STORECHAR(ch); 2769 if (ch2) 2770 STORECHAR(ch2); 2771 } 2772 2773 done: 2774 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2775 Py_DECREF(v); 2776 return result; 2777#undef STORECHAR 2778} 2779 2780PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2781{ 2782 if (!PyUnicode_Check(unicode)) { 2783 PyErr_BadArgument(); 2784 return NULL; 2785 } 2786 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2787 PyUnicode_GET_SIZE(unicode), 2788 NULL, 2789 0); 2790} 2791 2792/* --- Unicode Escape Codec ----------------------------------------------- */ 2793 2794static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2795 2796PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2797 Py_ssize_t size, 2798 const char *errors) 2799{ 2800 const char *starts = s; 2801 Py_ssize_t startinpos; 2802 Py_ssize_t endinpos; 2803 Py_ssize_t outpos; 2804 int i; 2805 PyUnicodeObject *v; 2806 Py_UNICODE *p; 2807 const char *end; 2808 char* message; 2809 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2810 PyObject *errorHandler = NULL; 2811 PyObject *exc = NULL; 2812 2813 /* Escaped strings will always be longer than the resulting 2814 Unicode string, so we start with size here and then reduce the 2815 length after conversion to the true value. 2816 (but if the error callback returns a long replacement string 2817 we'll have to allocate more space) */ 2818 v = _PyUnicode_New(size); 2819 if (v == NULL) 2820 goto onError; 2821 if (size == 0) 2822 return (PyObject *)v; 2823 2824 p = PyUnicode_AS_UNICODE(v); 2825 end = s + size; 2826 2827 while (s < end) { 2828 unsigned char c; 2829 Py_UNICODE x; 2830 int digits; 2831 2832 /* Non-escape characters are interpreted as Unicode ordinals */ 2833 if (*s != '\\') { 2834 *p++ = (unsigned char) *s++; 2835 continue; 2836 } 2837 2838 startinpos = s-starts; 2839 /* \ - Escapes */ 2840 s++; 2841 c = *s++; 2842 if (s > end) 2843 c = '\0'; /* Invalid after \ */ 2844 switch (c) { 2845 2846 /* \x escapes */ 2847 case '\n': break; 2848 case '\\': *p++ = '\\'; break; 2849 case '\'': *p++ = '\''; break; 2850 case '\"': *p++ = '\"'; break; 2851 case 'b': *p++ = '\b'; break; 2852 case 'f': *p++ = '\014'; break; /* FF */ 2853 case 't': *p++ = '\t'; break; 2854 case 'n': *p++ = '\n'; break; 2855 case 'r': *p++ = '\r'; break; 2856 case 'v': *p++ = '\013'; break; /* VT */ 2857 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2858 2859 /* \OOO (octal) escapes */ 2860 case '0': case '1': case '2': case '3': 2861 case '4': case '5': case '6': case '7': 2862 x = s[-1] - '0'; 2863 if (s < end && '0' <= *s && *s <= '7') { 2864 x = (x<<3) + *s++ - '0'; 2865 if (s < end && '0' <= *s && *s <= '7') 2866 x = (x<<3) + *s++ - '0'; 2867 } 2868 *p++ = x; 2869 break; 2870 2871 /* hex escapes */ 2872 /* \xXX */ 2873 case 'x': 2874 digits = 2; 2875 message = "truncated \\xXX escape"; 2876 goto hexescape; 2877 2878 /* \uXXXX */ 2879 case 'u': 2880 digits = 4; 2881 message = "truncated \\uXXXX escape"; 2882 goto hexescape; 2883 2884 /* \UXXXXXXXX */ 2885 case 'U': 2886 digits = 8; 2887 message = "truncated \\UXXXXXXXX escape"; 2888 hexescape: 2889 chr = 0; 2890 outpos = p-PyUnicode_AS_UNICODE(v); 2891 if (s+digits>end) { 2892 endinpos = size; 2893 if (unicode_decode_call_errorhandler( 2894 errors, &errorHandler, 2895 "unicodeescape", "end of string in escape sequence", 2896 &starts, &end, &startinpos, &endinpos, &exc, &s, 2897 (PyObject **)&v, &outpos, &p)) 2898 goto onError; 2899 goto nextByte; 2900 } 2901 for (i = 0; i < digits; ++i) { 2902 c = (unsigned char) s[i]; 2903 if (!ISXDIGIT(c)) { 2904 endinpos = (s+i+1)-starts; 2905 if (unicode_decode_call_errorhandler( 2906 errors, &errorHandler, 2907 "unicodeescape", message, 2908 &starts, &end, &startinpos, &endinpos, &exc, &s, 2909 (PyObject **)&v, &outpos, &p)) 2910 goto onError; 2911 goto nextByte; 2912 } 2913 chr = (chr<<4) & ~0xF; 2914 if (c >= '0' && c <= '9') 2915 chr += c - '0'; 2916 else if (c >= 'a' && c <= 'f') 2917 chr += 10 + c - 'a'; 2918 else 2919 chr += 10 + c - 'A'; 2920 } 2921 s += i; 2922 if (chr == 0xffffffff && PyErr_Occurred()) 2923 /* _decoding_error will have already written into the 2924 target buffer. */ 2925 break; 2926 store: 2927 /* when we get here, chr is a 32-bit unicode character */ 2928 if (chr <= 0xffff) 2929 /* UCS-2 character */ 2930 *p++ = (Py_UNICODE) chr; 2931 else if (chr <= 0x10ffff) { 2932 /* UCS-4 character. Either store directly, or as 2933 surrogate pair. */ 2934#ifdef Py_UNICODE_WIDE 2935 *p++ = chr; 2936#else 2937 chr -= 0x10000L; 2938 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2939 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2940#endif 2941 } else { 2942 endinpos = s-starts; 2943 outpos = p-PyUnicode_AS_UNICODE(v); 2944 if (unicode_decode_call_errorhandler( 2945 errors, &errorHandler, 2946 "unicodeescape", "illegal Unicode character", 2947 &starts, &end, &startinpos, &endinpos, &exc, &s, 2948 (PyObject **)&v, &outpos, &p)) 2949 goto onError; 2950 } 2951 break; 2952 2953 /* \N{name} */ 2954 case 'N': 2955 message = "malformed \\N character escape"; 2956 if (ucnhash_CAPI == NULL) { 2957 /* load the unicode data module */ 2958 PyObject *m, *api; 2959 m = PyImport_ImportModuleNoBlock("unicodedata"); 2960 if (m == NULL) 2961 goto ucnhashError; 2962 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2963 Py_DECREF(m); 2964 if (api == NULL) 2965 goto ucnhashError; 2966 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2967 Py_DECREF(api); 2968 if (ucnhash_CAPI == NULL) 2969 goto ucnhashError; 2970 } 2971 if (*s == '{') { 2972 const char *start = s+1; 2973 /* look for the closing brace */ 2974 while (*s != '}' && s < end) 2975 s++; 2976 if (s > start && s < end && *s == '}') { 2977 /* found a name. look it up in the unicode database */ 2978 message = "unknown Unicode character name"; 2979 s++; 2980 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2981 goto store; 2982 } 2983 } 2984 endinpos = s-starts; 2985 outpos = p-PyUnicode_AS_UNICODE(v); 2986 if (unicode_decode_call_errorhandler( 2987 errors, &errorHandler, 2988 "unicodeescape", message, 2989 &starts, &end, &startinpos, &endinpos, &exc, &s, 2990 (PyObject **)&v, &outpos, &p)) 2991 goto onError; 2992 break; 2993 2994 default: 2995 if (s > end) { 2996 message = "\\ at end of string"; 2997 s--; 2998 endinpos = s-starts; 2999 outpos = p-PyUnicode_AS_UNICODE(v); 3000 if (unicode_decode_call_errorhandler( 3001 errors, &errorHandler, 3002 "unicodeescape", message, 3003 &starts, &end, &startinpos, &endinpos, &exc, &s, 3004 (PyObject **)&v, &outpos, &p)) 3005 goto onError; 3006 } 3007 else { 3008 *p++ = '\\'; 3009 *p++ = (unsigned char)s[-1]; 3010 } 3011 break; 3012 } 3013 nextByte: 3014 ; 3015 } 3016 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3017 goto onError; 3018 Py_XDECREF(errorHandler); 3019 Py_XDECREF(exc); 3020 return (PyObject *)v; 3021 3022ucnhashError: 3023 PyErr_SetString( 3024 PyExc_UnicodeError, 3025 "\\N escapes not supported (can't load unicodedata module)" 3026 ); 3027 Py_XDECREF(v); 3028 Py_XDECREF(errorHandler); 3029 Py_XDECREF(exc); 3030 return NULL; 3031 3032onError: 3033 Py_XDECREF(v); 3034 Py_XDECREF(errorHandler); 3035 Py_XDECREF(exc); 3036 return NULL; 3037} 3038 3039/* Return a Unicode-Escape string version of the Unicode object. 3040 3041 If quotes is true, the string is enclosed in u"" or u'' quotes as 3042 appropriate. 3043 3044*/ 3045 3046Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3047 Py_ssize_t size, 3048 Py_UNICODE ch) 3049{ 3050 /* like wcschr, but doesn't stop at NULL characters */ 3051 3052 while (size-- > 0) { 3053 if (*s == ch) 3054 return s; 3055 s++; 3056 } 3057 3058 return NULL; 3059} 3060 3061static const char *hexdigits = "0123456789abcdef"; 3062 3063PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3064 Py_ssize_t size) 3065{ 3066 PyObject *repr, *result; 3067 char *p; 3068 3069 /* XXX(nnorwitz): rather than over-allocating, it would be 3070 better to choose a different scheme. Perhaps scan the 3071 first N-chars of the string and allocate based on that size. 3072 */ 3073 /* Initial allocation is based on the longest-possible unichr 3074 escape. 3075 3076 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3077 unichr, so in this case it's the longest unichr escape. In 3078 narrow (UTF-16) builds this is five chars per source unichr 3079 since there are two unichrs in the surrogate pair, so in narrow 3080 (UTF-16) builds it's not the longest unichr escape. 3081 3082 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3083 so in the narrow (UTF-16) build case it's the longest unichr 3084 escape. 3085 */ 3086 3087 repr = PyByteArray_FromStringAndSize(NULL, 3088#ifdef Py_UNICODE_WIDE 3089 + 10*size 3090#else 3091 + 6*size 3092#endif 3093 + 1); 3094 if (repr == NULL) 3095 return NULL; 3096 3097 p = PyByteArray_AS_STRING(repr); 3098 3099 while (size-- > 0) { 3100 Py_UNICODE ch = *s++; 3101 3102 /* Escape backslashes */ 3103 if (ch == '\\') { 3104 *p++ = '\\'; 3105 *p++ = (char) ch; 3106 continue; 3107 } 3108 3109#ifdef Py_UNICODE_WIDE 3110 /* Map 21-bit characters to '\U00xxxxxx' */ 3111 else if (ch >= 0x10000) { 3112 *p++ = '\\'; 3113 *p++ = 'U'; 3114 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3115 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3116 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3117 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3118 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3119 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3120 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3121 *p++ = hexdigits[ch & 0x0000000F]; 3122 continue; 3123 } 3124#else 3125 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3126 else if (ch >= 0xD800 && ch < 0xDC00) { 3127 Py_UNICODE ch2; 3128 Py_UCS4 ucs; 3129 3130 ch2 = *s++; 3131 size--; 3132 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3133 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3134 *p++ = '\\'; 3135 *p++ = 'U'; 3136 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3137 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3138 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3139 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3140 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3141 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3142 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3143 *p++ = hexdigits[ucs & 0x0000000F]; 3144 continue; 3145 } 3146 /* Fall through: isolated surrogates are copied as-is */ 3147 s--; 3148 size++; 3149 } 3150#endif 3151 3152 /* Map 16-bit characters to '\uxxxx' */ 3153 if (ch >= 256) { 3154 *p++ = '\\'; 3155 *p++ = 'u'; 3156 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3157 *p++ = hexdigits[(ch >> 8) & 0x000F]; 3158 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3159 *p++ = hexdigits[ch & 0x000F]; 3160 } 3161 3162 /* Map special whitespace to '\t', \n', '\r' */ 3163 else if (ch == '\t') { 3164 *p++ = '\\'; 3165 *p++ = 't'; 3166 } 3167 else if (ch == '\n') { 3168 *p++ = '\\'; 3169 *p++ = 'n'; 3170 } 3171 else if (ch == '\r') { 3172 *p++ = '\\'; 3173 *p++ = 'r'; 3174 } 3175 3176 /* Map non-printable US ASCII to '\xhh' */ 3177 else if (ch < ' ' || ch >= 0x7F) { 3178 *p++ = '\\'; 3179 *p++ = 'x'; 3180 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3181 *p++ = hexdigits[ch & 0x000F]; 3182 } 3183 3184 /* Copy everything else as-is */ 3185 else 3186 *p++ = (char) ch; 3187 } 3188 3189 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), 3190 p - PyByteArray_AS_STRING(repr)); 3191 Py_DECREF(repr); 3192 return result; 3193} 3194 3195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3196{ 3197 PyObject *s, *result; 3198 if (!PyUnicode_Check(unicode)) { 3199 PyErr_BadArgument(); 3200 return NULL; 3201 } 3202 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3203 PyUnicode_GET_SIZE(unicode)); 3204 3205 if (!s) 3206 return NULL; 3207 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3208 PyByteArray_GET_SIZE(s)); 3209 Py_DECREF(s); 3210 return result; 3211} 3212 3213/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3214 3215PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3216 Py_ssize_t size, 3217 const char *errors) 3218{ 3219 const char *starts = s; 3220 Py_ssize_t startinpos; 3221 Py_ssize_t endinpos; 3222 Py_ssize_t outpos; 3223 PyUnicodeObject *v; 3224 Py_UNICODE *p; 3225 const char *end; 3226 const char *bs; 3227 PyObject *errorHandler = NULL; 3228 PyObject *exc = NULL; 3229 3230 /* Escaped strings will always be longer than the resulting 3231 Unicode string, so we start with size here and then reduce the 3232 length after conversion to the true value. (But decoding error 3233 handler might have to resize the string) */ 3234 v = _PyUnicode_New(size); 3235 if (v == NULL) 3236 goto onError; 3237 if (size == 0) 3238 return (PyObject *)v; 3239 p = PyUnicode_AS_UNICODE(v); 3240 end = s + size; 3241 while (s < end) { 3242 unsigned char c; 3243 Py_UCS4 x; 3244 int i; 3245 int count; 3246 3247 /* Non-escape characters are interpreted as Unicode ordinals */ 3248 if (*s != '\\') { 3249 *p++ = (unsigned char)*s++; 3250 continue; 3251 } 3252 startinpos = s-starts; 3253 3254 /* \u-escapes are only interpreted iff the number of leading 3255 backslashes if odd */ 3256 bs = s; 3257 for (;s < end;) { 3258 if (*s != '\\') 3259 break; 3260 *p++ = (unsigned char)*s++; 3261 } 3262 if (((s - bs) & 1) == 0 || 3263 s >= end || 3264 (*s != 'u' && *s != 'U')) { 3265 continue; 3266 } 3267 p--; 3268 count = *s=='u' ? 4 : 8; 3269 s++; 3270 3271 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3272 outpos = p-PyUnicode_AS_UNICODE(v); 3273 for (x = 0, i = 0; i < count; ++i, ++s) { 3274 c = (unsigned char)*s; 3275 if (!ISXDIGIT(c)) { 3276 endinpos = s-starts; 3277 if (unicode_decode_call_errorhandler( 3278 errors, &errorHandler, 3279 "rawunicodeescape", "truncated \\uXXXX", 3280 &starts, &end, &startinpos, &endinpos, &exc, &s, 3281 (PyObject **)&v, &outpos, &p)) 3282 goto onError; 3283 goto nextByte; 3284 } 3285 x = (x<<4) & ~0xF; 3286 if (c >= '0' && c <= '9') 3287 x += c - '0'; 3288 else if (c >= 'a' && c <= 'f') 3289 x += 10 + c - 'a'; 3290 else 3291 x += 10 + c - 'A'; 3292 } 3293 if (x <= 0xffff) 3294 /* UCS-2 character */ 3295 *p++ = (Py_UNICODE) x; 3296 else if (x <= 0x10ffff) { 3297 /* UCS-4 character. Either store directly, or as 3298 surrogate pair. */ 3299#ifdef Py_UNICODE_WIDE 3300 *p++ = (Py_UNICODE) x; 3301#else 3302 x -= 0x10000L; 3303 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3304 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3305#endif 3306 } else { 3307 endinpos = s-starts; 3308 outpos = p-PyUnicode_AS_UNICODE(v); 3309 if (unicode_decode_call_errorhandler( 3310 errors, &errorHandler, 3311 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3312 &starts, &end, &startinpos, &endinpos, &exc, &s, 3313 (PyObject **)&v, &outpos, &p)) 3314 goto onError; 3315 } 3316 nextByte: 3317 ; 3318 } 3319 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3320 goto onError; 3321 Py_XDECREF(errorHandler); 3322 Py_XDECREF(exc); 3323 return (PyObject *)v; 3324 3325 onError: 3326 Py_XDECREF(v); 3327 Py_XDECREF(errorHandler); 3328 Py_XDECREF(exc); 3329 return NULL; 3330} 3331 3332PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3333 Py_ssize_t size) 3334{ 3335 PyObject *repr, *result; 3336 char *p; 3337 char *q; 3338 3339#ifdef Py_UNICODE_WIDE 3340 repr = PyByteArray_FromStringAndSize(NULL, 10 * size); 3341#else 3342 repr = PyByteArray_FromStringAndSize(NULL, 6 * size); 3343#endif 3344 if (repr == NULL) 3345 return NULL; 3346 if (size == 0) 3347 goto done; 3348 3349 p = q = PyByteArray_AS_STRING(repr); 3350 while (size-- > 0) { 3351 Py_UNICODE ch = *s++; 3352#ifdef Py_UNICODE_WIDE 3353 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3354 if (ch >= 0x10000) { 3355 *p++ = '\\'; 3356 *p++ = 'U'; 3357 *p++ = hexdigits[(ch >> 28) & 0xf]; 3358 *p++ = hexdigits[(ch >> 24) & 0xf]; 3359 *p++ = hexdigits[(ch >> 20) & 0xf]; 3360 *p++ = hexdigits[(ch >> 16) & 0xf]; 3361 *p++ = hexdigits[(ch >> 12) & 0xf]; 3362 *p++ = hexdigits[(ch >> 8) & 0xf]; 3363 *p++ = hexdigits[(ch >> 4) & 0xf]; 3364 *p++ = hexdigits[ch & 15]; 3365 } 3366 else 3367#else 3368 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3369 if (ch >= 0xD800 && ch < 0xDC00) { 3370 Py_UNICODE ch2; 3371 Py_UCS4 ucs; 3372 3373 ch2 = *s++; 3374 size--; 3375 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3376 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3377 *p++ = '\\'; 3378 *p++ = 'U'; 3379 *p++ = hexdigits[(ucs >> 28) & 0xf]; 3380 *p++ = hexdigits[(ucs >> 24) & 0xf]; 3381 *p++ = hexdigits[(ucs >> 20) & 0xf]; 3382 *p++ = hexdigits[(ucs >> 16) & 0xf]; 3383 *p++ = hexdigits[(ucs >> 12) & 0xf]; 3384 *p++ = hexdigits[(ucs >> 8) & 0xf]; 3385 *p++ = hexdigits[(ucs >> 4) & 0xf]; 3386 *p++ = hexdigits[ucs & 0xf]; 3387 continue; 3388 } 3389 /* Fall through: isolated surrogates are copied as-is */ 3390 s--; 3391 size++; 3392 } 3393#endif 3394 /* Map 16-bit characters to '\uxxxx' */ 3395 if (ch >= 256) { 3396 *p++ = '\\'; 3397 *p++ = 'u'; 3398 *p++ = hexdigits[(ch >> 12) & 0xf]; 3399 *p++ = hexdigits[(ch >> 8) & 0xf]; 3400 *p++ = hexdigits[(ch >> 4) & 0xf]; 3401 *p++ = hexdigits[ch & 15]; 3402 } 3403 /* Copy everything else as-is */ 3404 else 3405 *p++ = (char) ch; 3406 } 3407 size = p - q; 3408 3409 done: 3410 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); 3411 Py_DECREF(repr); 3412 return result; 3413} 3414 3415PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3416{ 3417 PyObject *s, *result; 3418 if (!PyUnicode_Check(unicode)) { 3419 PyErr_BadArgument(); 3420 return NULL; 3421 } 3422 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3423 PyUnicode_GET_SIZE(unicode)); 3424 3425 if (!s) 3426 return NULL; 3427 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), 3428 PyByteArray_GET_SIZE(s)); 3429 Py_DECREF(s); 3430 return result; 3431} 3432 3433/* --- Unicode Internal Codec ------------------------------------------- */ 3434 3435PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3436 Py_ssize_t size, 3437 const char *errors) 3438{ 3439 const char *starts = s; 3440 Py_ssize_t startinpos; 3441 Py_ssize_t endinpos; 3442 Py_ssize_t outpos; 3443 PyUnicodeObject *v; 3444 Py_UNICODE *p; 3445 const char *end; 3446 const char *reason; 3447 PyObject *errorHandler = NULL; 3448 PyObject *exc = NULL; 3449 3450#ifdef Py_UNICODE_WIDE 3451 Py_UNICODE unimax = PyUnicode_GetMax(); 3452#endif 3453 3454 /* XXX overflow detection missing */ 3455 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3456 if (v == NULL) 3457 goto onError; 3458 if (PyUnicode_GetSize((PyObject *)v) == 0) 3459 return (PyObject *)v; 3460 p = PyUnicode_AS_UNICODE(v); 3461 end = s + size; 3462 3463 while (s < end) { 3464 memcpy(p, s, sizeof(Py_UNICODE)); 3465 /* We have to sanity check the raw data, otherwise doom looms for 3466 some malformed UCS-4 data. */ 3467 if ( 3468 #ifdef Py_UNICODE_WIDE 3469 *p > unimax || *p < 0 || 3470 #endif 3471 end-s < Py_UNICODE_SIZE 3472 ) 3473 { 3474 startinpos = s - starts; 3475 if (end-s < Py_UNICODE_SIZE) { 3476 endinpos = end-starts; 3477 reason = "truncated input"; 3478 } 3479 else { 3480 endinpos = s - starts + Py_UNICODE_SIZE; 3481 reason = "illegal code point (> 0x10FFFF)"; 3482 } 3483 outpos = p - PyUnicode_AS_UNICODE(v); 3484 if (unicode_decode_call_errorhandler( 3485 errors, &errorHandler, 3486 "unicode_internal", reason, 3487 &starts, &end, &startinpos, &endinpos, &exc, &s, 3488 (PyObject **)&v, &outpos, &p)) { 3489 goto onError; 3490 } 3491 } 3492 else { 3493 p++; 3494 s += Py_UNICODE_SIZE; 3495 } 3496 } 3497 3498 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3499 goto onError; 3500 Py_XDECREF(errorHandler); 3501 Py_XDECREF(exc); 3502 return (PyObject *)v; 3503 3504 onError: 3505 Py_XDECREF(v); 3506 Py_XDECREF(errorHandler); 3507 Py_XDECREF(exc); 3508 return NULL; 3509} 3510 3511/* --- Latin-1 Codec ------------------------------------------------------ */ 3512 3513PyObject *PyUnicode_DecodeLatin1(const char *s, 3514 Py_ssize_t size, 3515 const char *errors) 3516{ 3517 PyUnicodeObject *v; 3518 Py_UNICODE *p; 3519 3520 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 3521 if (size == 1) { 3522 Py_UNICODE r = *(unsigned char*)s; 3523 return PyUnicode_FromUnicode(&r, 1); 3524 } 3525 3526 v = _PyUnicode_New(size); 3527 if (v == NULL) 3528 goto onError; 3529 if (size == 0) 3530 return (PyObject *)v; 3531 p = PyUnicode_AS_UNICODE(v); 3532 while (size-- > 0) 3533 *p++ = (unsigned char)*s++; 3534 return (PyObject *)v; 3535 3536 onError: 3537 Py_XDECREF(v); 3538 return NULL; 3539} 3540 3541/* create or adjust a UnicodeEncodeError */ 3542static void make_encode_exception(PyObject **exceptionObject, 3543 const char *encoding, 3544 const Py_UNICODE *unicode, Py_ssize_t size, 3545 Py_ssize_t startpos, Py_ssize_t endpos, 3546 const char *reason) 3547{ 3548 if (*exceptionObject == NULL) { 3549 *exceptionObject = PyUnicodeEncodeError_Create( 3550 encoding, unicode, size, startpos, endpos, reason); 3551 } 3552 else { 3553 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3554 goto onError; 3555 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3556 goto onError; 3557 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3558 goto onError; 3559 return; 3560 onError: 3561 Py_DECREF(*exceptionObject); 3562 *exceptionObject = NULL; 3563 } 3564} 3565 3566/* raises a UnicodeEncodeError */ 3567static void raise_encode_exception(PyObject **exceptionObject, 3568 const char *encoding, 3569 const Py_UNICODE *unicode, Py_ssize_t size, 3570 Py_ssize_t startpos, Py_ssize_t endpos, 3571 const char *reason) 3572{ 3573 make_encode_exception(exceptionObject, 3574 encoding, unicode, size, startpos, endpos, reason); 3575 if (*exceptionObject != NULL) 3576 PyCodec_StrictErrors(*exceptionObject); 3577} 3578 3579/* error handling callback helper: 3580 build arguments, call the callback and check the arguments, 3581 put the result into newpos and return the replacement string, which 3582 has to be freed by the caller */ 3583static PyObject *unicode_encode_call_errorhandler(const char *errors, 3584 PyObject **errorHandler, 3585 const char *encoding, const char *reason, 3586 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3587 Py_ssize_t startpos, Py_ssize_t endpos, 3588 Py_ssize_t *newpos) 3589{ 3590 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3591 3592 PyObject *restuple; 3593 PyObject *resunicode; 3594 3595 if (*errorHandler == NULL) { 3596 *errorHandler = PyCodec_LookupError(errors); 3597 if (*errorHandler == NULL) 3598 return NULL; 3599 } 3600 3601 make_encode_exception(exceptionObject, 3602 encoding, unicode, size, startpos, endpos, reason); 3603 if (*exceptionObject == NULL) 3604 return NULL; 3605 3606 restuple = PyObject_CallFunctionObjArgs( 3607 *errorHandler, *exceptionObject, NULL); 3608 if (restuple == NULL) 3609 return NULL; 3610 if (!PyTuple_Check(restuple)) { 3611 PyErr_Format(PyExc_TypeError, &argparse[4]); 3612 Py_DECREF(restuple); 3613 return NULL; 3614 } 3615 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3616 &resunicode, newpos)) { 3617 Py_DECREF(restuple); 3618 return NULL; 3619 } 3620 if (*newpos<0) 3621 *newpos = size+*newpos; 3622 if (*newpos<0 || *newpos>size) { 3623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3624 Py_DECREF(restuple); 3625 return NULL; 3626 } 3627 Py_INCREF(resunicode); 3628 Py_DECREF(restuple); 3629 return resunicode; 3630} 3631 3632static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3633 Py_ssize_t size, 3634 const char *errors, 3635 int limit) 3636{ 3637 /* output object */ 3638 PyObject *res; 3639 /* pointers to the beginning and end+1 of input */ 3640 const Py_UNICODE *startp = p; 3641 const Py_UNICODE *endp = p + size; 3642 /* pointer to the beginning of the unencodable characters */ 3643 /* const Py_UNICODE *badp = NULL; */ 3644 /* pointer into the output */ 3645 char *str; 3646 /* current output position */ 3647 Py_ssize_t ressize; 3648 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 3649 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3650 PyObject *errorHandler = NULL; 3651 PyObject *exc = NULL; 3652 PyObject *result = NULL; 3653 /* the following variable is used for caching string comparisons 3654 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3655 int known_errorHandler = -1; 3656 3657 /* allocate enough for a simple encoding without 3658 replacements, if we need more, we'll resize */ 3659 if (size == 0) 3660 return PyBytes_FromStringAndSize(NULL, 0); 3661 res = PyByteArray_FromStringAndSize(NULL, size); 3662 if (res == NULL) 3663 return NULL; 3664 str = PyByteArray_AS_STRING(res); 3665 ressize = size; 3666 3667 while (p<endp) { 3668 Py_UNICODE c = *p; 3669 3670 /* can we encode this? */ 3671 if (c<limit) { 3672 /* no overflow check, because we know that the space is enough */ 3673 *str++ = (char)c; 3674 ++p; 3675 } 3676 else { 3677 Py_ssize_t unicodepos = p-startp; 3678 Py_ssize_t requiredsize; 3679 PyObject *repunicode; 3680 Py_ssize_t repsize; 3681 Py_ssize_t newpos; 3682 Py_ssize_t respos; 3683 Py_UNICODE *uni2; 3684 /* startpos for collecting unencodable chars */ 3685 const Py_UNICODE *collstart = p; 3686 const Py_UNICODE *collend = p; 3687 /* find all unecodable characters */ 3688 while ((collend < endp) && ((*collend)>=limit)) 3689 ++collend; 3690 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 3691 if (known_errorHandler==-1) { 3692 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3693 known_errorHandler = 1; 3694 else if (!strcmp(errors, "replace")) 3695 known_errorHandler = 2; 3696 else if (!strcmp(errors, "ignore")) 3697 known_errorHandler = 3; 3698 else if (!strcmp(errors, "xmlcharrefreplace")) 3699 known_errorHandler = 4; 3700 else 3701 known_errorHandler = 0; 3702 } 3703 switch (known_errorHandler) { 3704 case 1: /* strict */ 3705 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3706 goto onError; 3707 case 2: /* replace */ 3708 while (collstart++<collend) 3709 *str++ = '?'; /* fall through */ 3710 case 3: /* ignore */ 3711 p = collend; 3712 break; 3713 case 4: /* xmlcharrefreplace */ 3714 respos = str - PyByteArray_AS_STRING(res); 3715 /* determine replacement size (temporarily (mis)uses p) */ 3716 for (p = collstart, repsize = 0; p < collend; ++p) { 3717 if (*p<10) 3718 repsize += 2+1+1; 3719 else if (*p<100) 3720 repsize += 2+2+1; 3721 else if (*p<1000) 3722 repsize += 2+3+1; 3723 else if (*p<10000) 3724 repsize += 2+4+1; 3725#ifndef Py_UNICODE_WIDE 3726 else 3727 repsize += 2+5+1; 3728#else 3729 else if (*p<100000) 3730 repsize += 2+5+1; 3731 else if (*p<1000000) 3732 repsize += 2+6+1; 3733 else 3734 repsize += 2+7+1; 3735#endif 3736 } 3737 requiredsize = respos+repsize+(endp-collend); 3738 if (requiredsize > ressize) { 3739 if (requiredsize<2*ressize) 3740 requiredsize = 2*ressize; 3741 if (PyByteArray_Resize(res, requiredsize)) 3742 goto onError; 3743 str = PyByteArray_AS_STRING(res) + respos; 3744 ressize = requiredsize; 3745 } 3746 /* generate replacement (temporarily (mis)uses p) */ 3747 for (p = collstart; p < collend; ++p) { 3748 str += sprintf(str, "&#%d;", (int)*p); 3749 } 3750 p = collend; 3751 break; 3752 default: 3753 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3754 encoding, reason, startp, size, &exc, 3755 collstart-startp, collend-startp, &newpos); 3756 if (repunicode == NULL) 3757 goto onError; 3758 /* need more space? (at least enough for what we 3759 have+the replacement+the rest of the string, so 3760 we won't have to check space for encodable characters) */ 3761 respos = str - PyByteArray_AS_STRING(res); 3762 repsize = PyUnicode_GET_SIZE(repunicode); 3763 requiredsize = respos+repsize+(endp-collend); 3764 if (requiredsize > ressize) { 3765 if (requiredsize<2*ressize) 3766 requiredsize = 2*ressize; 3767 if (PyByteArray_Resize(res, requiredsize)) { 3768 Py_DECREF(repunicode); 3769 goto onError; 3770 } 3771 str = PyByteArray_AS_STRING(res) + respos; 3772 ressize = requiredsize; 3773 } 3774 /* check if there is anything unencodable in the replacement 3775 and copy it to the output */ 3776 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3777 c = *uni2; 3778 if (c >= limit) { 3779 raise_encode_exception(&exc, encoding, startp, size, 3780 unicodepos, unicodepos+1, reason); 3781 Py_DECREF(repunicode); 3782 goto onError; 3783 } 3784 *str = (char)c; 3785 } 3786 p = startp + newpos; 3787 Py_DECREF(repunicode); 3788 } 3789 } 3790 } 3791 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res), 3792 str - PyByteArray_AS_STRING(res)); 3793 onError: 3794 Py_DECREF(res); 3795 Py_XDECREF(errorHandler); 3796 Py_XDECREF(exc); 3797 return result; 3798} 3799 3800PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3801 Py_ssize_t size, 3802 const char *errors) 3803{ 3804 return unicode_encode_ucs1(p, size, errors, 256); 3805} 3806 3807PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3808{ 3809 if (!PyUnicode_Check(unicode)) { 3810 PyErr_BadArgument(); 3811 return NULL; 3812 } 3813 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3814 PyUnicode_GET_SIZE(unicode), 3815 NULL); 3816} 3817 3818/* --- 7-bit ASCII Codec -------------------------------------------------- */ 3819 3820PyObject *PyUnicode_DecodeASCII(const char *s, 3821 Py_ssize_t size, 3822 const char *errors) 3823{ 3824 const char *starts = s; 3825 PyUnicodeObject *v; 3826 Py_UNICODE *p; 3827 Py_ssize_t startinpos; 3828 Py_ssize_t endinpos; 3829 Py_ssize_t outpos; 3830 const char *e; 3831 PyObject *errorHandler = NULL; 3832 PyObject *exc = NULL; 3833 3834 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3835 if (size == 1 && *(unsigned char*)s < 128) { 3836 Py_UNICODE r = *(unsigned char*)s; 3837 return PyUnicode_FromUnicode(&r, 1); 3838 } 3839 3840 v = _PyUnicode_New(size); 3841 if (v == NULL) 3842 goto onError; 3843 if (size == 0) 3844 return (PyObject *)v; 3845 p = PyUnicode_AS_UNICODE(v); 3846 e = s + size; 3847 while (s < e) { 3848 register unsigned char c = (unsigned char)*s; 3849 if (c < 128) { 3850 *p++ = c; 3851 ++s; 3852 } 3853 else { 3854 startinpos = s-starts; 3855 endinpos = startinpos + 1; 3856 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3857 if (unicode_decode_call_errorhandler( 3858 errors, &errorHandler, 3859 "ascii", "ordinal not in range(128)", 3860 &starts, &e, &startinpos, &endinpos, &exc, &s, 3861 (PyObject **)&v, &outpos, &p)) 3862 goto onError; 3863 } 3864 } 3865 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3866 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3867 goto onError; 3868 Py_XDECREF(errorHandler); 3869 Py_XDECREF(exc); 3870 return (PyObject *)v; 3871 3872 onError: 3873 Py_XDECREF(v); 3874 Py_XDECREF(errorHandler); 3875 Py_XDECREF(exc); 3876 return NULL; 3877} 3878 3879PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3880 Py_ssize_t size, 3881 const char *errors) 3882{ 3883 return unicode_encode_ucs1(p, size, errors, 128); 3884} 3885 3886PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3887{ 3888 if (!PyUnicode_Check(unicode)) { 3889 PyErr_BadArgument(); 3890 return NULL; 3891 } 3892 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3893 PyUnicode_GET_SIZE(unicode), 3894 NULL); 3895} 3896 3897#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3898 3899/* --- MBCS codecs for Windows -------------------------------------------- */ 3900 3901#if SIZEOF_INT < SIZEOF_SSIZE_T 3902#define NEED_RETRY 3903#endif 3904 3905/* XXX This code is limited to "true" double-byte encodings, as 3906 a) it assumes an incomplete character consists of a single byte, and 3907 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 3908 encodings, see IsDBCSLeadByteEx documentation. */ 3909 3910static int is_dbcs_lead_byte(const char *s, int offset) 3911{ 3912 const char *curr = s + offset; 3913 3914 if (IsDBCSLeadByte(*curr)) { 3915 const char *prev = CharPrev(s, curr); 3916 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3917 } 3918 return 0; 3919} 3920 3921/* 3922 * Decode MBCS string into unicode object. If 'final' is set, converts 3923 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 3924 */ 3925static int decode_mbcs(PyUnicodeObject **v, 3926 const char *s, /* MBCS string */ 3927 int size, /* sizeof MBCS string */ 3928 int final) 3929{ 3930 Py_UNICODE *p; 3931 Py_ssize_t n = 0; 3932 int usize = 0; 3933 3934 assert(size >= 0); 3935 3936 /* Skip trailing lead-byte unless 'final' is set */ 3937 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3938 --size; 3939 3940 /* First get the size of the result */ 3941 if (size > 0) { 3942 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3943 if (usize == 0) { 3944 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3945 return -1; 3946 } 3947 } 3948 3949 if (*v == NULL) { 3950 /* Create unicode object */ 3951 *v = _PyUnicode_New(usize); 3952 if (*v == NULL) 3953 return -1; 3954 } 3955 else { 3956 /* Extend unicode object */ 3957 n = PyUnicode_GET_SIZE(*v); 3958 if (_PyUnicode_Resize(v, n + usize) < 0) 3959 return -1; 3960 } 3961 3962 /* Do the conversion */ 3963 if (size > 0) { 3964 p = PyUnicode_AS_UNICODE(*v) + n; 3965 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3966 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3967 return -1; 3968 } 3969 } 3970 3971 return size; 3972} 3973 3974PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3975 Py_ssize_t size, 3976 const char *errors, 3977 Py_ssize_t *consumed) 3978{ 3979 PyUnicodeObject *v = NULL; 3980 int done; 3981 3982 if (consumed) 3983 *consumed = 0; 3984 3985#ifdef NEED_RETRY 3986 retry: 3987 if (size > INT_MAX) 3988 done = decode_mbcs(&v, s, INT_MAX, 0); 3989 else 3990#endif 3991 done = decode_mbcs(&v, s, (int)size, !consumed); 3992 3993 if (done < 0) { 3994 Py_XDECREF(v); 3995 return NULL; 3996 } 3997 3998 if (consumed) 3999 *consumed += done; 4000 4001#ifdef NEED_RETRY 4002 if (size > INT_MAX) { 4003 s += done; 4004 size -= done; 4005 goto retry; 4006 } 4007#endif 4008 4009 return (PyObject *)v; 4010} 4011 4012PyObject *PyUnicode_DecodeMBCS(const char *s, 4013 Py_ssize_t size, 4014 const char *errors) 4015{ 4016 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4017} 4018 4019/* 4020 * Convert unicode into string object (MBCS). 4021 * Returns 0 if succeed, -1 otherwise. 4022 */ 4023static int encode_mbcs(PyObject **repr, 4024 const Py_UNICODE *p, /* unicode */ 4025 int size) /* size of unicode */ 4026{ 4027 int mbcssize = 0; 4028 Py_ssize_t n = 0; 4029 4030 assert(size >= 0); 4031 4032 /* First get the size of the result */ 4033 if (size > 0) { 4034 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 4035 if (mbcssize == 0) { 4036 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4037 return -1; 4038 } 4039 } 4040 4041 if (*repr == NULL) { 4042 /* Create string object */ 4043 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 4044 if (*repr == NULL) 4045 return -1; 4046 } 4047 else { 4048 /* Extend string object */ 4049 n = PyBytes_Size(*repr); 4050 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 4051 return -1; 4052 } 4053 4054 /* Do the conversion */ 4055 if (size > 0) { 4056 char *s = PyBytes_AS_STRING(*repr) + n; 4057 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 4058 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4059 return -1; 4060 } 4061 } 4062 4063 return 0; 4064} 4065 4066PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4067 Py_ssize_t size, 4068 const char *errors) 4069{ 4070 PyObject *repr = NULL; 4071 int ret; 4072 4073#ifdef NEED_RETRY 4074 retry: 4075 if (size > INT_MAX) 4076 ret = encode_mbcs(&repr, p, INT_MAX); 4077 else 4078#endif 4079 ret = encode_mbcs(&repr, p, (int)size); 4080 4081 if (ret < 0) { 4082 Py_XDECREF(repr); 4083 return NULL; 4084 } 4085 4086#ifdef NEED_RETRY 4087 if (size > INT_MAX) { 4088 p += INT_MAX; 4089 size -= INT_MAX; 4090 goto retry; 4091 } 4092#endif 4093 4094 return repr; 4095} 4096 4097PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4098{ 4099 if (!PyUnicode_Check(unicode)) { 4100 PyErr_BadArgument(); 4101 return NULL; 4102 } 4103 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4104 PyUnicode_GET_SIZE(unicode), 4105 NULL); 4106} 4107 4108#undef NEED_RETRY 4109 4110#endif /* MS_WINDOWS */ 4111 4112/* --- Character Mapping Codec -------------------------------------------- */ 4113 4114PyObject *PyUnicode_DecodeCharmap(const char *s, 4115 Py_ssize_t size, 4116 PyObject *mapping, 4117 const char *errors) 4118{ 4119 const char *starts = s; 4120 Py_ssize_t startinpos; 4121 Py_ssize_t endinpos; 4122 Py_ssize_t outpos; 4123 const char *e; 4124 PyUnicodeObject *v; 4125 Py_UNICODE *p; 4126 Py_ssize_t extrachars = 0; 4127 PyObject *errorHandler = NULL; 4128 PyObject *exc = NULL; 4129 Py_UNICODE *mapstring = NULL; 4130 Py_ssize_t maplen = 0; 4131 4132 /* Default to Latin-1 */ 4133 if (mapping == NULL) 4134 return PyUnicode_DecodeLatin1(s, size, errors); 4135 4136 v = _PyUnicode_New(size); 4137 if (v == NULL) 4138 goto onError; 4139 if (size == 0) 4140 return (PyObject *)v; 4141 p = PyUnicode_AS_UNICODE(v); 4142 e = s + size; 4143 if (PyUnicode_CheckExact(mapping)) { 4144 mapstring = PyUnicode_AS_UNICODE(mapping); 4145 maplen = PyUnicode_GET_SIZE(mapping); 4146 while (s < e) { 4147 unsigned char ch = *s; 4148 Py_UNICODE x = 0xfffe; /* illegal value */ 4149 4150 if (ch < maplen) 4151 x = mapstring[ch]; 4152 4153 if (x == 0xfffe) { 4154 /* undefined mapping */ 4155 outpos = p-PyUnicode_AS_UNICODE(v); 4156 startinpos = s-starts; 4157 endinpos = startinpos+1; 4158 if (unicode_decode_call_errorhandler( 4159 errors, &errorHandler, 4160 "charmap", "character maps to <undefined>", 4161 &starts, &e, &startinpos, &endinpos, &exc, &s, 4162 (PyObject **)&v, &outpos, &p)) { 4163 goto onError; 4164 } 4165 continue; 4166 } 4167 *p++ = x; 4168 ++s; 4169 } 4170 } 4171 else { 4172 while (s < e) { 4173 unsigned char ch = *s; 4174 PyObject *w, *x; 4175 4176 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 4177 w = PyLong_FromLong((long)ch); 4178 if (w == NULL) 4179 goto onError; 4180 x = PyObject_GetItem(mapping, w); 4181 Py_DECREF(w); 4182 if (x == NULL) { 4183 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4184 /* No mapping found means: mapping is undefined. */ 4185 PyErr_Clear(); 4186 x = Py_None; 4187 Py_INCREF(x); 4188 } else 4189 goto onError; 4190 } 4191 4192 /* Apply mapping */ 4193 if (PyLong_Check(x)) { 4194 long value = PyLong_AS_LONG(x); 4195 if (value < 0 || value > 65535) { 4196 PyErr_SetString(PyExc_TypeError, 4197 "character mapping must be in range(65536)"); 4198 Py_DECREF(x); 4199 goto onError; 4200 } 4201 *p++ = (Py_UNICODE)value; 4202 } 4203 else if (x == Py_None) { 4204 /* undefined mapping */ 4205 outpos = p-PyUnicode_AS_UNICODE(v); 4206 startinpos = s-starts; 4207 endinpos = startinpos+1; 4208 if (unicode_decode_call_errorhandler( 4209 errors, &errorHandler, 4210 "charmap", "character maps to <undefined>", 4211 &starts, &e, &startinpos, &endinpos, &exc, &s, 4212 (PyObject **)&v, &outpos, &p)) { 4213 Py_DECREF(x); 4214 goto onError; 4215 } 4216 Py_DECREF(x); 4217 continue; 4218 } 4219 else if (PyUnicode_Check(x)) { 4220 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4221 4222 if (targetsize == 1) 4223 /* 1-1 mapping */ 4224 *p++ = *PyUnicode_AS_UNICODE(x); 4225 4226 else if (targetsize > 1) { 4227 /* 1-n mapping */ 4228 if (targetsize > extrachars) { 4229 /* resize first */ 4230 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4231 Py_ssize_t needed = (targetsize - extrachars) + \ 4232 (targetsize << 2); 4233 extrachars += needed; 4234 /* XXX overflow detection missing */ 4235 if (_PyUnicode_Resize(&v, 4236 PyUnicode_GET_SIZE(v) + needed) < 0) { 4237 Py_DECREF(x); 4238 goto onError; 4239 } 4240 p = PyUnicode_AS_UNICODE(v) + oldpos; 4241 } 4242 Py_UNICODE_COPY(p, 4243 PyUnicode_AS_UNICODE(x), 4244 targetsize); 4245 p += targetsize; 4246 extrachars -= targetsize; 4247 } 4248 /* 1-0 mapping: skip the character */ 4249 } 4250 else { 4251 /* wrong return value */ 4252 PyErr_SetString(PyExc_TypeError, 4253 "character mapping must return integer, None or unicode"); 4254 Py_DECREF(x); 4255 goto onError; 4256 } 4257 Py_DECREF(x); 4258 ++s; 4259 } 4260 } 4261 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4262 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4263 goto onError; 4264 Py_XDECREF(errorHandler); 4265 Py_XDECREF(exc); 4266 return (PyObject *)v; 4267 4268 onError: 4269 Py_XDECREF(errorHandler); 4270 Py_XDECREF(exc); 4271 Py_XDECREF(v); 4272 return NULL; 4273} 4274 4275/* Charmap encoding: the lookup table */ 4276 4277struct encoding_map{ 4278 PyObject_HEAD 4279 unsigned char level1[32]; 4280 int count2, count3; 4281 unsigned char level23[1]; 4282}; 4283 4284static PyObject* 4285encoding_map_size(PyObject *obj, PyObject* args) 4286{ 4287 struct encoding_map *map = (struct encoding_map*)obj; 4288 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 4289 128*map->count3); 4290} 4291 4292static PyMethodDef encoding_map_methods[] = { 4293 {"size", encoding_map_size, METH_NOARGS, 4294 PyDoc_STR("Return the size (in bytes) of this object") }, 4295 { 0 } 4296}; 4297 4298static void 4299encoding_map_dealloc(PyObject* o) 4300{ 4301 PyObject_FREE(o); 4302} 4303 4304static PyTypeObject EncodingMapType = { 4305 PyVarObject_HEAD_INIT(NULL, 0) 4306 "EncodingMap", /*tp_name*/ 4307 sizeof(struct encoding_map), /*tp_basicsize*/ 4308 0, /*tp_itemsize*/ 4309 /* methods */ 4310 encoding_map_dealloc, /*tp_dealloc*/ 4311 0, /*tp_print*/ 4312 0, /*tp_getattr*/ 4313 0, /*tp_setattr*/ 4314 0, /*tp_compare*/ 4315 0, /*tp_repr*/ 4316 0, /*tp_as_number*/ 4317 0, /*tp_as_sequence*/ 4318 0, /*tp_as_mapping*/ 4319 0, /*tp_hash*/ 4320 0, /*tp_call*/ 4321 0, /*tp_str*/ 4322 0, /*tp_getattro*/ 4323 0, /*tp_setattro*/ 4324 0, /*tp_as_buffer*/ 4325 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 4326 0, /*tp_doc*/ 4327 0, /*tp_traverse*/ 4328 0, /*tp_clear*/ 4329 0, /*tp_richcompare*/ 4330 0, /*tp_weaklistoffset*/ 4331 0, /*tp_iter*/ 4332 0, /*tp_iternext*/ 4333 encoding_map_methods, /*tp_methods*/ 4334 0, /*tp_members*/ 4335 0, /*tp_getset*/ 4336 0, /*tp_base*/ 4337 0, /*tp_dict*/ 4338 0, /*tp_descr_get*/ 4339 0, /*tp_descr_set*/ 4340 0, /*tp_dictoffset*/ 4341 0, /*tp_init*/ 4342 0, /*tp_alloc*/ 4343 0, /*tp_new*/ 4344 0, /*tp_free*/ 4345 0, /*tp_is_gc*/ 4346}; 4347 4348PyObject* 4349PyUnicode_BuildEncodingMap(PyObject* string) 4350{ 4351 Py_UNICODE *decode; 4352 PyObject *result; 4353 struct encoding_map *mresult; 4354 int i; 4355 int need_dict = 0; 4356 unsigned char level1[32]; 4357 unsigned char level2[512]; 4358 unsigned char *mlevel1, *mlevel2, *mlevel3; 4359 int count2 = 0, count3 = 0; 4360 4361 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 4362 PyErr_BadArgument(); 4363 return NULL; 4364 } 4365 decode = PyUnicode_AS_UNICODE(string); 4366 memset(level1, 0xFF, sizeof level1); 4367 memset(level2, 0xFF, sizeof level2); 4368 4369 /* If there isn't a one-to-one mapping of NULL to \0, 4370 or if there are non-BMP characters, we need to use 4371 a mapping dictionary. */ 4372 if (decode[0] != 0) 4373 need_dict = 1; 4374 for (i = 1; i < 256; i++) { 4375 int l1, l2; 4376 if (decode[i] == 0 4377 #ifdef Py_UNICODE_WIDE 4378 || decode[i] > 0xFFFF 4379 #endif 4380 ) { 4381 need_dict = 1; 4382 break; 4383 } 4384 if (decode[i] == 0xFFFE) 4385 /* unmapped character */ 4386 continue; 4387 l1 = decode[i] >> 11; 4388 l2 = decode[i] >> 7; 4389 if (level1[l1] == 0xFF) 4390 level1[l1] = count2++; 4391 if (level2[l2] == 0xFF) 4392 level2[l2] = count3++; 4393 } 4394 4395 if (count2 >= 0xFF || count3 >= 0xFF) 4396 need_dict = 1; 4397 4398 if (need_dict) { 4399 PyObject *result = PyDict_New(); 4400 PyObject *key, *value; 4401 if (!result) 4402 return NULL; 4403 for (i = 0; i < 256; i++) { 4404 key = value = NULL; 4405 key = PyLong_FromLong(decode[i]); 4406 value = PyLong_FromLong(i); 4407 if (!key || !value) 4408 goto failed1; 4409 if (PyDict_SetItem(result, key, value) == -1) 4410 goto failed1; 4411 Py_DECREF(key); 4412 Py_DECREF(value); 4413 } 4414 return result; 4415 failed1: 4416 Py_XDECREF(key); 4417 Py_XDECREF(value); 4418 Py_DECREF(result); 4419 return NULL; 4420 } 4421 4422 /* Create a three-level trie */ 4423 result = PyObject_MALLOC(sizeof(struct encoding_map) + 4424 16*count2 + 128*count3 - 1); 4425 if (!result) 4426 return PyErr_NoMemory(); 4427 PyObject_Init(result, &EncodingMapType); 4428 mresult = (struct encoding_map*)result; 4429 mresult->count2 = count2; 4430 mresult->count3 = count3; 4431 mlevel1 = mresult->level1; 4432 mlevel2 = mresult->level23; 4433 mlevel3 = mresult->level23 + 16*count2; 4434 memcpy(mlevel1, level1, 32); 4435 memset(mlevel2, 0xFF, 16*count2); 4436 memset(mlevel3, 0, 128*count3); 4437 count3 = 0; 4438 for (i = 1; i < 256; i++) { 4439 int o1, o2, o3, i2, i3; 4440 if (decode[i] == 0xFFFE) 4441 /* unmapped character */ 4442 continue; 4443 o1 = decode[i]>>11; 4444 o2 = (decode[i]>>7) & 0xF; 4445 i2 = 16*mlevel1[o1] + o2; 4446 if (mlevel2[i2] == 0xFF) 4447 mlevel2[i2] = count3++; 4448 o3 = decode[i] & 0x7F; 4449 i3 = 128*mlevel2[i2] + o3; 4450 mlevel3[i3] = i; 4451 } 4452 return result; 4453} 4454 4455static int 4456encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 4457{ 4458 struct encoding_map *map = (struct encoding_map*)mapping; 4459 int l1 = c>>11; 4460 int l2 = (c>>7) & 0xF; 4461 int l3 = c & 0x7F; 4462 int i; 4463 4464#ifdef Py_UNICODE_WIDE 4465 if (c > 0xFFFF) { 4466 return -1; 4467 } 4468#endif 4469 if (c == 0) 4470 return 0; 4471 /* level 1*/ 4472 i = map->level1[l1]; 4473 if (i == 0xFF) { 4474 return -1; 4475 } 4476 /* level 2*/ 4477 i = map->level23[16*i+l2]; 4478 if (i == 0xFF) { 4479 return -1; 4480 } 4481 /* level 3 */ 4482 i = map->level23[16*map->count2 + 128*i + l3]; 4483 if (i == 0) { 4484 return -1; 4485 } 4486 return i; 4487} 4488 4489/* Lookup the character ch in the mapping. If the character 4490 can't be found, Py_None is returned (or NULL, if another 4491 error occurred). */ 4492static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 4493{ 4494 PyObject *w = PyLong_FromLong((long)c); 4495 PyObject *x; 4496 4497 if (w == NULL) 4498 return NULL; 4499 x = PyObject_GetItem(mapping, w); 4500 Py_DECREF(w); 4501 if (x == NULL) { 4502 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4503 /* No mapping found means: mapping is undefined. */ 4504 PyErr_Clear(); 4505 x = Py_None; 4506 Py_INCREF(x); 4507 return x; 4508 } else 4509 return NULL; 4510 } 4511 else if (x == Py_None) 4512 return x; 4513 else if (PyLong_Check(x)) { 4514 long value = PyLong_AS_LONG(x); 4515 if (value < 0 || value > 255) { 4516 PyErr_SetString(PyExc_TypeError, 4517 "character mapping must be in range(256)"); 4518 Py_DECREF(x); 4519 return NULL; 4520 } 4521 return x; 4522 } 4523 else if (PyBytes_Check(x)) 4524 return x; 4525 else { 4526 /* wrong return value */ 4527 PyErr_Format(PyExc_TypeError, 4528 "character mapping must return integer, bytes or None, not %.400s", 4529 x->ob_type->tp_name); 4530 Py_DECREF(x); 4531 return NULL; 4532 } 4533} 4534 4535static int 4536charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 4537{ 4538 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4539 /* exponentially overallocate to minimize reallocations */ 4540 if (requiredsize < 2*outsize) 4541 requiredsize = 2*outsize; 4542 if (_PyBytes_Resize(outobj, requiredsize)) 4543 return -1; 4544 return 0; 4545} 4546 4547typedef enum charmapencode_result { 4548 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 4549}charmapencode_result; 4550/* lookup the character, put the result in the output string and adjust 4551 various state variables. Resize the output bytes object if not enough 4552 space is available. Return a new reference to the object that 4553 was put in the output buffer, or Py_None, if the mapping was undefined 4554 (in which case no character was written) or NULL, if a 4555 reallocation error occurred. The caller must decref the result */ 4556static 4557charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 4558 PyObject **outobj, Py_ssize_t *outpos) 4559{ 4560 PyObject *rep; 4561 char *outstart; 4562 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 4563 4564 if (Py_TYPE(mapping) == &EncodingMapType) { 4565 int res = encoding_map_lookup(c, mapping); 4566 Py_ssize_t requiredsize = *outpos+1; 4567 if (res == -1) 4568 return enc_FAILED; 4569 if (outsize<requiredsize) 4570 if (charmapencode_resize(outobj, outpos, requiredsize)) 4571 return enc_EXCEPTION; 4572 outstart = PyBytes_AS_STRING(*outobj); 4573 outstart[(*outpos)++] = (char)res; 4574 return enc_SUCCESS; 4575 } 4576 4577 rep = charmapencode_lookup(c, mapping); 4578 if (rep==NULL) 4579 return enc_EXCEPTION; 4580 else if (rep==Py_None) { 4581 Py_DECREF(rep); 4582 return enc_FAILED; 4583 } else { 4584 if (PyLong_Check(rep)) { 4585 Py_ssize_t requiredsize = *outpos+1; 4586 if (outsize<requiredsize) 4587 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4588 Py_DECREF(rep); 4589 return enc_EXCEPTION; 4590 } 4591 outstart = PyBytes_AS_STRING(*outobj); 4592 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 4593 } 4594 else { 4595 const char *repchars = PyBytes_AS_STRING(rep); 4596 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 4597 Py_ssize_t requiredsize = *outpos+repsize; 4598 if (outsize<requiredsize) 4599 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4600 Py_DECREF(rep); 4601 return enc_EXCEPTION; 4602 } 4603 outstart = PyBytes_AS_STRING(*outobj); 4604 memcpy(outstart + *outpos, repchars, repsize); 4605 *outpos += repsize; 4606 } 4607 } 4608 Py_DECREF(rep); 4609 return enc_SUCCESS; 4610} 4611 4612/* handle an error in PyUnicode_EncodeCharmap 4613 Return 0 on success, -1 on error */ 4614static 4615int charmap_encoding_error( 4616 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4617 PyObject **exceptionObject, 4618 int *known_errorHandler, PyObject **errorHandler, const char *errors, 4619 PyObject **res, Py_ssize_t *respos) 4620{ 4621 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4622 Py_ssize_t repsize; 4623 Py_ssize_t newpos; 4624 Py_UNICODE *uni2; 4625 /* startpos for collecting unencodable chars */ 4626 Py_ssize_t collstartpos = *inpos; 4627 Py_ssize_t collendpos = *inpos+1; 4628 Py_ssize_t collpos; 4629 char *encoding = "charmap"; 4630 char *reason = "character maps to <undefined>"; 4631 charmapencode_result x; 4632 4633 /* find all unencodable characters */ 4634 while (collendpos < size) { 4635 PyObject *rep; 4636 if (Py_TYPE(mapping) == &EncodingMapType) { 4637 int res = encoding_map_lookup(p[collendpos], mapping); 4638 if (res != -1) 4639 break; 4640 ++collendpos; 4641 continue; 4642 } 4643 4644 rep = charmapencode_lookup(p[collendpos], mapping); 4645 if (rep==NULL) 4646 return -1; 4647 else if (rep!=Py_None) { 4648 Py_DECREF(rep); 4649 break; 4650 } 4651 Py_DECREF(rep); 4652 ++collendpos; 4653 } 4654 /* cache callback name lookup 4655 * (if not done yet, i.e. it's the first error) */ 4656 if (*known_errorHandler==-1) { 4657 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4658 *known_errorHandler = 1; 4659 else if (!strcmp(errors, "replace")) 4660 *known_errorHandler = 2; 4661 else if (!strcmp(errors, "ignore")) 4662 *known_errorHandler = 3; 4663 else if (!strcmp(errors, "xmlcharrefreplace")) 4664 *known_errorHandler = 4; 4665 else 4666 *known_errorHandler = 0; 4667 } 4668 switch (*known_errorHandler) { 4669 case 1: /* strict */ 4670 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4671 return -1; 4672 case 2: /* replace */ 4673 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4674 x = charmapencode_output('?', mapping, res, respos); 4675 if (x==enc_EXCEPTION) { 4676 return -1; 4677 } 4678 else if (x==enc_FAILED) { 4679 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4680 return -1; 4681 } 4682 } 4683 /* fall through */ 4684 case 3: /* ignore */ 4685 *inpos = collendpos; 4686 break; 4687 case 4: /* xmlcharrefreplace */ 4688 /* generate replacement (temporarily (mis)uses p) */ 4689 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4690 char buffer[2+29+1+1]; 4691 char *cp; 4692 sprintf(buffer, "&#%d;", (int)p[collpos]); 4693 for (cp = buffer; *cp; ++cp) { 4694 x = charmapencode_output(*cp, mapping, res, respos); 4695 if (x==enc_EXCEPTION) 4696 return -1; 4697 else if (x==enc_FAILED) { 4698 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4699 return -1; 4700 } 4701 } 4702 } 4703 *inpos = collendpos; 4704 break; 4705 default: 4706 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4707 encoding, reason, p, size, exceptionObject, 4708 collstartpos, collendpos, &newpos); 4709 if (repunicode == NULL) 4710 return -1; 4711 /* generate replacement */ 4712 repsize = PyUnicode_GET_SIZE(repunicode); 4713 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4714 x = charmapencode_output(*uni2, mapping, res, respos); 4715 if (x==enc_EXCEPTION) { 4716 return -1; 4717 } 4718 else if (x==enc_FAILED) { 4719 Py_DECREF(repunicode); 4720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4721 return -1; 4722 } 4723 } 4724 *inpos = newpos; 4725 Py_DECREF(repunicode); 4726 } 4727 return 0; 4728} 4729 4730PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4731 Py_ssize_t size, 4732 PyObject *mapping, 4733 const char *errors) 4734{ 4735 /* output object */ 4736 PyObject *res = NULL; 4737 /* current input position */ 4738 Py_ssize_t inpos = 0; 4739 /* current output position */ 4740 Py_ssize_t respos = 0; 4741 PyObject *errorHandler = NULL; 4742 PyObject *exc = NULL; 4743 /* the following variable is used for caching string comparisons 4744 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4745 * 3=ignore, 4=xmlcharrefreplace */ 4746 int known_errorHandler = -1; 4747 4748 /* Default to Latin-1 */ 4749 if (mapping == NULL) 4750 return PyUnicode_EncodeLatin1(p, size, errors); 4751 4752 /* allocate enough for a simple encoding without 4753 replacements, if we need more, we'll resize */ 4754 res = PyBytes_FromStringAndSize(NULL, size); 4755 if (res == NULL) 4756 goto onError; 4757 if (size == 0) 4758 return res; 4759 4760 while (inpos<size) { 4761 /* try to encode it */ 4762 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 4763 if (x==enc_EXCEPTION) /* error */ 4764 goto onError; 4765 if (x==enc_FAILED) { /* unencodable character */ 4766 if (charmap_encoding_error(p, size, &inpos, mapping, 4767 &exc, 4768 &known_errorHandler, &errorHandler, errors, 4769 &res, &respos)) { 4770 goto onError; 4771 } 4772 } 4773 else 4774 /* done with this character => adjust input position */ 4775 ++inpos; 4776 } 4777 4778 /* Resize if we allocated to much */ 4779 if (respos<PyBytes_GET_SIZE(res)) 4780 _PyBytes_Resize(&res, respos); 4781 4782 Py_XDECREF(exc); 4783 Py_XDECREF(errorHandler); 4784 return res; 4785 4786 onError: 4787 Py_XDECREF(res); 4788 Py_XDECREF(exc); 4789 Py_XDECREF(errorHandler); 4790 return NULL; 4791} 4792 4793PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4794 PyObject *mapping) 4795{ 4796 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4797 PyErr_BadArgument(); 4798 return NULL; 4799 } 4800 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4801 PyUnicode_GET_SIZE(unicode), 4802 mapping, 4803 NULL); 4804} 4805 4806/* create or adjust a UnicodeTranslateError */ 4807static void make_translate_exception(PyObject **exceptionObject, 4808 const Py_UNICODE *unicode, Py_ssize_t size, 4809 Py_ssize_t startpos, Py_ssize_t endpos, 4810 const char *reason) 4811{ 4812 if (*exceptionObject == NULL) { 4813 *exceptionObject = PyUnicodeTranslateError_Create( 4814 unicode, size, startpos, endpos, reason); 4815 } 4816 else { 4817 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4818 goto onError; 4819 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4820 goto onError; 4821 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4822 goto onError; 4823 return; 4824 onError: 4825 Py_DECREF(*exceptionObject); 4826 *exceptionObject = NULL; 4827 } 4828} 4829 4830/* raises a UnicodeTranslateError */ 4831static void raise_translate_exception(PyObject **exceptionObject, 4832 const Py_UNICODE *unicode, Py_ssize_t size, 4833 Py_ssize_t startpos, Py_ssize_t endpos, 4834 const char *reason) 4835{ 4836 make_translate_exception(exceptionObject, 4837 unicode, size, startpos, endpos, reason); 4838 if (*exceptionObject != NULL) 4839 PyCodec_StrictErrors(*exceptionObject); 4840} 4841 4842/* error handling callback helper: 4843 build arguments, call the callback and check the arguments, 4844 put the result into newpos and return the replacement string, which 4845 has to be freed by the caller */ 4846static PyObject *unicode_translate_call_errorhandler(const char *errors, 4847 PyObject **errorHandler, 4848 const char *reason, 4849 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4850 Py_ssize_t startpos, Py_ssize_t endpos, 4851 Py_ssize_t *newpos) 4852{ 4853 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4854 4855 Py_ssize_t i_newpos; 4856 PyObject *restuple; 4857 PyObject *resunicode; 4858 4859 if (*errorHandler == NULL) { 4860 *errorHandler = PyCodec_LookupError(errors); 4861 if (*errorHandler == NULL) 4862 return NULL; 4863 } 4864 4865 make_translate_exception(exceptionObject, 4866 unicode, size, startpos, endpos, reason); 4867 if (*exceptionObject == NULL) 4868 return NULL; 4869 4870 restuple = PyObject_CallFunctionObjArgs( 4871 *errorHandler, *exceptionObject, NULL); 4872 if (restuple == NULL) 4873 return NULL; 4874 if (!PyTuple_Check(restuple)) { 4875 PyErr_Format(PyExc_TypeError, &argparse[4]); 4876 Py_DECREF(restuple); 4877 return NULL; 4878 } 4879 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4880 &resunicode, &i_newpos)) { 4881 Py_DECREF(restuple); 4882 return NULL; 4883 } 4884 if (i_newpos<0) 4885 *newpos = size+i_newpos; 4886 else 4887 *newpos = i_newpos; 4888 if (*newpos<0 || *newpos>size) { 4889 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4890 Py_DECREF(restuple); 4891 return NULL; 4892 } 4893 Py_INCREF(resunicode); 4894 Py_DECREF(restuple); 4895 return resunicode; 4896} 4897 4898/* Lookup the character ch in the mapping and put the result in result, 4899 which must be decrefed by the caller. 4900 Return 0 on success, -1 on error */ 4901static 4902int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4903{ 4904 PyObject *w = PyLong_FromLong((long)c); 4905 PyObject *x; 4906 4907 if (w == NULL) 4908 return -1; 4909 x = PyObject_GetItem(mapping, w); 4910 Py_DECREF(w); 4911 if (x == NULL) { 4912 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4913 /* No mapping found means: use 1:1 mapping. */ 4914 PyErr_Clear(); 4915 *result = NULL; 4916 return 0; 4917 } else 4918 return -1; 4919 } 4920 else if (x == Py_None) { 4921 *result = x; 4922 return 0; 4923 } 4924 else if (PyLong_Check(x)) { 4925 long value = PyLong_AS_LONG(x); 4926 long max = PyUnicode_GetMax(); 4927 if (value < 0 || value > max) { 4928 PyErr_Format(PyExc_TypeError, 4929 "character mapping must be in range(0x%x)", max+1); 4930 Py_DECREF(x); 4931 return -1; 4932 } 4933 *result = x; 4934 return 0; 4935 } 4936 else if (PyUnicode_Check(x)) { 4937 *result = x; 4938 return 0; 4939 } 4940 else { 4941 /* wrong return value */ 4942 PyErr_SetString(PyExc_TypeError, 4943 "character mapping must return integer, None or unicode"); 4944 Py_DECREF(x); 4945 return -1; 4946 } 4947} 4948/* ensure that *outobj is at least requiredsize characters long, 4949if not reallocate and adjust various state variables. 4950Return 0 on success, -1 on error */ 4951static 4952int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4953 Py_ssize_t requiredsize) 4954{ 4955 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4956 if (requiredsize > oldsize) { 4957 /* remember old output position */ 4958 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4959 /* exponentially overallocate to minimize reallocations */ 4960 if (requiredsize < 2 * oldsize) 4961 requiredsize = 2 * oldsize; 4962 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 4963 return -1; 4964 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4965 } 4966 return 0; 4967} 4968/* lookup the character, put the result in the output string and adjust 4969 various state variables. Return a new reference to the object that 4970 was put in the output buffer in *result, or Py_None, if the mapping was 4971 undefined (in which case no character was written). 4972 The called must decref result. 4973 Return 0 on success, -1 on error. */ 4974static 4975int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 4976 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 4977 PyObject **res) 4978{ 4979 if (charmaptranslate_lookup(*curinp, mapping, res)) 4980 return -1; 4981 if (*res==NULL) { 4982 /* not found => default to 1:1 mapping */ 4983 *(*outp)++ = *curinp; 4984 } 4985 else if (*res==Py_None) 4986 ; 4987 else if (PyLong_Check(*res)) { 4988 /* no overflow check, because we know that the space is enough */ 4989 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 4990 } 4991 else if (PyUnicode_Check(*res)) { 4992 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 4993 if (repsize==1) { 4994 /* no overflow check, because we know that the space is enough */ 4995 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 4996 } 4997 else if (repsize!=0) { 4998 /* more than one character */ 4999 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5000 (insize - (curinp-startinp)) + 5001 repsize - 1; 5002 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5003 return -1; 5004 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5005 *outp += repsize; 5006 } 5007 } 5008 else 5009 return -1; 5010 return 0; 5011} 5012 5013PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5014 Py_ssize_t size, 5015 PyObject *mapping, 5016 const char *errors) 5017{ 5018 /* output object */ 5019 PyObject *res = NULL; 5020 /* pointers to the beginning and end+1 of input */ 5021 const Py_UNICODE *startp = p; 5022 const Py_UNICODE *endp = p + size; 5023 /* pointer into the output */ 5024 Py_UNICODE *str; 5025 /* current output position */ 5026 Py_ssize_t respos = 0; 5027 char *reason = "character maps to <undefined>"; 5028 PyObject *errorHandler = NULL; 5029 PyObject *exc = NULL; 5030 /* the following variable is used for caching string comparisons 5031 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5032 * 3=ignore, 4=xmlcharrefreplace */ 5033 int known_errorHandler = -1; 5034 5035 if (mapping == NULL) { 5036 PyErr_BadArgument(); 5037 return NULL; 5038 } 5039 5040 /* allocate enough for a simple 1:1 translation without 5041 replacements, if we need more, we'll resize */ 5042 res = PyUnicode_FromUnicode(NULL, size); 5043 if (res == NULL) 5044 goto onError; 5045 if (size == 0) 5046 return res; 5047 str = PyUnicode_AS_UNICODE(res); 5048 5049 while (p<endp) { 5050 /* try to encode it */ 5051 PyObject *x = NULL; 5052 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5053 Py_XDECREF(x); 5054 goto onError; 5055 } 5056 Py_XDECREF(x); 5057 if (x!=Py_None) /* it worked => adjust input pointer */ 5058 ++p; 5059 else { /* untranslatable character */ 5060 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5061 Py_ssize_t repsize; 5062 Py_ssize_t newpos; 5063 Py_UNICODE *uni2; 5064 /* startpos for collecting untranslatable chars */ 5065 const Py_UNICODE *collstart = p; 5066 const Py_UNICODE *collend = p+1; 5067 const Py_UNICODE *coll; 5068 5069 /* find all untranslatable characters */ 5070 while (collend < endp) { 5071 if (charmaptranslate_lookup(*collend, mapping, &x)) 5072 goto onError; 5073 Py_XDECREF(x); 5074 if (x!=Py_None) 5075 break; 5076 ++collend; 5077 } 5078 /* cache callback name lookup 5079 * (if not done yet, i.e. it's the first error) */ 5080 if (known_errorHandler==-1) { 5081 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5082 known_errorHandler = 1; 5083 else if (!strcmp(errors, "replace")) 5084 known_errorHandler = 2; 5085 else if (!strcmp(errors, "ignore")) 5086 known_errorHandler = 3; 5087 else if (!strcmp(errors, "xmlcharrefreplace")) 5088 known_errorHandler = 4; 5089 else 5090 known_errorHandler = 0; 5091 } 5092 switch (known_errorHandler) { 5093 case 1: /* strict */ 5094 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5095 goto onError; 5096 case 2: /* replace */ 5097 /* No need to check for space, this is a 1:1 replacement */ 5098 for (coll = collstart; coll<collend; ++coll) 5099 *str++ = '?'; 5100 /* fall through */ 5101 case 3: /* ignore */ 5102 p = collend; 5103 break; 5104 case 4: /* xmlcharrefreplace */ 5105 /* generate replacement (temporarily (mis)uses p) */ 5106 for (p = collstart; p < collend; ++p) { 5107 char buffer[2+29+1+1]; 5108 char *cp; 5109 sprintf(buffer, "&#%d;", (int)*p); 5110 if (charmaptranslate_makespace(&res, &str, 5111 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5112 goto onError; 5113 for (cp = buffer; *cp; ++cp) 5114 *str++ = *cp; 5115 } 5116 p = collend; 5117 break; 5118 default: 5119 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5120 reason, startp, size, &exc, 5121 collstart-startp, collend-startp, &newpos); 5122 if (repunicode == NULL) 5123 goto onError; 5124 /* generate replacement */ 5125 repsize = PyUnicode_GET_SIZE(repunicode); 5126 if (charmaptranslate_makespace(&res, &str, 5127 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5128 Py_DECREF(repunicode); 5129 goto onError; 5130 } 5131 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5132 *str++ = *uni2; 5133 p = startp + newpos; 5134 Py_DECREF(repunicode); 5135 } 5136 } 5137 } 5138 /* Resize if we allocated to much */ 5139 respos = str-PyUnicode_AS_UNICODE(res); 5140 if (respos<PyUnicode_GET_SIZE(res)) { 5141 if (_PyUnicode_Resize(&res, respos) < 0) 5142 goto onError; 5143 } 5144 Py_XDECREF(exc); 5145 Py_XDECREF(errorHandler); 5146 return res; 5147 5148 onError: 5149 Py_XDECREF(res); 5150 Py_XDECREF(exc); 5151 Py_XDECREF(errorHandler); 5152 return NULL; 5153} 5154 5155PyObject *PyUnicode_Translate(PyObject *str, 5156 PyObject *mapping, 5157 const char *errors) 5158{ 5159 PyObject *result; 5160 5161 str = PyUnicode_FromObject(str); 5162 if (str == NULL) 5163 goto onError; 5164 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5165 PyUnicode_GET_SIZE(str), 5166 mapping, 5167 errors); 5168 Py_DECREF(str); 5169 return result; 5170 5171 onError: 5172 Py_XDECREF(str); 5173 return NULL; 5174} 5175 5176/* --- Decimal Encoder ---------------------------------------------------- */ 5177 5178int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5179 Py_ssize_t length, 5180 char *output, 5181 const char *errors) 5182{ 5183 Py_UNICODE *p, *end; 5184 PyObject *errorHandler = NULL; 5185 PyObject *exc = NULL; 5186 const char *encoding = "decimal"; 5187 const char *reason = "invalid decimal Unicode string"; 5188 /* the following variable is used for caching string comparisons 5189 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5190 int known_errorHandler = -1; 5191 5192 if (output == NULL) { 5193 PyErr_BadArgument(); 5194 return -1; 5195 } 5196 5197 p = s; 5198 end = s + length; 5199 while (p < end) { 5200 register Py_UNICODE ch = *p; 5201 int decimal; 5202 PyObject *repunicode; 5203 Py_ssize_t repsize; 5204 Py_ssize_t newpos; 5205 Py_UNICODE *uni2; 5206 Py_UNICODE *collstart; 5207 Py_UNICODE *collend; 5208 5209 if (Py_UNICODE_ISSPACE(ch)) { 5210 *output++ = ' '; 5211 ++p; 5212 continue; 5213 } 5214 decimal = Py_UNICODE_TODECIMAL(ch); 5215 if (decimal >= 0) { 5216 *output++ = '0' + decimal; 5217 ++p; 5218 continue; 5219 } 5220 if (0 < ch && ch < 256) { 5221 *output++ = (char)ch; 5222 ++p; 5223 continue; 5224 } 5225 /* All other characters are considered unencodable */ 5226 collstart = p; 5227 collend = p+1; 5228 while (collend < end) { 5229 if ((0 < *collend && *collend < 256) || 5230 !Py_UNICODE_ISSPACE(*collend) || 5231 Py_UNICODE_TODECIMAL(*collend)) 5232 break; 5233 } 5234 /* cache callback name lookup 5235 * (if not done yet, i.e. it's the first error) */ 5236 if (known_errorHandler==-1) { 5237 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5238 known_errorHandler = 1; 5239 else if (!strcmp(errors, "replace")) 5240 known_errorHandler = 2; 5241 else if (!strcmp(errors, "ignore")) 5242 known_errorHandler = 3; 5243 else if (!strcmp(errors, "xmlcharrefreplace")) 5244 known_errorHandler = 4; 5245 else 5246 known_errorHandler = 0; 5247 } 5248 switch (known_errorHandler) { 5249 case 1: /* strict */ 5250 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5251 goto onError; 5252 case 2: /* replace */ 5253 for (p = collstart; p < collend; ++p) 5254 *output++ = '?'; 5255 /* fall through */ 5256 case 3: /* ignore */ 5257 p = collend; 5258 break; 5259 case 4: /* xmlcharrefreplace */ 5260 /* generate replacement (temporarily (mis)uses p) */ 5261 for (p = collstart; p < collend; ++p) 5262 output += sprintf(output, "&#%d;", (int)*p); 5263 p = collend; 5264 break; 5265 default: 5266 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 5267 encoding, reason, s, length, &exc, 5268 collstart-s, collend-s, &newpos); 5269 if (repunicode == NULL) 5270 goto onError; 5271 /* generate replacement */ 5272 repsize = PyUnicode_GET_SIZE(repunicode); 5273 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5274 Py_UNICODE ch = *uni2; 5275 if (Py_UNICODE_ISSPACE(ch)) 5276 *output++ = ' '; 5277 else { 5278 decimal = Py_UNICODE_TODECIMAL(ch); 5279 if (decimal >= 0) 5280 *output++ = '0' + decimal; 5281 else if (0 < ch && ch < 256) 5282 *output++ = (char)ch; 5283 else { 5284 Py_DECREF(repunicode); 5285 raise_encode_exception(&exc, encoding, 5286 s, length, collstart-s, collend-s, reason); 5287 goto onError; 5288 } 5289 } 5290 } 5291 p = s + newpos; 5292 Py_DECREF(repunicode); 5293 } 5294 } 5295 /* 0-terminate the output string */ 5296 *output++ = '\0'; 5297 Py_XDECREF(exc); 5298 Py_XDECREF(errorHandler); 5299 return 0; 5300 5301 onError: 5302 Py_XDECREF(exc); 5303 Py_XDECREF(errorHandler); 5304 return -1; 5305} 5306 5307/* --- Helpers ------------------------------------------------------------ */ 5308 5309#include "stringlib/unicodedefs.h" 5310#include "stringlib/fastsearch.h" 5311#include "stringlib/count.h" 5312/* Include _ParseTupleFinds from find.h */ 5313#define FROM_UNICODE 5314#include "stringlib/find.h" 5315#include "stringlib/partition.h" 5316 5317#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 5318#include "stringlib/localeutil.h" 5319 5320/* helper macro to fixup start/end slice values */ 5321#define FIX_START_END(obj) \ 5322 if (start < 0) \ 5323 start += (obj)->length; \ 5324 if (start < 0) \ 5325 start = 0; \ 5326 if (end > (obj)->length) \ 5327 end = (obj)->length; \ 5328 if (end < 0) \ 5329 end += (obj)->length; \ 5330 if (end < 0) \ 5331 end = 0; 5332 5333Py_ssize_t PyUnicode_Count(PyObject *str, 5334 PyObject *substr, 5335 Py_ssize_t start, 5336 Py_ssize_t end) 5337{ 5338 Py_ssize_t result; 5339 PyUnicodeObject* str_obj; 5340 PyUnicodeObject* sub_obj; 5341 5342 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 5343 if (!str_obj) 5344 return -1; 5345 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 5346 if (!sub_obj) { 5347 Py_DECREF(str_obj); 5348 return -1; 5349 } 5350 5351 FIX_START_END(str_obj); 5352 5353 result = stringlib_count( 5354 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5355 ); 5356 5357 Py_DECREF(sub_obj); 5358 Py_DECREF(str_obj); 5359 5360 return result; 5361} 5362 5363Py_ssize_t PyUnicode_Find(PyObject *str, 5364 PyObject *sub, 5365 Py_ssize_t start, 5366 Py_ssize_t end, 5367 int direction) 5368{ 5369 Py_ssize_t result; 5370 5371 str = PyUnicode_FromObject(str); 5372 if (!str) 5373 return -2; 5374 sub = PyUnicode_FromObject(sub); 5375 if (!sub) { 5376 Py_DECREF(str); 5377 return -2; 5378 } 5379 5380 if (direction > 0) 5381 result = stringlib_find_slice( 5382 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5383 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5384 start, end 5385 ); 5386 else 5387 result = stringlib_rfind_slice( 5388 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 5389 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 5390 start, end 5391 ); 5392 5393 Py_DECREF(str); 5394 Py_DECREF(sub); 5395 5396 return result; 5397} 5398 5399static 5400int tailmatch(PyUnicodeObject *self, 5401 PyUnicodeObject *substring, 5402 Py_ssize_t start, 5403 Py_ssize_t end, 5404 int direction) 5405{ 5406 if (substring->length == 0) 5407 return 1; 5408 5409 FIX_START_END(self); 5410 5411 end -= substring->length; 5412 if (end < start) 5413 return 0; 5414 5415 if (direction > 0) { 5416 if (Py_UNICODE_MATCH(self, end, substring)) 5417 return 1; 5418 } else { 5419 if (Py_UNICODE_MATCH(self, start, substring)) 5420 return 1; 5421 } 5422 5423 return 0; 5424} 5425 5426Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 5427 PyObject *substr, 5428 Py_ssize_t start, 5429 Py_ssize_t end, 5430 int direction) 5431{ 5432 Py_ssize_t result; 5433 5434 str = PyUnicode_FromObject(str); 5435 if (str == NULL) 5436 return -1; 5437 substr = PyUnicode_FromObject(substr); 5438 if (substr == NULL) { 5439 Py_DECREF(str); 5440 return -1; 5441 } 5442 5443 result = tailmatch((PyUnicodeObject *)str, 5444 (PyUnicodeObject *)substr, 5445 start, end, direction); 5446 Py_DECREF(str); 5447 Py_DECREF(substr); 5448 return result; 5449} 5450 5451/* Apply fixfct filter to the Unicode object self and return a 5452 reference to the modified object */ 5453 5454static 5455PyObject *fixup(PyUnicodeObject *self, 5456 int (*fixfct)(PyUnicodeObject *s)) 5457{ 5458 5459 PyUnicodeObject *u; 5460 5461 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5462 if (u == NULL) 5463 return NULL; 5464 5465 Py_UNICODE_COPY(u->str, self->str, self->length); 5466 5467 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 5468 /* fixfct should return TRUE if it modified the buffer. If 5469 FALSE, return a reference to the original buffer instead 5470 (to save space, not time) */ 5471 Py_INCREF(self); 5472 Py_DECREF(u); 5473 return (PyObject*) self; 5474 } 5475 return (PyObject*) u; 5476} 5477 5478static 5479int fixupper(PyUnicodeObject *self) 5480{ 5481 Py_ssize_t len = self->length; 5482 Py_UNICODE *s = self->str; 5483 int status = 0; 5484 5485 while (len-- > 0) { 5486 register Py_UNICODE ch; 5487 5488 ch = Py_UNICODE_TOUPPER(*s); 5489 if (ch != *s) { 5490 status = 1; 5491 *s = ch; 5492 } 5493 s++; 5494 } 5495 5496 return status; 5497} 5498 5499static 5500int fixlower(PyUnicodeObject *self) 5501{ 5502 Py_ssize_t len = self->length; 5503 Py_UNICODE *s = self->str; 5504 int status = 0; 5505 5506 while (len-- > 0) { 5507 register Py_UNICODE ch; 5508 5509 ch = Py_UNICODE_TOLOWER(*s); 5510 if (ch != *s) { 5511 status = 1; 5512 *s = ch; 5513 } 5514 s++; 5515 } 5516 5517 return status; 5518} 5519 5520static 5521int fixswapcase(PyUnicodeObject *self) 5522{ 5523 Py_ssize_t len = self->length; 5524 Py_UNICODE *s = self->str; 5525 int status = 0; 5526 5527 while (len-- > 0) { 5528 if (Py_UNICODE_ISUPPER(*s)) { 5529 *s = Py_UNICODE_TOLOWER(*s); 5530 status = 1; 5531 } else if (Py_UNICODE_ISLOWER(*s)) { 5532 *s = Py_UNICODE_TOUPPER(*s); 5533 status = 1; 5534 } 5535 s++; 5536 } 5537 5538 return status; 5539} 5540 5541static 5542int fixcapitalize(PyUnicodeObject *self) 5543{ 5544 Py_ssize_t len = self->length; 5545 Py_UNICODE *s = self->str; 5546 int status = 0; 5547 5548 if (len == 0) 5549 return 0; 5550 if (Py_UNICODE_ISLOWER(*s)) { 5551 *s = Py_UNICODE_TOUPPER(*s); 5552 status = 1; 5553 } 5554 s++; 5555 while (--len > 0) { 5556 if (Py_UNICODE_ISUPPER(*s)) { 5557 *s = Py_UNICODE_TOLOWER(*s); 5558 status = 1; 5559 } 5560 s++; 5561 } 5562 return status; 5563} 5564 5565static 5566int fixtitle(PyUnicodeObject *self) 5567{ 5568 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5569 register Py_UNICODE *e; 5570 int previous_is_cased; 5571 5572 /* Shortcut for single character strings */ 5573 if (PyUnicode_GET_SIZE(self) == 1) { 5574 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5575 if (*p != ch) { 5576 *p = ch; 5577 return 1; 5578 } 5579 else 5580 return 0; 5581 } 5582 5583 e = p + PyUnicode_GET_SIZE(self); 5584 previous_is_cased = 0; 5585 for (; p < e; p++) { 5586 register const Py_UNICODE ch = *p; 5587 5588 if (previous_is_cased) 5589 *p = Py_UNICODE_TOLOWER(ch); 5590 else 5591 *p = Py_UNICODE_TOTITLE(ch); 5592 5593 if (Py_UNICODE_ISLOWER(ch) || 5594 Py_UNICODE_ISUPPER(ch) || 5595 Py_UNICODE_ISTITLE(ch)) 5596 previous_is_cased = 1; 5597 else 5598 previous_is_cased = 0; 5599 } 5600 return 1; 5601} 5602 5603PyObject * 5604PyUnicode_Join(PyObject *separator, PyObject *seq) 5605{ 5606 PyObject *internal_separator = NULL; 5607 const Py_UNICODE blank = ' '; 5608 const Py_UNICODE *sep = ␣ 5609 Py_ssize_t seplen = 1; 5610 PyUnicodeObject *res = NULL; /* the result */ 5611 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5612 Py_ssize_t res_used; /* # used bytes */ 5613 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 5614 PyObject *fseq; /* PySequence_Fast(seq) */ 5615 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5616 PyObject *item; 5617 Py_ssize_t i; 5618 5619 fseq = PySequence_Fast(seq, ""); 5620 if (fseq == NULL) { 5621 return NULL; 5622 } 5623 5624 /* Grrrr. A codec may be invoked to convert str objects to 5625 * Unicode, and so it's possible to call back into Python code 5626 * during PyUnicode_FromObject(), and so it's possible for a sick 5627 * codec to change the size of fseq (if seq is a list). Therefore 5628 * we have to keep refetching the size -- can't assume seqlen 5629 * is invariant. 5630 */ 5631 seqlen = PySequence_Fast_GET_SIZE(fseq); 5632 /* If empty sequence, return u"". */ 5633 if (seqlen == 0) { 5634 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5635 goto Done; 5636 } 5637 /* If singleton sequence with an exact Unicode, return that. */ 5638 if (seqlen == 1) { 5639 item = PySequence_Fast_GET_ITEM(fseq, 0); 5640 if (PyUnicode_CheckExact(item)) { 5641 Py_INCREF(item); 5642 res = (PyUnicodeObject *)item; 5643 goto Done; 5644 } 5645 } 5646 5647 /* At least two items to join, or one that isn't exact Unicode. */ 5648 if (seqlen > 1) { 5649 /* Set up sep and seplen -- they're needed. */ 5650 if (separator == NULL) { 5651 sep = ␣ 5652 seplen = 1; 5653 } 5654 else { 5655 internal_separator = PyUnicode_FromObject(separator); 5656 if (internal_separator == NULL) 5657 goto onError; 5658 sep = PyUnicode_AS_UNICODE(internal_separator); 5659 seplen = PyUnicode_GET_SIZE(internal_separator); 5660 /* In case PyUnicode_FromObject() mutated seq. */ 5661 seqlen = PySequence_Fast_GET_SIZE(fseq); 5662 } 5663 } 5664 5665 /* Get space. */ 5666 res = _PyUnicode_New(res_alloc); 5667 if (res == NULL) 5668 goto onError; 5669 res_p = PyUnicode_AS_UNICODE(res); 5670 res_used = 0; 5671 5672 for (i = 0; i < seqlen; ++i) { 5673 Py_ssize_t itemlen; 5674 Py_ssize_t new_res_used; 5675 5676 item = PySequence_Fast_GET_ITEM(fseq, i); 5677 /* Convert item to Unicode. */ 5678 if (!PyUnicode_Check(item)) { 5679 PyErr_Format(PyExc_TypeError, 5680 "sequence item %zd: expected str instance," 5681 " %.80s found", 5682 i, Py_TYPE(item)->tp_name); 5683 goto onError; 5684 } 5685 item = PyUnicode_FromObject(item); 5686 if (item == NULL) 5687 goto onError; 5688 /* We own a reference to item from here on. */ 5689 5690 /* In case PyUnicode_FromObject() mutated seq. */ 5691 seqlen = PySequence_Fast_GET_SIZE(fseq); 5692 5693 /* Make sure we have enough space for the separator and the item. */ 5694 itemlen = PyUnicode_GET_SIZE(item); 5695 new_res_used = res_used + itemlen; 5696 if (new_res_used < 0) 5697 goto Overflow; 5698 if (i < seqlen - 1) { 5699 new_res_used += seplen; 5700 if (new_res_used < 0) 5701 goto Overflow; 5702 } 5703 if (new_res_used > res_alloc) { 5704 /* double allocated size until it's big enough */ 5705 do { 5706 res_alloc += res_alloc; 5707 if (res_alloc <= 0) 5708 goto Overflow; 5709 } while (new_res_used > res_alloc); 5710 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5711 Py_DECREF(item); 5712 goto onError; 5713 } 5714 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5715 } 5716 5717 /* Copy item, and maybe the separator. */ 5718 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5719 res_p += itemlen; 5720 if (i < seqlen - 1) { 5721 Py_UNICODE_COPY(res_p, sep, seplen); 5722 res_p += seplen; 5723 } 5724 Py_DECREF(item); 5725 res_used = new_res_used; 5726 } 5727 5728 /* Shrink res to match the used area; this probably can't fail, 5729 * but it's cheap to check. 5730 */ 5731 if (_PyUnicode_Resize(&res, res_used) < 0) 5732 goto onError; 5733 5734 Done: 5735 Py_XDECREF(internal_separator); 5736 Py_DECREF(fseq); 5737 return (PyObject *)res; 5738 5739 Overflow: 5740 PyErr_SetString(PyExc_OverflowError, 5741 "join() result is too long for a Python string"); 5742 Py_DECREF(item); 5743 /* fall through */ 5744 5745 onError: 5746 Py_XDECREF(internal_separator); 5747 Py_DECREF(fseq); 5748 Py_XDECREF(res); 5749 return NULL; 5750} 5751 5752static 5753PyUnicodeObject *pad(PyUnicodeObject *self, 5754 Py_ssize_t left, 5755 Py_ssize_t right, 5756 Py_UNICODE fill) 5757{ 5758 PyUnicodeObject *u; 5759 5760 if (left < 0) 5761 left = 0; 5762 if (right < 0) 5763 right = 0; 5764 5765 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5766 Py_INCREF(self); 5767 return self; 5768 } 5769 5770 u = _PyUnicode_New(left + self->length + right); 5771 if (u) { 5772 if (left) 5773 Py_UNICODE_FILL(u->str, fill, left); 5774 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5775 if (right) 5776 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5777 } 5778 5779 return u; 5780} 5781 5782#define SPLIT_APPEND(data, left, right) \ 5783 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5784 if (!str) \ 5785 goto onError; \ 5786 if (PyList_Append(list, str)) { \ 5787 Py_DECREF(str); \ 5788 goto onError; \ 5789 } \ 5790 else \ 5791 Py_DECREF(str); 5792 5793static 5794PyObject *split_whitespace(PyUnicodeObject *self, 5795 PyObject *list, 5796 Py_ssize_t maxcount) 5797{ 5798 register Py_ssize_t i; 5799 register Py_ssize_t j; 5800 Py_ssize_t len = self->length; 5801 PyObject *str; 5802 register const Py_UNICODE *buf = self->str; 5803 5804 for (i = j = 0; i < len; ) { 5805 /* find a token */ 5806 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5807 i++; 5808 j = i; 5809 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) 5810 i++; 5811 if (j < i) { 5812 if (maxcount-- <= 0) 5813 break; 5814 SPLIT_APPEND(buf, j, i); 5815 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5816 i++; 5817 j = i; 5818 } 5819 } 5820 if (j < len) { 5821 SPLIT_APPEND(buf, j, len); 5822 } 5823 return list; 5824 5825 onError: 5826 Py_DECREF(list); 5827 return NULL; 5828} 5829 5830PyObject *PyUnicode_Splitlines(PyObject *string, 5831 int keepends) 5832{ 5833 register Py_ssize_t i; 5834 register Py_ssize_t j; 5835 Py_ssize_t len; 5836 PyObject *list; 5837 PyObject *str; 5838 Py_UNICODE *data; 5839 5840 string = PyUnicode_FromObject(string); 5841 if (string == NULL) 5842 return NULL; 5843 data = PyUnicode_AS_UNICODE(string); 5844 len = PyUnicode_GET_SIZE(string); 5845 5846 list = PyList_New(0); 5847 if (!list) 5848 goto onError; 5849 5850 for (i = j = 0; i < len; ) { 5851 Py_ssize_t eol; 5852 5853 /* Find a line and append it */ 5854 while (i < len && !BLOOM_LINEBREAK(data[i])) 5855 i++; 5856 5857 /* Skip the line break reading CRLF as one line break */ 5858 eol = i; 5859 if (i < len) { 5860 if (data[i] == '\r' && i + 1 < len && 5861 data[i+1] == '\n') 5862 i += 2; 5863 else 5864 i++; 5865 if (keepends) 5866 eol = i; 5867 } 5868 SPLIT_APPEND(data, j, eol); 5869 j = i; 5870 } 5871 if (j < len) { 5872 SPLIT_APPEND(data, j, len); 5873 } 5874 5875 Py_DECREF(string); 5876 return list; 5877 5878 onError: 5879 Py_XDECREF(list); 5880 Py_DECREF(string); 5881 return NULL; 5882} 5883 5884static 5885PyObject *split_char(PyUnicodeObject *self, 5886 PyObject *list, 5887 Py_UNICODE ch, 5888 Py_ssize_t maxcount) 5889{ 5890 register Py_ssize_t i; 5891 register Py_ssize_t j; 5892 Py_ssize_t len = self->length; 5893 PyObject *str; 5894 register const Py_UNICODE *buf = self->str; 5895 5896 for (i = j = 0; i < len; ) { 5897 if (buf[i] == ch) { 5898 if (maxcount-- <= 0) 5899 break; 5900 SPLIT_APPEND(buf, j, i); 5901 i = j = i + 1; 5902 } else 5903 i++; 5904 } 5905 if (j <= len) { 5906 SPLIT_APPEND(buf, j, len); 5907 } 5908 return list; 5909 5910 onError: 5911 Py_DECREF(list); 5912 return NULL; 5913} 5914 5915static 5916PyObject *split_substring(PyUnicodeObject *self, 5917 PyObject *list, 5918 PyUnicodeObject *substring, 5919 Py_ssize_t maxcount) 5920{ 5921 register Py_ssize_t i; 5922 register Py_ssize_t j; 5923 Py_ssize_t len = self->length; 5924 Py_ssize_t sublen = substring->length; 5925 PyObject *str; 5926 5927 for (i = j = 0; i <= len - sublen; ) { 5928 if (Py_UNICODE_MATCH(self, i, substring)) { 5929 if (maxcount-- <= 0) 5930 break; 5931 SPLIT_APPEND(self->str, j, i); 5932 i = j = i + sublen; 5933 } else 5934 i++; 5935 } 5936 if (j <= len) { 5937 SPLIT_APPEND(self->str, j, len); 5938 } 5939 return list; 5940 5941 onError: 5942 Py_DECREF(list); 5943 return NULL; 5944} 5945 5946static 5947PyObject *rsplit_whitespace(PyUnicodeObject *self, 5948 PyObject *list, 5949 Py_ssize_t maxcount) 5950{ 5951 register Py_ssize_t i; 5952 register Py_ssize_t j; 5953 Py_ssize_t len = self->length; 5954 PyObject *str; 5955 register const Py_UNICODE *buf = self->str; 5956 5957 for (i = j = len - 1; i >= 0; ) { 5958 /* find a token */ 5959 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5960 i--; 5961 j = i; 5962 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) 5963 i--; 5964 if (j > i) { 5965 if (maxcount-- <= 0) 5966 break; 5967 SPLIT_APPEND(buf, i + 1, j + 1); 5968 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5969 i--; 5970 j = i; 5971 } 5972 } 5973 if (j >= 0) { 5974 SPLIT_APPEND(buf, 0, j + 1); 5975 } 5976 if (PyList_Reverse(list) < 0) 5977 goto onError; 5978 return list; 5979 5980 onError: 5981 Py_DECREF(list); 5982 return NULL; 5983} 5984 5985static 5986PyObject *rsplit_char(PyUnicodeObject *self, 5987 PyObject *list, 5988 Py_UNICODE ch, 5989 Py_ssize_t maxcount) 5990{ 5991 register Py_ssize_t i; 5992 register Py_ssize_t j; 5993 Py_ssize_t len = self->length; 5994 PyObject *str; 5995 register const Py_UNICODE *buf = self->str; 5996 5997 for (i = j = len - 1; i >= 0; ) { 5998 if (buf[i] == ch) { 5999 if (maxcount-- <= 0) 6000 break; 6001 SPLIT_APPEND(buf, i + 1, j + 1); 6002 j = i = i - 1; 6003 } else 6004 i--; 6005 } 6006 if (j >= -1) { 6007 SPLIT_APPEND(buf, 0, j + 1); 6008 } 6009 if (PyList_Reverse(list) < 0) 6010 goto onError; 6011 return list; 6012 6013 onError: 6014 Py_DECREF(list); 6015 return NULL; 6016} 6017 6018static 6019PyObject *rsplit_substring(PyUnicodeObject *self, 6020 PyObject *list, 6021 PyUnicodeObject *substring, 6022 Py_ssize_t maxcount) 6023{ 6024 register Py_ssize_t i; 6025 register Py_ssize_t j; 6026 Py_ssize_t len = self->length; 6027 Py_ssize_t sublen = substring->length; 6028 PyObject *str; 6029 6030 for (i = len - sublen, j = len; i >= 0; ) { 6031 if (Py_UNICODE_MATCH(self, i, substring)) { 6032 if (maxcount-- <= 0) 6033 break; 6034 SPLIT_APPEND(self->str, i + sublen, j); 6035 j = i; 6036 i -= sublen; 6037 } else 6038 i--; 6039 } 6040 if (j >= 0) { 6041 SPLIT_APPEND(self->str, 0, j); 6042 } 6043 if (PyList_Reverse(list) < 0) 6044 goto onError; 6045 return list; 6046 6047 onError: 6048 Py_DECREF(list); 6049 return NULL; 6050} 6051 6052#undef SPLIT_APPEND 6053 6054static 6055PyObject *split(PyUnicodeObject *self, 6056 PyUnicodeObject *substring, 6057 Py_ssize_t maxcount) 6058{ 6059 PyObject *list; 6060 6061 if (maxcount < 0) 6062 maxcount = PY_SSIZE_T_MAX; 6063 6064 list = PyList_New(0); 6065 if (!list) 6066 return NULL; 6067 6068 if (substring == NULL) 6069 return split_whitespace(self,list,maxcount); 6070 6071 else if (substring->length == 1) 6072 return split_char(self,list,substring->str[0],maxcount); 6073 6074 else if (substring->length == 0) { 6075 Py_DECREF(list); 6076 PyErr_SetString(PyExc_ValueError, "empty separator"); 6077 return NULL; 6078 } 6079 else 6080 return split_substring(self,list,substring,maxcount); 6081} 6082 6083static 6084PyObject *rsplit(PyUnicodeObject *self, 6085 PyUnicodeObject *substring, 6086 Py_ssize_t maxcount) 6087{ 6088 PyObject *list; 6089 6090 if (maxcount < 0) 6091 maxcount = PY_SSIZE_T_MAX; 6092 6093 list = PyList_New(0); 6094 if (!list) 6095 return NULL; 6096 6097 if (substring == NULL) 6098 return rsplit_whitespace(self,list,maxcount); 6099 6100 else if (substring->length == 1) 6101 return rsplit_char(self,list,substring->str[0],maxcount); 6102 6103 else if (substring->length == 0) { 6104 Py_DECREF(list); 6105 PyErr_SetString(PyExc_ValueError, "empty separator"); 6106 return NULL; 6107 } 6108 else 6109 return rsplit_substring(self,list,substring,maxcount); 6110} 6111 6112static 6113PyObject *replace(PyUnicodeObject *self, 6114 PyUnicodeObject *str1, 6115 PyUnicodeObject *str2, 6116 Py_ssize_t maxcount) 6117{ 6118 PyUnicodeObject *u; 6119 6120 if (maxcount < 0) 6121 maxcount = PY_SSIZE_T_MAX; 6122 6123 if (str1->length == str2->length) { 6124 /* same length */ 6125 Py_ssize_t i; 6126 if (str1->length == 1) { 6127 /* replace characters */ 6128 Py_UNICODE u1, u2; 6129 if (!findchar(self->str, self->length, str1->str[0])) 6130 goto nothing; 6131 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6132 if (!u) 6133 return NULL; 6134 Py_UNICODE_COPY(u->str, self->str, self->length); 6135 u1 = str1->str[0]; 6136 u2 = str2->str[0]; 6137 for (i = 0; i < u->length; i++) 6138 if (u->str[i] == u1) { 6139 if (--maxcount < 0) 6140 break; 6141 u->str[i] = u2; 6142 } 6143 } else { 6144 i = fastsearch( 6145 self->str, self->length, str1->str, str1->length, FAST_SEARCH 6146 ); 6147 if (i < 0) 6148 goto nothing; 6149 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6150 if (!u) 6151 return NULL; 6152 Py_UNICODE_COPY(u->str, self->str, self->length); 6153 while (i <= self->length - str1->length) 6154 if (Py_UNICODE_MATCH(self, i, str1)) { 6155 if (--maxcount < 0) 6156 break; 6157 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6158 i += str1->length; 6159 } else 6160 i++; 6161 } 6162 } else { 6163 6164 Py_ssize_t n, i, j, e; 6165 Py_ssize_t product, new_size, delta; 6166 Py_UNICODE *p; 6167 6168 /* replace strings */ 6169 n = stringlib_count(self->str, self->length, str1->str, str1->length); 6170 if (n > maxcount) 6171 n = maxcount; 6172 if (n == 0) 6173 goto nothing; 6174 /* new_size = self->length + n * (str2->length - str1->length)); */ 6175 delta = (str2->length - str1->length); 6176 if (delta == 0) { 6177 new_size = self->length; 6178 } else { 6179 product = n * (str2->length - str1->length); 6180 if ((product / (str2->length - str1->length)) != n) { 6181 PyErr_SetString(PyExc_OverflowError, 6182 "replace string is too long"); 6183 return NULL; 6184 } 6185 new_size = self->length + product; 6186 if (new_size < 0) { 6187 PyErr_SetString(PyExc_OverflowError, 6188 "replace string is too long"); 6189 return NULL; 6190 } 6191 } 6192 u = _PyUnicode_New(new_size); 6193 if (!u) 6194 return NULL; 6195 i = 0; 6196 p = u->str; 6197 e = self->length - str1->length; 6198 if (str1->length > 0) { 6199 while (n-- > 0) { 6200 /* look for next match */ 6201 j = i; 6202 while (j <= e) { 6203 if (Py_UNICODE_MATCH(self, j, str1)) 6204 break; 6205 j++; 6206 } 6207 if (j > i) { 6208 if (j > e) 6209 break; 6210 /* copy unchanged part [i:j] */ 6211 Py_UNICODE_COPY(p, self->str+i, j-i); 6212 p += j - i; 6213 } 6214 /* copy substitution string */ 6215 if (str2->length > 0) { 6216 Py_UNICODE_COPY(p, str2->str, str2->length); 6217 p += str2->length; 6218 } 6219 i = j + str1->length; 6220 } 6221 if (i < self->length) 6222 /* copy tail [i:] */ 6223 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6224 } else { 6225 /* interleave */ 6226 while (n > 0) { 6227 Py_UNICODE_COPY(p, str2->str, str2->length); 6228 p += str2->length; 6229 if (--n <= 0) 6230 break; 6231 *p++ = self->str[i++]; 6232 } 6233 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6234 } 6235 } 6236 return (PyObject *) u; 6237 6238nothing: 6239 /* nothing to replace; return original string (when possible) */ 6240 if (PyUnicode_CheckExact(self)) { 6241 Py_INCREF(self); 6242 return (PyObject *) self; 6243 } 6244 return PyUnicode_FromUnicode(self->str, self->length); 6245} 6246 6247/* --- Unicode Object Methods --------------------------------------------- */ 6248 6249PyDoc_STRVAR(title__doc__, 6250"S.title() -> str\n\ 6251\n\ 6252Return a titlecased version of S, i.e. words start with title case\n\ 6253characters, all remaining cased characters have lower case."); 6254 6255static PyObject* 6256unicode_title(PyUnicodeObject *self) 6257{ 6258 return fixup(self, fixtitle); 6259} 6260 6261PyDoc_STRVAR(capitalize__doc__, 6262"S.capitalize() -> str\n\ 6263\n\ 6264Return a capitalized version of S, i.e. make the first character\n\ 6265have upper case."); 6266 6267static PyObject* 6268unicode_capitalize(PyUnicodeObject *self) 6269{ 6270 return fixup(self, fixcapitalize); 6271} 6272 6273#if 0 6274PyDoc_STRVAR(capwords__doc__, 6275"S.capwords() -> str\n\ 6276\n\ 6277Apply .capitalize() to all words in S and return the result with\n\ 6278normalized whitespace (all whitespace strings are replaced by ' ')."); 6279 6280static PyObject* 6281unicode_capwords(PyUnicodeObject *self) 6282{ 6283 PyObject *list; 6284 PyObject *item; 6285 Py_ssize_t i; 6286 6287 /* Split into words */ 6288 list = split(self, NULL, -1); 6289 if (!list) 6290 return NULL; 6291 6292 /* Capitalize each word */ 6293 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6294 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6295 fixcapitalize); 6296 if (item == NULL) 6297 goto onError; 6298 Py_DECREF(PyList_GET_ITEM(list, i)); 6299 PyList_SET_ITEM(list, i, item); 6300 } 6301 6302 /* Join the words to form a new string */ 6303 item = PyUnicode_Join(NULL, list); 6304 6305onError: 6306 Py_DECREF(list); 6307 return (PyObject *)item; 6308} 6309#endif 6310 6311/* Argument converter. Coerces to a single unicode character */ 6312 6313static int 6314convert_uc(PyObject *obj, void *addr) 6315{ 6316 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6317 PyObject *uniobj; 6318 Py_UNICODE *unistr; 6319 6320 uniobj = PyUnicode_FromObject(obj); 6321 if (uniobj == NULL) { 6322 PyErr_SetString(PyExc_TypeError, 6323 "The fill character cannot be converted to Unicode"); 6324 return 0; 6325 } 6326 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6327 PyErr_SetString(PyExc_TypeError, 6328 "The fill character must be exactly one character long"); 6329 Py_DECREF(uniobj); 6330 return 0; 6331 } 6332 unistr = PyUnicode_AS_UNICODE(uniobj); 6333 *fillcharloc = unistr[0]; 6334 Py_DECREF(uniobj); 6335 return 1; 6336} 6337 6338PyDoc_STRVAR(center__doc__, 6339"S.center(width[, fillchar]) -> str\n\ 6340\n\ 6341Return S centered in a Unicode string of length width. Padding is\n\ 6342done using the specified fill character (default is a space)"); 6343 6344static PyObject * 6345unicode_center(PyUnicodeObject *self, PyObject *args) 6346{ 6347 Py_ssize_t marg, left; 6348 Py_ssize_t width; 6349 Py_UNICODE fillchar = ' '; 6350 6351 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6352 return NULL; 6353 6354 if (self->length >= width && PyUnicode_CheckExact(self)) { 6355 Py_INCREF(self); 6356 return (PyObject*) self; 6357 } 6358 6359 marg = width - self->length; 6360 left = marg / 2 + (marg & width & 1); 6361 6362 return (PyObject*) pad(self, left, marg - left, fillchar); 6363} 6364 6365#if 0 6366 6367/* This code should go into some future Unicode collation support 6368 module. The basic comparison should compare ordinals on a naive 6369 basis (this is what Java does and thus JPython too). */ 6370 6371/* speedy UTF-16 code point order comparison */ 6372/* gleaned from: */ 6373/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6374 6375static short utf16Fixup[32] = 6376{ 6377 0, 0, 0, 0, 0, 0, 0, 0, 6378 0, 0, 0, 0, 0, 0, 0, 0, 6379 0, 0, 0, 0, 0, 0, 0, 0, 6380 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6381}; 6382 6383static int 6384unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6385{ 6386 Py_ssize_t len1, len2; 6387 6388 Py_UNICODE *s1 = str1->str; 6389 Py_UNICODE *s2 = str2->str; 6390 6391 len1 = str1->length; 6392 len2 = str2->length; 6393 6394 while (len1 > 0 && len2 > 0) { 6395 Py_UNICODE c1, c2; 6396 6397 c1 = *s1++; 6398 c2 = *s2++; 6399 6400 if (c1 > (1<<11) * 26) 6401 c1 += utf16Fixup[c1>>11]; 6402 if (c2 > (1<<11) * 26) 6403 c2 += utf16Fixup[c2>>11]; 6404 /* now c1 and c2 are in UTF-32-compatible order */ 6405 6406 if (c1 != c2) 6407 return (c1 < c2) ? -1 : 1; 6408 6409 len1--; len2--; 6410 } 6411 6412 return (len1 < len2) ? -1 : (len1 != len2); 6413} 6414 6415#else 6416 6417static int 6418unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6419{ 6420 register Py_ssize_t len1, len2; 6421 6422 Py_UNICODE *s1 = str1->str; 6423 Py_UNICODE *s2 = str2->str; 6424 6425 len1 = str1->length; 6426 len2 = str2->length; 6427 6428 while (len1 > 0 && len2 > 0) { 6429 Py_UNICODE c1, c2; 6430 6431 c1 = *s1++; 6432 c2 = *s2++; 6433 6434 if (c1 != c2) 6435 return (c1 < c2) ? -1 : 1; 6436 6437 len1--; len2--; 6438 } 6439 6440 return (len1 < len2) ? -1 : (len1 != len2); 6441} 6442 6443#endif 6444 6445int PyUnicode_Compare(PyObject *left, 6446 PyObject *right) 6447{ 6448 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6449 return unicode_compare((PyUnicodeObject *)left, 6450 (PyUnicodeObject *)right); 6451 PyErr_Format(PyExc_TypeError, 6452 "Can't compare %.100s and %.100s", 6453 left->ob_type->tp_name, 6454 right->ob_type->tp_name); 6455 return -1; 6456} 6457 6458int 6459PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6460{ 6461 int i; 6462 Py_UNICODE *id; 6463 assert(PyUnicode_Check(uni)); 6464 id = PyUnicode_AS_UNICODE(uni); 6465 /* Compare Unicode string and source character set string */ 6466 for (i = 0; id[i] && str[i]; i++) 6467 if (id[i] != str[i]) 6468 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6469 if (id[i]) 6470 return 1; /* uni is longer */ 6471 if (str[i]) 6472 return -1; /* str is longer */ 6473 return 0; 6474} 6475 6476PyObject *PyUnicode_RichCompare(PyObject *left, 6477 PyObject *right, 6478 int op) 6479{ 6480 int result; 6481 6482 result = PyUnicode_Compare(left, right); 6483 if (result == -1 && PyErr_Occurred()) 6484 goto onError; 6485 6486 /* Convert the return value to a Boolean */ 6487 switch (op) { 6488 case Py_EQ: 6489 result = (result == 0); 6490 break; 6491 case Py_NE: 6492 result = (result != 0); 6493 break; 6494 case Py_LE: 6495 result = (result <= 0); 6496 break; 6497 case Py_GE: 6498 result = (result >= 0); 6499 break; 6500 case Py_LT: 6501 result = (result == -1); 6502 break; 6503 case Py_GT: 6504 result = (result == 1); 6505 break; 6506 } 6507 return PyBool_FromLong(result); 6508 6509 onError: 6510 6511 /* Standard case 6512 6513 Type errors mean that PyUnicode_FromObject() could not convert 6514 one of the arguments (usually the right hand side) to Unicode, 6515 ie. we can't handle the comparison request. However, it is 6516 possible that the other object knows a comparison method, which 6517 is why we return Py_NotImplemented to give the other object a 6518 chance. 6519 6520 */ 6521 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 6522 PyErr_Clear(); 6523 Py_INCREF(Py_NotImplemented); 6524 return Py_NotImplemented; 6525 } 6526 if (op != Py_EQ && op != Py_NE) 6527 return NULL; 6528 6529 /* Equality comparison. 6530 6531 This is a special case: we silence any PyExc_UnicodeDecodeError 6532 and instead turn it into a PyErr_UnicodeWarning. 6533 6534 */ 6535 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 6536 return NULL; 6537 PyErr_Clear(); 6538 if (PyErr_WarnEx(PyExc_UnicodeWarning, 6539 (op == Py_EQ) ? 6540 "Unicode equal comparison " 6541 "failed to convert both arguments to Unicode - " 6542 "interpreting them as being unequal" 6543 : 6544 "Unicode unequal comparison " 6545 "failed to convert both arguments to Unicode - " 6546 "interpreting them as being unequal", 6547 1) < 0) 6548 return NULL; 6549 result = (op == Py_NE); 6550 return PyBool_FromLong(result); 6551} 6552 6553int PyUnicode_Contains(PyObject *container, 6554 PyObject *element) 6555{ 6556 PyObject *str, *sub; 6557 int result; 6558 6559 /* Coerce the two arguments */ 6560 sub = PyUnicode_FromObject(element); 6561 if (!sub) { 6562 PyErr_Format(PyExc_TypeError, 6563 "'in <string>' requires string as left operand, not %s", 6564 element->ob_type->tp_name); 6565 return -1; 6566 } 6567 6568 str = PyUnicode_FromObject(container); 6569 if (!str) { 6570 Py_DECREF(sub); 6571 return -1; 6572 } 6573 6574 result = stringlib_contains_obj(str, sub); 6575 6576 Py_DECREF(str); 6577 Py_DECREF(sub); 6578 6579 return result; 6580} 6581 6582/* Concat to string or Unicode object giving a new Unicode object. */ 6583 6584PyObject *PyUnicode_Concat(PyObject *left, 6585 PyObject *right) 6586{ 6587 PyUnicodeObject *u = NULL, *v = NULL, *w; 6588 6589 /* Coerce the two arguments */ 6590 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6591 if (u == NULL) 6592 goto onError; 6593 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6594 if (v == NULL) 6595 goto onError; 6596 6597 /* Shortcuts */ 6598 if (v == unicode_empty) { 6599 Py_DECREF(v); 6600 return (PyObject *)u; 6601 } 6602 if (u == unicode_empty) { 6603 Py_DECREF(u); 6604 return (PyObject *)v; 6605 } 6606 6607 /* Concat the two Unicode strings */ 6608 w = _PyUnicode_New(u->length + v->length); 6609 if (w == NULL) 6610 goto onError; 6611 Py_UNICODE_COPY(w->str, u->str, u->length); 6612 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6613 6614 Py_DECREF(u); 6615 Py_DECREF(v); 6616 return (PyObject *)w; 6617 6618onError: 6619 Py_XDECREF(u); 6620 Py_XDECREF(v); 6621 return NULL; 6622} 6623 6624void 6625PyUnicode_Append(PyObject **pleft, PyObject *right) 6626{ 6627 PyObject *new; 6628 if (*pleft == NULL) 6629 return; 6630 if (right == NULL || !PyUnicode_Check(*pleft)) { 6631 Py_DECREF(*pleft); 6632 *pleft = NULL; 6633 return; 6634 } 6635 new = PyUnicode_Concat(*pleft, right); 6636 Py_DECREF(*pleft); 6637 *pleft = new; 6638} 6639 6640void 6641PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6642{ 6643 PyUnicode_Append(pleft, right); 6644 Py_XDECREF(right); 6645} 6646 6647PyDoc_STRVAR(count__doc__, 6648"S.count(sub[, start[, end]]) -> int\n\ 6649\n\ 6650Return the number of non-overlapping occurrences of substring sub in\n\ 6651Unicode string S[start:end]. Optional arguments start and end are\n\ 6652interpreted as in slice notation."); 6653 6654static PyObject * 6655unicode_count(PyUnicodeObject *self, PyObject *args) 6656{ 6657 PyUnicodeObject *substring; 6658 Py_ssize_t start = 0; 6659 Py_ssize_t end = PY_SSIZE_T_MAX; 6660 PyObject *result; 6661 6662 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6663 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6664 return NULL; 6665 6666 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6667 (PyObject *)substring); 6668 if (substring == NULL) 6669 return NULL; 6670 6671 FIX_START_END(self); 6672 6673 result = PyLong_FromSsize_t( 6674 stringlib_count(self->str + start, end - start, 6675 substring->str, substring->length) 6676 ); 6677 6678 Py_DECREF(substring); 6679 6680 return result; 6681} 6682 6683PyDoc_STRVAR(encode__doc__, 6684"S.encode([encoding[, errors]]) -> bytes\n\ 6685\n\ 6686Encode S using the codec registered for encoding. encoding defaults\n\ 6687to the default encoding. errors may be given to set a different error\n\ 6688handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6689a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 6690'xmlcharrefreplace' as well as any other name registered with\n\ 6691codecs.register_error that can handle UnicodeEncodeErrors."); 6692 6693static PyObject * 6694unicode_encode(PyUnicodeObject *self, PyObject *args) 6695{ 6696 char *encoding = NULL; 6697 char *errors = NULL; 6698 PyObject *v; 6699 6700 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6701 return NULL; 6702 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 6703 if (v == NULL) 6704 goto onError; 6705 if (!PyBytes_Check(v)) { 6706 PyErr_Format(PyExc_TypeError, 6707 "encoder did not return a bytes object " 6708 "(type=%.400s)", 6709 Py_TYPE(v)->tp_name); 6710 Py_DECREF(v); 6711 return NULL; 6712 } 6713 return v; 6714 6715 onError: 6716 return NULL; 6717} 6718 6719PyDoc_STRVAR(expandtabs__doc__, 6720"S.expandtabs([tabsize]) -> str\n\ 6721\n\ 6722Return a copy of S where all tab characters are expanded using spaces.\n\ 6723If tabsize is not given, a tab size of 8 characters is assumed."); 6724 6725static PyObject* 6726unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6727{ 6728 Py_UNICODE *e; 6729 Py_UNICODE *p; 6730 Py_UNICODE *q; 6731 Py_UNICODE *qe; 6732 Py_ssize_t i, j, incr; 6733 PyUnicodeObject *u; 6734 int tabsize = 8; 6735 6736 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6737 return NULL; 6738 6739 /* First pass: determine size of output string */ 6740 i = 0; /* chars up to and including most recent \n or \r */ 6741 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 6742 e = self->str + self->length; /* end of input */ 6743 for (p = self->str; p < e; p++) 6744 if (*p == '\t') { 6745 if (tabsize > 0) { 6746 incr = tabsize - (j % tabsize); /* cannot overflow */ 6747 if (j > PY_SSIZE_T_MAX - incr) 6748 goto overflow1; 6749 j += incr; 6750 } 6751 } 6752 else { 6753 if (j > PY_SSIZE_T_MAX - 1) 6754 goto overflow1; 6755 j++; 6756 if (*p == '\n' || *p == '\r') { 6757 if (i > PY_SSIZE_T_MAX - j) 6758 goto overflow1; 6759 i += j; 6760 j = 0; 6761 } 6762 } 6763 6764 if (i > PY_SSIZE_T_MAX - j) 6765 goto overflow1; 6766 6767 /* Second pass: create output string and fill it */ 6768 u = _PyUnicode_New(i + j); 6769 if (!u) 6770 return NULL; 6771 6772 j = 0; /* same as in first pass */ 6773 q = u->str; /* next output char */ 6774 qe = u->str + u->length; /* end of output */ 6775 6776 for (p = self->str; p < e; p++) 6777 if (*p == '\t') { 6778 if (tabsize > 0) { 6779 i = tabsize - (j % tabsize); 6780 j += i; 6781 while (i--) { 6782 if (q >= qe) 6783 goto overflow2; 6784 *q++ = ' '; 6785 } 6786 } 6787 } 6788 else { 6789 if (q >= qe) 6790 goto overflow2; 6791 *q++ = *p; 6792 j++; 6793 if (*p == '\n' || *p == '\r') 6794 j = 0; 6795 } 6796 6797 return (PyObject*) u; 6798 6799 overflow2: 6800 Py_DECREF(u); 6801 overflow1: 6802 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6803 return NULL; 6804} 6805 6806PyDoc_STRVAR(find__doc__, 6807"S.find(sub[, start[, end]]) -> int\n\ 6808\n\ 6809Return the lowest index in S where substring sub is found,\n\ 6810such that sub is contained within s[start:end]. Optional\n\ 6811arguments start and end are interpreted as in slice notation.\n\ 6812\n\ 6813Return -1 on failure."); 6814 6815static PyObject * 6816unicode_find(PyUnicodeObject *self, PyObject *args) 6817{ 6818 PyObject *substring; 6819 Py_ssize_t start; 6820 Py_ssize_t end; 6821 Py_ssize_t result; 6822 6823 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6824 return NULL; 6825 6826 result = stringlib_find_slice( 6827 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6828 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6829 start, end 6830 ); 6831 6832 Py_DECREF(substring); 6833 6834 return PyLong_FromSsize_t(result); 6835} 6836 6837static PyObject * 6838unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6839{ 6840 if (index < 0 || index >= self->length) { 6841 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6842 return NULL; 6843 } 6844 6845 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6846} 6847 6848/* Believe it or not, this produces the same value for ASCII strings 6849 as string_hash(). */ 6850static long 6851unicode_hash(PyUnicodeObject *self) 6852{ 6853 Py_ssize_t len; 6854 Py_UNICODE *p; 6855 long x; 6856 6857 if (self->hash != -1) 6858 return self->hash; 6859 len = Py_SIZE(self); 6860 p = self->str; 6861 x = *p << 7; 6862 while (--len >= 0) 6863 x = (1000003*x) ^ *p++; 6864 x ^= Py_SIZE(self); 6865 if (x == -1) 6866 x = -2; 6867 self->hash = x; 6868 return x; 6869} 6870 6871PyDoc_STRVAR(index__doc__, 6872"S.index(sub[, start[, end]]) -> int\n\ 6873\n\ 6874Like S.find() but raise ValueError when the substring is not found."); 6875 6876static PyObject * 6877unicode_index(PyUnicodeObject *self, PyObject *args) 6878{ 6879 Py_ssize_t result; 6880 PyObject *substring; 6881 Py_ssize_t start; 6882 Py_ssize_t end; 6883 6884 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6885 return NULL; 6886 6887 result = stringlib_find_slice( 6888 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6889 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6890 start, end 6891 ); 6892 6893 Py_DECREF(substring); 6894 6895 if (result < 0) { 6896 PyErr_SetString(PyExc_ValueError, "substring not found"); 6897 return NULL; 6898 } 6899 6900 return PyLong_FromSsize_t(result); 6901} 6902 6903PyDoc_STRVAR(islower__doc__, 6904"S.islower() -> bool\n\ 6905\n\ 6906Return True if all cased characters in S are lowercase and there is\n\ 6907at least one cased character in S, False otherwise."); 6908 6909static PyObject* 6910unicode_islower(PyUnicodeObject *self) 6911{ 6912 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6913 register const Py_UNICODE *e; 6914 int cased; 6915 6916 /* Shortcut for single character strings */ 6917 if (PyUnicode_GET_SIZE(self) == 1) 6918 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6919 6920 /* Special case for empty strings */ 6921 if (PyUnicode_GET_SIZE(self) == 0) 6922 return PyBool_FromLong(0); 6923 6924 e = p + PyUnicode_GET_SIZE(self); 6925 cased = 0; 6926 for (; p < e; p++) { 6927 register const Py_UNICODE ch = *p; 6928 6929 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6930 return PyBool_FromLong(0); 6931 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6932 cased = 1; 6933 } 6934 return PyBool_FromLong(cased); 6935} 6936 6937PyDoc_STRVAR(isupper__doc__, 6938"S.isupper() -> bool\n\ 6939\n\ 6940Return True if all cased characters in S are uppercase and there is\n\ 6941at least one cased character in S, False otherwise."); 6942 6943static PyObject* 6944unicode_isupper(PyUnicodeObject *self) 6945{ 6946 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6947 register const Py_UNICODE *e; 6948 int cased; 6949 6950 /* Shortcut for single character strings */ 6951 if (PyUnicode_GET_SIZE(self) == 1) 6952 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6953 6954 /* Special case for empty strings */ 6955 if (PyUnicode_GET_SIZE(self) == 0) 6956 return PyBool_FromLong(0); 6957 6958 e = p + PyUnicode_GET_SIZE(self); 6959 cased = 0; 6960 for (; p < e; p++) { 6961 register const Py_UNICODE ch = *p; 6962 6963 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6964 return PyBool_FromLong(0); 6965 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6966 cased = 1; 6967 } 6968 return PyBool_FromLong(cased); 6969} 6970 6971PyDoc_STRVAR(istitle__doc__, 6972"S.istitle() -> bool\n\ 6973\n\ 6974Return True if S is a titlecased string and there is at least one\n\ 6975character in S, i.e. upper- and titlecase characters may only\n\ 6976follow uncased characters and lowercase characters only cased ones.\n\ 6977Return False otherwise."); 6978 6979static PyObject* 6980unicode_istitle(PyUnicodeObject *self) 6981{ 6982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6983 register const Py_UNICODE *e; 6984 int cased, previous_is_cased; 6985 6986 /* Shortcut for single character strings */ 6987 if (PyUnicode_GET_SIZE(self) == 1) 6988 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6989 (Py_UNICODE_ISUPPER(*p) != 0)); 6990 6991 /* Special case for empty strings */ 6992 if (PyUnicode_GET_SIZE(self) == 0) 6993 return PyBool_FromLong(0); 6994 6995 e = p + PyUnicode_GET_SIZE(self); 6996 cased = 0; 6997 previous_is_cased = 0; 6998 for (; p < e; p++) { 6999 register const Py_UNICODE ch = *p; 7000 7001 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7002 if (previous_is_cased) 7003 return PyBool_FromLong(0); 7004 previous_is_cased = 1; 7005 cased = 1; 7006 } 7007 else if (Py_UNICODE_ISLOWER(ch)) { 7008 if (!previous_is_cased) 7009 return PyBool_FromLong(0); 7010 previous_is_cased = 1; 7011 cased = 1; 7012 } 7013 else 7014 previous_is_cased = 0; 7015 } 7016 return PyBool_FromLong(cased); 7017} 7018 7019PyDoc_STRVAR(isspace__doc__, 7020"S.isspace() -> bool\n\ 7021\n\ 7022Return True if all characters in S are whitespace\n\ 7023and there is at least one character in S, False otherwise."); 7024 7025static PyObject* 7026unicode_isspace(PyUnicodeObject *self) 7027{ 7028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7029 register const Py_UNICODE *e; 7030 7031 /* Shortcut for single character strings */ 7032 if (PyUnicode_GET_SIZE(self) == 1 && 7033 Py_UNICODE_ISSPACE(*p)) 7034 return PyBool_FromLong(1); 7035 7036 /* Special case for empty strings */ 7037 if (PyUnicode_GET_SIZE(self) == 0) 7038 return PyBool_FromLong(0); 7039 7040 e = p + PyUnicode_GET_SIZE(self); 7041 for (; p < e; p++) { 7042 if (!Py_UNICODE_ISSPACE(*p)) 7043 return PyBool_FromLong(0); 7044 } 7045 return PyBool_FromLong(1); 7046} 7047 7048PyDoc_STRVAR(isalpha__doc__, 7049"S.isalpha() -> bool\n\ 7050\n\ 7051Return True if all characters in S are alphabetic\n\ 7052and there is at least one character in S, False otherwise."); 7053 7054static PyObject* 7055unicode_isalpha(PyUnicodeObject *self) 7056{ 7057 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7058 register const Py_UNICODE *e; 7059 7060 /* Shortcut for single character strings */ 7061 if (PyUnicode_GET_SIZE(self) == 1 && 7062 Py_UNICODE_ISALPHA(*p)) 7063 return PyBool_FromLong(1); 7064 7065 /* Special case for empty strings */ 7066 if (PyUnicode_GET_SIZE(self) == 0) 7067 return PyBool_FromLong(0); 7068 7069 e = p + PyUnicode_GET_SIZE(self); 7070 for (; p < e; p++) { 7071 if (!Py_UNICODE_ISALPHA(*p)) 7072 return PyBool_FromLong(0); 7073 } 7074 return PyBool_FromLong(1); 7075} 7076 7077PyDoc_STRVAR(isalnum__doc__, 7078"S.isalnum() -> bool\n\ 7079\n\ 7080Return True if all characters in S are alphanumeric\n\ 7081and there is at least one character in S, False otherwise."); 7082 7083static PyObject* 7084unicode_isalnum(PyUnicodeObject *self) 7085{ 7086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7087 register const Py_UNICODE *e; 7088 7089 /* Shortcut for single character strings */ 7090 if (PyUnicode_GET_SIZE(self) == 1 && 7091 Py_UNICODE_ISALNUM(*p)) 7092 return PyBool_FromLong(1); 7093 7094 /* Special case for empty strings */ 7095 if (PyUnicode_GET_SIZE(self) == 0) 7096 return PyBool_FromLong(0); 7097 7098 e = p + PyUnicode_GET_SIZE(self); 7099 for (; p < e; p++) { 7100 if (!Py_UNICODE_ISALNUM(*p)) 7101 return PyBool_FromLong(0); 7102 } 7103 return PyBool_FromLong(1); 7104} 7105 7106PyDoc_STRVAR(isdecimal__doc__, 7107"S.isdecimal() -> bool\n\ 7108\n\ 7109Return True if there are only decimal characters in S,\n\ 7110False otherwise."); 7111 7112static PyObject* 7113unicode_isdecimal(PyUnicodeObject *self) 7114{ 7115 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7116 register const Py_UNICODE *e; 7117 7118 /* Shortcut for single character strings */ 7119 if (PyUnicode_GET_SIZE(self) == 1 && 7120 Py_UNICODE_ISDECIMAL(*p)) 7121 return PyBool_FromLong(1); 7122 7123 /* Special case for empty strings */ 7124 if (PyUnicode_GET_SIZE(self) == 0) 7125 return PyBool_FromLong(0); 7126 7127 e = p + PyUnicode_GET_SIZE(self); 7128 for (; p < e; p++) { 7129 if (!Py_UNICODE_ISDECIMAL(*p)) 7130 return PyBool_FromLong(0); 7131 } 7132 return PyBool_FromLong(1); 7133} 7134 7135PyDoc_STRVAR(isdigit__doc__, 7136"S.isdigit() -> bool\n\ 7137\n\ 7138Return True if all characters in S are digits\n\ 7139and there is at least one character in S, False otherwise."); 7140 7141static PyObject* 7142unicode_isdigit(PyUnicodeObject *self) 7143{ 7144 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7145 register const Py_UNICODE *e; 7146 7147 /* Shortcut for single character strings */ 7148 if (PyUnicode_GET_SIZE(self) == 1 && 7149 Py_UNICODE_ISDIGIT(*p)) 7150 return PyBool_FromLong(1); 7151 7152 /* Special case for empty strings */ 7153 if (PyUnicode_GET_SIZE(self) == 0) 7154 return PyBool_FromLong(0); 7155 7156 e = p + PyUnicode_GET_SIZE(self); 7157 for (; p < e; p++) { 7158 if (!Py_UNICODE_ISDIGIT(*p)) 7159 return PyBool_FromLong(0); 7160 } 7161 return PyBool_FromLong(1); 7162} 7163 7164PyDoc_STRVAR(isnumeric__doc__, 7165"S.isnumeric() -> bool\n\ 7166\n\ 7167Return True if there are only numeric characters in S,\n\ 7168False otherwise."); 7169 7170static PyObject* 7171unicode_isnumeric(PyUnicodeObject *self) 7172{ 7173 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7174 register const Py_UNICODE *e; 7175 7176 /* Shortcut for single character strings */ 7177 if (PyUnicode_GET_SIZE(self) == 1 && 7178 Py_UNICODE_ISNUMERIC(*p)) 7179 return PyBool_FromLong(1); 7180 7181 /* Special case for empty strings */ 7182 if (PyUnicode_GET_SIZE(self) == 0) 7183 return PyBool_FromLong(0); 7184 7185 e = p + PyUnicode_GET_SIZE(self); 7186 for (; p < e; p++) { 7187 if (!Py_UNICODE_ISNUMERIC(*p)) 7188 return PyBool_FromLong(0); 7189 } 7190 return PyBool_FromLong(1); 7191} 7192 7193int 7194PyUnicode_IsIdentifier(PyObject *self) 7195{ 7196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7197 register const Py_UNICODE *e; 7198 7199 /* Special case for empty strings */ 7200 if (PyUnicode_GET_SIZE(self) == 0) 7201 return 0; 7202 7203 /* PEP 3131 says that the first character must be in 7204 XID_Start and subsequent characters in XID_Continue, 7205 and for the ASCII range, the 2.x rules apply (i.e 7206 start with letters and underscore, continue with 7207 letters, digits, underscore). However, given the current 7208 definition of XID_Start and XID_Continue, it is sufficient 7209 to check just for these, except that _ must be allowed 7210 as starting an identifier. */ 7211 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7212 return 0; 7213 7214 e = p + PyUnicode_GET_SIZE(self); 7215 for (p++; p < e; p++) { 7216 if (!_PyUnicode_IsXidContinue(*p)) 7217 return 0; 7218 } 7219 return 1; 7220} 7221 7222PyDoc_STRVAR(isidentifier__doc__, 7223"S.isidentifier() -> bool\n\ 7224\n\ 7225Return True if S is a valid identifier according\n\ 7226to the language definition."); 7227 7228static PyObject* 7229unicode_isidentifier(PyObject *self) 7230{ 7231 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7232} 7233 7234PyDoc_STRVAR(join__doc__, 7235"S.join(sequence) -> str\n\ 7236\n\ 7237Return a string which is the concatenation of the strings in the\n\ 7238sequence. The separator between elements is S."); 7239 7240static PyObject* 7241unicode_join(PyObject *self, PyObject *data) 7242{ 7243 return PyUnicode_Join(self, data); 7244} 7245 7246static Py_ssize_t 7247unicode_length(PyUnicodeObject *self) 7248{ 7249 return self->length; 7250} 7251 7252PyDoc_STRVAR(ljust__doc__, 7253"S.ljust(width[, fillchar]) -> str\n\ 7254\n\ 7255Return S left justified in a Unicode string of length width. Padding is\n\ 7256done using the specified fill character (default is a space)."); 7257 7258static PyObject * 7259unicode_ljust(PyUnicodeObject *self, PyObject *args) 7260{ 7261 Py_ssize_t width; 7262 Py_UNICODE fillchar = ' '; 7263 7264 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7265 return NULL; 7266 7267 if (self->length >= width && PyUnicode_CheckExact(self)) { 7268 Py_INCREF(self); 7269 return (PyObject*) self; 7270 } 7271 7272 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7273} 7274 7275PyDoc_STRVAR(lower__doc__, 7276"S.lower() -> str\n\ 7277\n\ 7278Return a copy of the string S converted to lowercase."); 7279 7280static PyObject* 7281unicode_lower(PyUnicodeObject *self) 7282{ 7283 return fixup(self, fixlower); 7284} 7285 7286#define LEFTSTRIP 0 7287#define RIGHTSTRIP 1 7288#define BOTHSTRIP 2 7289 7290/* Arrays indexed by above */ 7291static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7292 7293#define STRIPNAME(i) (stripformat[i]+3) 7294 7295/* externally visible for str.strip(unicode) */ 7296PyObject * 7297_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7298{ 7299 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7300 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7301 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7302 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7303 Py_ssize_t i, j; 7304 7305 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7306 7307 i = 0; 7308 if (striptype != RIGHTSTRIP) { 7309 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7310 i++; 7311 } 7312 } 7313 7314 j = len; 7315 if (striptype != LEFTSTRIP) { 7316 do { 7317 j--; 7318 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7319 j++; 7320 } 7321 7322 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7323 Py_INCREF(self); 7324 return (PyObject*)self; 7325 } 7326 else 7327 return PyUnicode_FromUnicode(s+i, j-i); 7328} 7329 7330 7331static PyObject * 7332do_strip(PyUnicodeObject *self, int striptype) 7333{ 7334 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7335 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7336 7337 i = 0; 7338 if (striptype != RIGHTSTRIP) { 7339 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7340 i++; 7341 } 7342 } 7343 7344 j = len; 7345 if (striptype != LEFTSTRIP) { 7346 do { 7347 j--; 7348 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7349 j++; 7350 } 7351 7352 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7353 Py_INCREF(self); 7354 return (PyObject*)self; 7355 } 7356 else 7357 return PyUnicode_FromUnicode(s+i, j-i); 7358} 7359 7360 7361static PyObject * 7362do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7363{ 7364 PyObject *sep = NULL; 7365 7366 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7367 return NULL; 7368 7369 if (sep != NULL && sep != Py_None) { 7370 if (PyUnicode_Check(sep)) 7371 return _PyUnicode_XStrip(self, striptype, sep); 7372 else { 7373 PyErr_Format(PyExc_TypeError, 7374 "%s arg must be None, unicode or str", 7375 STRIPNAME(striptype)); 7376 return NULL; 7377 } 7378 } 7379 7380 return do_strip(self, striptype); 7381} 7382 7383 7384PyDoc_STRVAR(strip__doc__, 7385"S.strip([chars]) -> str\n\ 7386\n\ 7387Return a copy of the string S with leading and trailing\n\ 7388whitespace removed.\n\ 7389If chars is given and not None, remove characters in chars instead.\n\ 7390If chars is a str, it will be converted to unicode before stripping"); 7391 7392static PyObject * 7393unicode_strip(PyUnicodeObject *self, PyObject *args) 7394{ 7395 if (PyTuple_GET_SIZE(args) == 0) 7396 return do_strip(self, BOTHSTRIP); /* Common case */ 7397 else 7398 return do_argstrip(self, BOTHSTRIP, args); 7399} 7400 7401 7402PyDoc_STRVAR(lstrip__doc__, 7403"S.lstrip([chars]) -> str\n\ 7404\n\ 7405Return a copy of the string S with leading whitespace removed.\n\ 7406If chars is given and not None, remove characters in chars instead.\n\ 7407If chars is a str, it will be converted to unicode before stripping"); 7408 7409static PyObject * 7410unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7411{ 7412 if (PyTuple_GET_SIZE(args) == 0) 7413 return do_strip(self, LEFTSTRIP); /* Common case */ 7414 else 7415 return do_argstrip(self, LEFTSTRIP, args); 7416} 7417 7418 7419PyDoc_STRVAR(rstrip__doc__, 7420"S.rstrip([chars]) -> str\n\ 7421\n\ 7422Return a copy of the string S with trailing whitespace removed.\n\ 7423If chars is given and not None, remove characters in chars instead.\n\ 7424If chars is a str, it will be converted to unicode before stripping"); 7425 7426static PyObject * 7427unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7428{ 7429 if (PyTuple_GET_SIZE(args) == 0) 7430 return do_strip(self, RIGHTSTRIP); /* Common case */ 7431 else 7432 return do_argstrip(self, RIGHTSTRIP, args); 7433} 7434 7435 7436static PyObject* 7437unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7438{ 7439 PyUnicodeObject *u; 7440 Py_UNICODE *p; 7441 Py_ssize_t nchars; 7442 size_t nbytes; 7443 7444 if (len < 0) 7445 len = 0; 7446 7447 if (len == 1 && PyUnicode_CheckExact(str)) { 7448 /* no repeat, return original string */ 7449 Py_INCREF(str); 7450 return (PyObject*) str; 7451 } 7452 7453 /* ensure # of chars needed doesn't overflow int and # of bytes 7454 * needed doesn't overflow size_t 7455 */ 7456 nchars = len * str->length; 7457 if (len && nchars / len != str->length) { 7458 PyErr_SetString(PyExc_OverflowError, 7459 "repeated string is too long"); 7460 return NULL; 7461 } 7462 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7463 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7464 PyErr_SetString(PyExc_OverflowError, 7465 "repeated string is too long"); 7466 return NULL; 7467 } 7468 u = _PyUnicode_New(nchars); 7469 if (!u) 7470 return NULL; 7471 7472 p = u->str; 7473 7474 if (str->length == 1 && len > 0) { 7475 Py_UNICODE_FILL(p, str->str[0], len); 7476 } else { 7477 Py_ssize_t done = 0; /* number of characters copied this far */ 7478 if (done < nchars) { 7479 Py_UNICODE_COPY(p, str->str, str->length); 7480 done = str->length; 7481 } 7482 while (done < nchars) { 7483 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 7484 Py_UNICODE_COPY(p+done, p, n); 7485 done += n; 7486 } 7487 } 7488 7489 return (PyObject*) u; 7490} 7491 7492PyObject *PyUnicode_Replace(PyObject *obj, 7493 PyObject *subobj, 7494 PyObject *replobj, 7495 Py_ssize_t maxcount) 7496{ 7497 PyObject *self; 7498 PyObject *str1; 7499 PyObject *str2; 7500 PyObject *result; 7501 7502 self = PyUnicode_FromObject(obj); 7503 if (self == NULL) 7504 return NULL; 7505 str1 = PyUnicode_FromObject(subobj); 7506 if (str1 == NULL) { 7507 Py_DECREF(self); 7508 return NULL; 7509 } 7510 str2 = PyUnicode_FromObject(replobj); 7511 if (str2 == NULL) { 7512 Py_DECREF(self); 7513 Py_DECREF(str1); 7514 return NULL; 7515 } 7516 result = replace((PyUnicodeObject *)self, 7517 (PyUnicodeObject *)str1, 7518 (PyUnicodeObject *)str2, 7519 maxcount); 7520 Py_DECREF(self); 7521 Py_DECREF(str1); 7522 Py_DECREF(str2); 7523 return result; 7524} 7525 7526PyDoc_STRVAR(replace__doc__, 7527"S.replace (old, new[, count]) -> str\n\ 7528\n\ 7529Return a copy of S with all occurrences of substring\n\ 7530old replaced by new. If the optional argument count is\n\ 7531given, only the first count occurrences are replaced."); 7532 7533static PyObject* 7534unicode_replace(PyUnicodeObject *self, PyObject *args) 7535{ 7536 PyUnicodeObject *str1; 7537 PyUnicodeObject *str2; 7538 Py_ssize_t maxcount = -1; 7539 PyObject *result; 7540 7541 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 7542 return NULL; 7543 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 7544 if (str1 == NULL) 7545 return NULL; 7546 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 7547 if (str2 == NULL) { 7548 Py_DECREF(str1); 7549 return NULL; 7550 } 7551 7552 result = replace(self, str1, str2, maxcount); 7553 7554 Py_DECREF(str1); 7555 Py_DECREF(str2); 7556 return result; 7557} 7558 7559static 7560PyObject *unicode_repr(PyObject *unicode) 7561{ 7562 PyObject *repr; 7563 Py_UNICODE *p; 7564 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7565 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7566 7567 /* XXX(nnorwitz): rather than over-allocating, it would be 7568 better to choose a different scheme. Perhaps scan the 7569 first N-chars of the string and allocate based on that size. 7570 */ 7571 /* Initial allocation is based on the longest-possible unichr 7572 escape. 7573 7574 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7575 unichr, so in this case it's the longest unichr escape. In 7576 narrow (UTF-16) builds this is five chars per source unichr 7577 since there are two unichrs in the surrogate pair, so in narrow 7578 (UTF-16) builds it's not the longest unichr escape. 7579 7580 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7581 so in the narrow (UTF-16) build case it's the longest unichr 7582 escape. 7583 */ 7584 7585 repr = PyUnicode_FromUnicode(NULL, 7586 2 /* quotes */ 7587#ifdef Py_UNICODE_WIDE 7588 + 10*size 7589#else 7590 + 6*size 7591#endif 7592 + 1); 7593 if (repr == NULL) 7594 return NULL; 7595 7596 p = PyUnicode_AS_UNICODE(repr); 7597 7598 /* Add quote */ 7599 *p++ = (findchar(s, size, '\'') && 7600 !findchar(s, size, '"')) ? '"' : '\''; 7601 while (size-- > 0) { 7602 Py_UNICODE ch = *s++; 7603 7604 /* Escape quotes and backslashes */ 7605 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7606 *p++ = '\\'; 7607 *p++ = ch; 7608 continue; 7609 } 7610 7611#ifdef Py_UNICODE_WIDE 7612 /* Map 21-bit characters to '\U00xxxxxx' */ 7613 else if (ch >= 0x10000) { 7614 *p++ = '\\'; 7615 *p++ = 'U'; 7616 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 7617 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 7618 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 7619 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 7620 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 7621 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 7622 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 7623 *p++ = hexdigits[ch & 0x0000000F]; 7624 continue; 7625 } 7626#else 7627 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 7628 else if (ch >= 0xD800 && ch < 0xDC00) { 7629 Py_UNICODE ch2; 7630 Py_UCS4 ucs; 7631 7632 ch2 = *s++; 7633 size--; 7634 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 7635 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 7636 *p++ = '\\'; 7637 *p++ = 'U'; 7638 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7639 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7640 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7641 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7642 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7643 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7644 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7645 *p++ = hexdigits[ucs & 0x0000000F]; 7646 continue; 7647 } 7648 /* Fall through: isolated surrogates are copied as-is */ 7649 s--; 7650 size++; 7651 } 7652#endif 7653 7654 /* Map 16-bit characters to '\uxxxx' */ 7655 if (ch >= 256) { 7656 *p++ = '\\'; 7657 *p++ = 'u'; 7658 *p++ = hexdigits[(ch >> 12) & 0x000F]; 7659 *p++ = hexdigits[(ch >> 8) & 0x000F]; 7660 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7661 *p++ = hexdigits[ch & 0x000F]; 7662 } 7663 7664 /* Map special whitespace to '\t', \n', '\r' */ 7665 else if (ch == '\t') { 7666 *p++ = '\\'; 7667 *p++ = 't'; 7668 } 7669 else if (ch == '\n') { 7670 *p++ = '\\'; 7671 *p++ = 'n'; 7672 } 7673 else if (ch == '\r') { 7674 *p++ = '\\'; 7675 *p++ = 'r'; 7676 } 7677 7678 /* Map non-printable US ASCII to '\xhh' */ 7679 else if (ch < ' ' || ch >= 0x7F) { 7680 *p++ = '\\'; 7681 *p++ = 'x'; 7682 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7683 *p++ = hexdigits[ch & 0x000F]; 7684 } 7685 7686 /* Copy everything else as-is */ 7687 else 7688 *p++ = (char) ch; 7689 } 7690 /* Add quote */ 7691 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7692 7693 *p = '\0'; 7694 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7695 return repr; 7696} 7697 7698PyDoc_STRVAR(rfind__doc__, 7699"S.rfind(sub[, start[, end]]) -> int\n\ 7700\n\ 7701Return the highest index in S where substring sub is found,\n\ 7702such that sub is contained within s[start:end]. Optional\n\ 7703arguments start and end are interpreted as in slice notation.\n\ 7704\n\ 7705Return -1 on failure."); 7706 7707static PyObject * 7708unicode_rfind(PyUnicodeObject *self, PyObject *args) 7709{ 7710 PyObject *substring; 7711 Py_ssize_t start; 7712 Py_ssize_t end; 7713 Py_ssize_t result; 7714 7715 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7716 return NULL; 7717 7718 result = stringlib_rfind_slice( 7719 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7720 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7721 start, end 7722 ); 7723 7724 Py_DECREF(substring); 7725 7726 return PyLong_FromSsize_t(result); 7727} 7728 7729PyDoc_STRVAR(rindex__doc__, 7730"S.rindex(sub[, start[, end]]) -> int\n\ 7731\n\ 7732Like S.rfind() but raise ValueError when the substring is not found."); 7733 7734static PyObject * 7735unicode_rindex(PyUnicodeObject *self, PyObject *args) 7736{ 7737 PyObject *substring; 7738 Py_ssize_t start; 7739 Py_ssize_t end; 7740 Py_ssize_t result; 7741 7742 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7743 return NULL; 7744 7745 result = stringlib_rfind_slice( 7746 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7747 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7748 start, end 7749 ); 7750 7751 Py_DECREF(substring); 7752 7753 if (result < 0) { 7754 PyErr_SetString(PyExc_ValueError, "substring not found"); 7755 return NULL; 7756 } 7757 return PyLong_FromSsize_t(result); 7758} 7759 7760PyDoc_STRVAR(rjust__doc__, 7761"S.rjust(width[, fillchar]) -> str\n\ 7762\n\ 7763Return S right justified in a Unicode string of length width. Padding is\n\ 7764done using the specified fill character (default is a space)."); 7765 7766static PyObject * 7767unicode_rjust(PyUnicodeObject *self, PyObject *args) 7768{ 7769 Py_ssize_t width; 7770 Py_UNICODE fillchar = ' '; 7771 7772 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7773 return NULL; 7774 7775 if (self->length >= width && PyUnicode_CheckExact(self)) { 7776 Py_INCREF(self); 7777 return (PyObject*) self; 7778 } 7779 7780 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7781} 7782 7783PyObject *PyUnicode_Split(PyObject *s, 7784 PyObject *sep, 7785 Py_ssize_t maxsplit) 7786{ 7787 PyObject *result; 7788 7789 s = PyUnicode_FromObject(s); 7790 if (s == NULL) 7791 return NULL; 7792 if (sep != NULL) { 7793 sep = PyUnicode_FromObject(sep); 7794 if (sep == NULL) { 7795 Py_DECREF(s); 7796 return NULL; 7797 } 7798 } 7799 7800 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7801 7802 Py_DECREF(s); 7803 Py_XDECREF(sep); 7804 return result; 7805} 7806 7807PyDoc_STRVAR(split__doc__, 7808"S.split([sep[, maxsplit]]) -> list of strings\n\ 7809\n\ 7810Return a list of the words in S, using sep as the\n\ 7811delimiter string. If maxsplit is given, at most maxsplit\n\ 7812splits are done. If sep is not specified or is None, any\n\ 7813whitespace string is a separator and empty strings are\n\ 7814removed from the result."); 7815 7816static PyObject* 7817unicode_split(PyUnicodeObject *self, PyObject *args) 7818{ 7819 PyObject *substring = Py_None; 7820 Py_ssize_t maxcount = -1; 7821 7822 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7823 return NULL; 7824 7825 if (substring == Py_None) 7826 return split(self, NULL, maxcount); 7827 else if (PyUnicode_Check(substring)) 7828 return split(self, (PyUnicodeObject *)substring, maxcount); 7829 else 7830 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7831} 7832 7833PyObject * 7834PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7835{ 7836 PyObject* str_obj; 7837 PyObject* sep_obj; 7838 PyObject* out; 7839 7840 str_obj = PyUnicode_FromObject(str_in); 7841 if (!str_obj) 7842 return NULL; 7843 sep_obj = PyUnicode_FromObject(sep_in); 7844 if (!sep_obj) { 7845 Py_DECREF(str_obj); 7846 return NULL; 7847 } 7848 7849 out = stringlib_partition( 7850 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7851 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7852 ); 7853 7854 Py_DECREF(sep_obj); 7855 Py_DECREF(str_obj); 7856 7857 return out; 7858} 7859 7860 7861PyObject * 7862PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7863{ 7864 PyObject* str_obj; 7865 PyObject* sep_obj; 7866 PyObject* out; 7867 7868 str_obj = PyUnicode_FromObject(str_in); 7869 if (!str_obj) 7870 return NULL; 7871 sep_obj = PyUnicode_FromObject(sep_in); 7872 if (!sep_obj) { 7873 Py_DECREF(str_obj); 7874 return NULL; 7875 } 7876 7877 out = stringlib_rpartition( 7878 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7879 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7880 ); 7881 7882 Py_DECREF(sep_obj); 7883 Py_DECREF(str_obj); 7884 7885 return out; 7886} 7887 7888PyDoc_STRVAR(partition__doc__, 7889"S.partition(sep) -> (head, sep, tail)\n\ 7890\n\ 7891Search for the separator sep in S, and return the part before it,\n\ 7892the separator itself, and the part after it. If the separator is not\n\ 7893found, returns S and two empty strings."); 7894 7895static PyObject* 7896unicode_partition(PyUnicodeObject *self, PyObject *separator) 7897{ 7898 return PyUnicode_Partition((PyObject *)self, separator); 7899} 7900 7901PyDoc_STRVAR(rpartition__doc__, 7902"S.rpartition(sep) -> (tail, sep, head)\n\ 7903\n\ 7904Search for the separator sep in S, starting at the end of S, and return\n\ 7905the part before it, the separator itself, and the part after it. If the\n\ 7906separator is not found, returns two empty strings and S."); 7907 7908static PyObject* 7909unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7910{ 7911 return PyUnicode_RPartition((PyObject *)self, separator); 7912} 7913 7914PyObject *PyUnicode_RSplit(PyObject *s, 7915 PyObject *sep, 7916 Py_ssize_t maxsplit) 7917{ 7918 PyObject *result; 7919 7920 s = PyUnicode_FromObject(s); 7921 if (s == NULL) 7922 return NULL; 7923 if (sep != NULL) { 7924 sep = PyUnicode_FromObject(sep); 7925 if (sep == NULL) { 7926 Py_DECREF(s); 7927 return NULL; 7928 } 7929 } 7930 7931 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7932 7933 Py_DECREF(s); 7934 Py_XDECREF(sep); 7935 return result; 7936} 7937 7938PyDoc_STRVAR(rsplit__doc__, 7939"S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 7940\n\ 7941Return a list of the words in S, using sep as the\n\ 7942delimiter string, starting at the end of the string and\n\ 7943working to the front. If maxsplit is given, at most maxsplit\n\ 7944splits are done. If sep is not specified, any whitespace string\n\ 7945is a separator."); 7946 7947static PyObject* 7948unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7949{ 7950 PyObject *substring = Py_None; 7951 Py_ssize_t maxcount = -1; 7952 7953 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7954 return NULL; 7955 7956 if (substring == Py_None) 7957 return rsplit(self, NULL, maxcount); 7958 else if (PyUnicode_Check(substring)) 7959 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7960 else 7961 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7962} 7963 7964PyDoc_STRVAR(splitlines__doc__, 7965"S.splitlines([keepends]]) -> list of strings\n\ 7966\n\ 7967Return a list of the lines in S, breaking at line boundaries.\n\ 7968Line breaks are not included in the resulting list unless keepends\n\ 7969is given and true."); 7970 7971static PyObject* 7972unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7973{ 7974 int keepends = 0; 7975 7976 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7977 return NULL; 7978 7979 return PyUnicode_Splitlines((PyObject *)self, keepends); 7980} 7981 7982static 7983PyObject *unicode_str(PyObject *self) 7984{ 7985 if (PyUnicode_CheckExact(self)) { 7986 Py_INCREF(self); 7987 return self; 7988 } else 7989 /* Subtype -- return genuine unicode string with the same value. */ 7990 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 7991 PyUnicode_GET_SIZE(self)); 7992} 7993 7994PyDoc_STRVAR(swapcase__doc__, 7995"S.swapcase() -> str\n\ 7996\n\ 7997Return a copy of S with uppercase characters converted to lowercase\n\ 7998and vice versa."); 7999 8000static PyObject* 8001unicode_swapcase(PyUnicodeObject *self) 8002{ 8003 return fixup(self, fixswapcase); 8004} 8005 8006PyDoc_STRVAR(maketrans__doc__, 8007"str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8008\n\ 8009Return a translation table usable for str.translate().\n\ 8010If there is only one argument, it must be a dictionary mapping Unicode\n\ 8011ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8012Character keys will then be converted to ordinals.\n\ 8013If there are two arguments, they must be strings of equal length, and\n\ 8014in the resulting dictionary, each character in x will be mapped to the\n\ 8015character at the same position in y. If there is a third argument, it\n\ 8016must be a string, whose characters will be mapped to None in the result."); 8017 8018static PyObject* 8019unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8020{ 8021 PyObject *x, *y = NULL, *z = NULL; 8022 PyObject *new = NULL, *key, *value; 8023 Py_ssize_t i = 0; 8024 int res; 8025 8026 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8027 return NULL; 8028 new = PyDict_New(); 8029 if (!new) 8030 return NULL; 8031 if (y != NULL) { 8032 /* x must be a string too, of equal length */ 8033 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8034 if (!PyUnicode_Check(x)) { 8035 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8036 "be a string if there is a second argument"); 8037 goto err; 8038 } 8039 if (PyUnicode_GET_SIZE(x) != ylen) { 8040 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8041 "arguments must have equal length"); 8042 goto err; 8043 } 8044 /* create entries for translating chars in x to those in y */ 8045 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) { 8046 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8047 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8048 if (!key || !value) 8049 goto err; 8050 res = PyDict_SetItem(new, key, value); 8051 Py_DECREF(key); 8052 Py_DECREF(value); 8053 if (res < 0) 8054 goto err; 8055 } 8056 /* create entries for deleting chars in z */ 8057 if (z != NULL) { 8058 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8059 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8060 if (!key) 8061 goto err; 8062 res = PyDict_SetItem(new, key, Py_None); 8063 Py_DECREF(key); 8064 if (res < 0) 8065 goto err; 8066 } 8067 } 8068 } else { 8069 /* x must be a dict */ 8070 if (!PyDict_Check(x)) { 8071 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8072 "to maketrans it must be a dict"); 8073 goto err; 8074 } 8075 /* copy entries into the new dict, converting string keys to int keys */ 8076 while (PyDict_Next(x, &i, &key, &value)) { 8077 if (PyUnicode_Check(key)) { 8078 /* convert string keys to integer keys */ 8079 PyObject *newkey; 8080 if (PyUnicode_GET_SIZE(key) != 1) { 8081 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8082 "table must be of length 1"); 8083 goto err; 8084 } 8085 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8086 if (!newkey) 8087 goto err; 8088 res = PyDict_SetItem(new, newkey, value); 8089 Py_DECREF(newkey); 8090 if (res < 0) 8091 goto err; 8092 } else if (PyLong_Check(key)) { 8093 /* just keep integer keys */ 8094 if (PyDict_SetItem(new, key, value) < 0) 8095 goto err; 8096 } else { 8097 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8098 "be strings or integers"); 8099 goto err; 8100 } 8101 } 8102 } 8103 return new; 8104 err: 8105 Py_DECREF(new); 8106 return NULL; 8107} 8108 8109PyDoc_STRVAR(translate__doc__, 8110"S.translate(table) -> str\n\ 8111\n\ 8112Return a copy of the string S, where all characters have been mapped\n\ 8113through the given translation table, which must be a mapping of\n\ 8114Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 8115Unmapped characters are left untouched. Characters mapped to None\n\ 8116are deleted."); 8117 8118static PyObject* 8119unicode_translate(PyUnicodeObject *self, PyObject *table) 8120{ 8121 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8122} 8123 8124PyDoc_STRVAR(upper__doc__, 8125"S.upper() -> str\n\ 8126\n\ 8127Return a copy of S converted to uppercase."); 8128 8129static PyObject* 8130unicode_upper(PyUnicodeObject *self) 8131{ 8132 return fixup(self, fixupper); 8133} 8134 8135PyDoc_STRVAR(zfill__doc__, 8136"S.zfill(width) -> str\n\ 8137\n\ 8138Pad a numeric string x with zeros on the left, to fill a field\n\ 8139of the specified width. The string x is never truncated."); 8140 8141static PyObject * 8142unicode_zfill(PyUnicodeObject *self, PyObject *args) 8143{ 8144 Py_ssize_t fill; 8145 PyUnicodeObject *u; 8146 8147 Py_ssize_t width; 8148 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8149 return NULL; 8150 8151 if (self->length >= width) { 8152 if (PyUnicode_CheckExact(self)) { 8153 Py_INCREF(self); 8154 return (PyObject*) self; 8155 } 8156 else 8157 return PyUnicode_FromUnicode( 8158 PyUnicode_AS_UNICODE(self), 8159 PyUnicode_GET_SIZE(self) 8160 ); 8161 } 8162 8163 fill = width - self->length; 8164 8165 u = pad(self, fill, 0, '0'); 8166 8167 if (u == NULL) 8168 return NULL; 8169 8170 if (u->str[fill] == '+' || u->str[fill] == '-') { 8171 /* move sign to beginning of string */ 8172 u->str[0] = u->str[fill]; 8173 u->str[fill] = '0'; 8174 } 8175 8176 return (PyObject*) u; 8177} 8178 8179#if 0 8180static PyObject* 8181unicode_freelistsize(PyUnicodeObject *self) 8182{ 8183 return PyLong_FromLong(numfree); 8184} 8185#endif 8186 8187PyDoc_STRVAR(startswith__doc__, 8188"S.startswith(prefix[, start[, end]]) -> bool\n\ 8189\n\ 8190Return True if S starts with the specified prefix, False otherwise.\n\ 8191With optional start, test S beginning at that position.\n\ 8192With optional end, stop comparing S at that position.\n\ 8193prefix can also be a tuple of strings to try."); 8194 8195static PyObject * 8196unicode_startswith(PyUnicodeObject *self, 8197 PyObject *args) 8198{ 8199 PyObject *subobj; 8200 PyUnicodeObject *substring; 8201 Py_ssize_t start = 0; 8202 Py_ssize_t end = PY_SSIZE_T_MAX; 8203 int result; 8204 8205 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8206 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8207 return NULL; 8208 if (PyTuple_Check(subobj)) { 8209 Py_ssize_t i; 8210 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8211 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8212 PyTuple_GET_ITEM(subobj, i)); 8213 if (substring == NULL) 8214 return NULL; 8215 result = tailmatch(self, substring, start, end, -1); 8216 Py_DECREF(substring); 8217 if (result) { 8218 Py_RETURN_TRUE; 8219 } 8220 } 8221 /* nothing matched */ 8222 Py_RETURN_FALSE; 8223 } 8224 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8225 if (substring == NULL) 8226 return NULL; 8227 result = tailmatch(self, substring, start, end, -1); 8228 Py_DECREF(substring); 8229 return PyBool_FromLong(result); 8230} 8231 8232 8233PyDoc_STRVAR(endswith__doc__, 8234"S.endswith(suffix[, start[, end]]) -> bool\n\ 8235\n\ 8236Return True if S ends with the specified suffix, False otherwise.\n\ 8237With optional start, test S beginning at that position.\n\ 8238With optional end, stop comparing S at that position.\n\ 8239suffix can also be a tuple of strings to try."); 8240 8241static PyObject * 8242unicode_endswith(PyUnicodeObject *self, 8243 PyObject *args) 8244{ 8245 PyObject *subobj; 8246 PyUnicodeObject *substring; 8247 Py_ssize_t start = 0; 8248 Py_ssize_t end = PY_SSIZE_T_MAX; 8249 int result; 8250 8251 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8252 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8253 return NULL; 8254 if (PyTuple_Check(subobj)) { 8255 Py_ssize_t i; 8256 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8257 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8258 PyTuple_GET_ITEM(subobj, i)); 8259 if (substring == NULL) 8260 return NULL; 8261 result = tailmatch(self, substring, start, end, +1); 8262 Py_DECREF(substring); 8263 if (result) { 8264 Py_RETURN_TRUE; 8265 } 8266 } 8267 Py_RETURN_FALSE; 8268 } 8269 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8270 if (substring == NULL) 8271 return NULL; 8272 8273 result = tailmatch(self, substring, start, end, +1); 8274 Py_DECREF(substring); 8275 return PyBool_FromLong(result); 8276} 8277 8278#include "stringlib/string_format.h" 8279 8280PyDoc_STRVAR(format__doc__, 8281"S.format(*args, **kwargs) -> str\n\ 8282\n\ 8283"); 8284 8285static PyObject * 8286unicode__format__(PyObject* self, PyObject* args) 8287{ 8288 PyObject *format_spec; 8289 8290 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 8291 return NULL; 8292 8293 return _PyUnicode_FormatAdvanced(self, 8294 PyUnicode_AS_UNICODE(format_spec), 8295 PyUnicode_GET_SIZE(format_spec)); 8296} 8297 8298PyDoc_STRVAR(p_format__doc__, 8299"S.__format__(format_spec) -> str\n\ 8300\n\ 8301"); 8302 8303static PyObject * 8304unicode__sizeof__(PyUnicodeObject *v) 8305{ 8306 PyObject *res = NULL, *defsize = NULL; 8307 8308 res = PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 8309 sizeof(Py_UNICODE) * (v->length + 1)); 8310 if (v->defenc) { 8311 defsize = PyObject_CallMethod(v->defenc, "__sizeof__", NULL); 8312 if (defsize == NULL) { 8313 Py_DECREF(res); 8314 return NULL; 8315 } 8316 res = PyNumber_Add(res, defsize); 8317 Py_DECREF(defsize); 8318 } 8319 return res; 8320} 8321 8322PyDoc_STRVAR(sizeof__doc__, 8323"S.__sizeof__() -> size of S in memory, in bytes"); 8324 8325static PyObject * 8326unicode_getnewargs(PyUnicodeObject *v) 8327{ 8328 return Py_BuildValue("(u#)", v->str, v->length); 8329} 8330 8331 8332static PyMethodDef unicode_methods[] = { 8333 8334 /* Order is according to common usage: often used methods should 8335 appear first, since lookup is done sequentially. */ 8336 8337 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 8338 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8339 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8340 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8341 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8342 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8343 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8344 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8345 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8346 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8347 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8348 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8349 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8350 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8351 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8352 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8353 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8354 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8355 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8356 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8357 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8358 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8359 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8360 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8361 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8362 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8363 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8364 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8365 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8366 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8367 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8368 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8369 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8370 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8371 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8372 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8373 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8374 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8375 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8376 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8377 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 8378 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8379 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8380 {"maketrans", (PyCFunction) unicode_maketrans, 8381 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8382 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 8383#if 0 8384 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8385#endif 8386 8387#if 0 8388 /* This one is just used for debugging the implementation. */ 8389 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8390#endif 8391 8392 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8393 {NULL, NULL} 8394}; 8395 8396static PyObject * 8397unicode_mod(PyObject *v, PyObject *w) 8398{ 8399 if (!PyUnicode_Check(v)) { 8400 Py_INCREF(Py_NotImplemented); 8401 return Py_NotImplemented; 8402 } 8403 return PyUnicode_Format(v, w); 8404} 8405 8406static PyNumberMethods unicode_as_number = { 8407 0, /*nb_add*/ 8408 0, /*nb_subtract*/ 8409 0, /*nb_multiply*/ 8410 unicode_mod, /*nb_remainder*/ 8411}; 8412 8413static PySequenceMethods unicode_as_sequence = { 8414 (lenfunc) unicode_length, /* sq_length */ 8415 PyUnicode_Concat, /* sq_concat */ 8416 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8417 (ssizeargfunc) unicode_getitem, /* sq_item */ 8418 0, /* sq_slice */ 8419 0, /* sq_ass_item */ 8420 0, /* sq_ass_slice */ 8421 PyUnicode_Contains, /* sq_contains */ 8422}; 8423 8424static PyObject* 8425unicode_subscript(PyUnicodeObject* self, PyObject* item) 8426{ 8427 if (PyIndex_Check(item)) { 8428 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8429 if (i == -1 && PyErr_Occurred()) 8430 return NULL; 8431 if (i < 0) 8432 i += PyUnicode_GET_SIZE(self); 8433 return unicode_getitem(self, i); 8434 } else if (PySlice_Check(item)) { 8435 Py_ssize_t start, stop, step, slicelength, cur, i; 8436 Py_UNICODE* source_buf; 8437 Py_UNICODE* result_buf; 8438 PyObject* result; 8439 8440 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8441 &start, &stop, &step, &slicelength) < 0) { 8442 return NULL; 8443 } 8444 8445 if (slicelength <= 0) { 8446 return PyUnicode_FromUnicode(NULL, 0); 8447 } else if (start == 0 && step == 1 && slicelength == self->length && 8448 PyUnicode_CheckExact(self)) { 8449 Py_INCREF(self); 8450 return (PyObject *)self; 8451 } else if (step == 1) { 8452 return PyUnicode_FromUnicode(self->str + start, slicelength); 8453 } else { 8454 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8455 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8456 sizeof(Py_UNICODE)); 8457 8458 if (result_buf == NULL) 8459 return PyErr_NoMemory(); 8460 8461 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8462 result_buf[i] = source_buf[cur]; 8463 } 8464 8465 result = PyUnicode_FromUnicode(result_buf, slicelength); 8466 PyObject_FREE(result_buf); 8467 return result; 8468 } 8469 } else { 8470 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8471 return NULL; 8472 } 8473} 8474 8475static PyMappingMethods unicode_as_mapping = { 8476 (lenfunc)unicode_length, /* mp_length */ 8477 (binaryfunc)unicode_subscript, /* mp_subscript */ 8478 (objobjargproc)0, /* mp_ass_subscript */ 8479}; 8480 8481 8482/* Helpers for PyUnicode_Format() */ 8483 8484static PyObject * 8485getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8486{ 8487 Py_ssize_t argidx = *p_argidx; 8488 if (argidx < arglen) { 8489 (*p_argidx)++; 8490 if (arglen < 0) 8491 return args; 8492 else 8493 return PyTuple_GetItem(args, argidx); 8494 } 8495 PyErr_SetString(PyExc_TypeError, 8496 "not enough arguments for format string"); 8497 return NULL; 8498} 8499 8500static Py_ssize_t 8501strtounicode(Py_UNICODE *buffer, const char *charbuffer) 8502{ 8503 register Py_ssize_t i; 8504 Py_ssize_t len = strlen(charbuffer); 8505 for (i = len - 1; i >= 0; i--) 8506 buffer[i] = (Py_UNICODE) charbuffer[i]; 8507 8508 return len; 8509} 8510 8511static int 8512doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 8513{ 8514 Py_ssize_t result; 8515 8516 PyOS_ascii_formatd((char *)buffer, len, format, x); 8517 result = strtounicode(buffer, (char *)buffer); 8518 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8519} 8520 8521#if 0 8522static int 8523longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8524{ 8525 Py_ssize_t result; 8526 8527 PyOS_snprintf((char *)buffer, len, format, x); 8528 result = strtounicode(buffer, (char *)buffer); 8529 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 8530} 8531#endif 8532 8533/* XXX To save some code duplication, formatfloat/long/int could have been 8534 shared with stringobject.c, converting from 8-bit to Unicode after the 8535 formatting is done. */ 8536 8537static int 8538formatfloat(Py_UNICODE *buf, 8539 size_t buflen, 8540 int flags, 8541 int prec, 8542 int type, 8543 PyObject *v) 8544{ 8545 /* fmt = '%#.' + `prec` + `type` 8546 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8547 char fmt[20]; 8548 double x; 8549 8550 x = PyFloat_AsDouble(v); 8551 if (x == -1.0 && PyErr_Occurred()) 8552 return -1; 8553 if (prec < 0) 8554 prec = 6; 8555 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 8556 type = 'g'; 8557 /* Worst case length calc to ensure no buffer overrun: 8558 8559 'g' formats: 8560 fmt = %#.<prec>g 8561 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8562 for any double rep.) 8563 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8564 8565 'f' formats: 8566 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8567 len = 1 + 50 + 1 + prec = 52 + prec 8568 8569 If prec=0 the effective precision is 1 (the leading digit is 8570 always given), therefore increase the length by one. 8571 8572 */ 8573 if (((type == 'g' || type == 'G') && 8574 buflen <= (size_t)10 + (size_t)prec) || 8575 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8576 PyErr_SetString(PyExc_OverflowError, 8577 "formatted float is too long (precision too large?)"); 8578 return -1; 8579 } 8580 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8581 (flags&F_ALT) ? "#" : "", 8582 prec, type); 8583 return doubletounicode(buf, buflen, fmt, x); 8584} 8585 8586static PyObject* 8587formatlong(PyObject *val, int flags, int prec, int type) 8588{ 8589 char *buf; 8590 int len; 8591 PyObject *str; /* temporary string object. */ 8592 PyObject *result; 8593 8594 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 8595 if (!str) 8596 return NULL; 8597 result = PyUnicode_FromStringAndSize(buf, len); 8598 Py_DECREF(str); 8599 return result; 8600} 8601 8602#if 0 8603static int 8604formatint(Py_UNICODE *buf, 8605 size_t buflen, 8606 int flags, 8607 int prec, 8608 int type, 8609 PyObject *v) 8610{ 8611 /* fmt = '%#.' + `prec` + 'l' + `type` 8612 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 8613 * + 1 + 1 8614 * = 24 8615 */ 8616 char fmt[64]; /* plenty big enough! */ 8617 char *sign; 8618 long x; 8619 8620 x = PyLong_AsLong(v); 8621 if (x == -1 && PyErr_Occurred()) 8622 return -1; 8623 if (x < 0 && type == 'u') { 8624 type = 'd'; 8625 } 8626 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 8627 sign = "-"; 8628 else 8629 sign = ""; 8630 if (prec < 0) 8631 prec = 1; 8632 8633 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8634 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8635 */ 8636 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8637 PyErr_SetString(PyExc_OverflowError, 8638 "formatted integer is too long (precision too large?)"); 8639 return -1; 8640 } 8641 8642 if ((flags & F_ALT) && 8643 (type == 'x' || type == 'X' || type == 'o')) { 8644 /* When converting under %#o, %#x or %#X, there are a number 8645 * of issues that cause pain: 8646 * - for %#o, we want a different base marker than C 8647 * - when 0 is being converted, the C standard leaves off 8648 * the '0x' or '0X', which is inconsistent with other 8649 * %#x/%#X conversions and inconsistent with Python's 8650 * hex() function 8651 * - there are platforms that violate the standard and 8652 * convert 0 with the '0x' or '0X' 8653 * (Metrowerks, Compaq Tru64) 8654 * - there are platforms that give '0x' when converting 8655 * under %#X, but convert 0 in accordance with the 8656 * standard (OS/2 EMX) 8657 * 8658 * We can achieve the desired consistency by inserting our 8659 * own '0x' or '0X' prefix, and substituting %x/%X in place 8660 * of %#x/%#X. 8661 * 8662 * Note that this is the same approach as used in 8663 * formatint() in stringobject.c 8664 */ 8665 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8666 sign, type, prec, type); 8667 } 8668 else { 8669 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8670 sign, (flags&F_ALT) ? "#" : "", 8671 prec, type); 8672 } 8673 if (sign[0]) 8674 return longtounicode(buf, buflen, fmt, -x); 8675 else 8676 return longtounicode(buf, buflen, fmt, x); 8677} 8678#endif 8679 8680static int 8681formatchar(Py_UNICODE *buf, 8682 size_t buflen, 8683 PyObject *v) 8684{ 8685 /* presume that the buffer is at least 2 characters long */ 8686 if (PyUnicode_Check(v)) { 8687 if (PyUnicode_GET_SIZE(v) != 1) 8688 goto onError; 8689 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8690 } 8691 else { 8692 /* Integer input truncated to a character */ 8693 long x; 8694 x = PyLong_AsLong(v); 8695 if (x == -1 && PyErr_Occurred()) 8696 goto onError; 8697#ifdef Py_UNICODE_WIDE 8698 if (x < 0 || x > 0x10ffff) { 8699 PyErr_SetString(PyExc_OverflowError, 8700 "%c arg not in range(0x110000) " 8701 "(wide Python build)"); 8702 return -1; 8703 } 8704#else 8705 if (x < 0 || x > 0xffff) { 8706 PyErr_SetString(PyExc_OverflowError, 8707 "%c arg not in range(0x10000) " 8708 "(narrow Python build)"); 8709 return -1; 8710 } 8711#endif 8712 buf[0] = (Py_UNICODE) x; 8713 } 8714 buf[1] = '\0'; 8715 return 1; 8716 8717 onError: 8718 PyErr_SetString(PyExc_TypeError, 8719 "%c requires int or char"); 8720 return -1; 8721} 8722 8723/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8724 8725 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8726 chars are formatted. XXX This is a magic number. Each formatting 8727 routine does bounds checking to ensure no overflow, but a better 8728 solution may be to malloc a buffer of appropriate size for each 8729 format. For now, the current solution is sufficient. 8730*/ 8731#define FORMATBUFLEN (size_t)120 8732 8733PyObject *PyUnicode_Format(PyObject *format, 8734 PyObject *args) 8735{ 8736 Py_UNICODE *fmt, *res; 8737 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8738 int args_owned = 0; 8739 PyUnicodeObject *result = NULL; 8740 PyObject *dict = NULL; 8741 PyObject *uformat; 8742 8743 if (format == NULL || args == NULL) { 8744 PyErr_BadInternalCall(); 8745 return NULL; 8746 } 8747 uformat = PyUnicode_FromObject(format); 8748 if (uformat == NULL) 8749 return NULL; 8750 fmt = PyUnicode_AS_UNICODE(uformat); 8751 fmtcnt = PyUnicode_GET_SIZE(uformat); 8752 8753 reslen = rescnt = fmtcnt + 100; 8754 result = _PyUnicode_New(reslen); 8755 if (result == NULL) 8756 goto onError; 8757 res = PyUnicode_AS_UNICODE(result); 8758 8759 if (PyTuple_Check(args)) { 8760 arglen = PyTuple_Size(args); 8761 argidx = 0; 8762 } 8763 else { 8764 arglen = -1; 8765 argidx = -2; 8766 } 8767 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 8768 !PyUnicode_Check(args)) 8769 dict = args; 8770 8771 while (--fmtcnt >= 0) { 8772 if (*fmt != '%') { 8773 if (--rescnt < 0) { 8774 rescnt = fmtcnt + 100; 8775 reslen += rescnt; 8776 if (_PyUnicode_Resize(&result, reslen) < 0) 8777 goto onError; 8778 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8779 --rescnt; 8780 } 8781 *res++ = *fmt++; 8782 } 8783 else { 8784 /* Got a format specifier */ 8785 int flags = 0; 8786 Py_ssize_t width = -1; 8787 int prec = -1; 8788 Py_UNICODE c = '\0'; 8789 Py_UNICODE fill; 8790 int isnumok; 8791 PyObject *v = NULL; 8792 PyObject *temp = NULL; 8793 Py_UNICODE *pbuf; 8794 Py_UNICODE sign; 8795 Py_ssize_t len; 8796 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8797 8798 fmt++; 8799 if (*fmt == '(') { 8800 Py_UNICODE *keystart; 8801 Py_ssize_t keylen; 8802 PyObject *key; 8803 int pcount = 1; 8804 8805 if (dict == NULL) { 8806 PyErr_SetString(PyExc_TypeError, 8807 "format requires a mapping"); 8808 goto onError; 8809 } 8810 ++fmt; 8811 --fmtcnt; 8812 keystart = fmt; 8813 /* Skip over balanced parentheses */ 8814 while (pcount > 0 && --fmtcnt >= 0) { 8815 if (*fmt == ')') 8816 --pcount; 8817 else if (*fmt == '(') 8818 ++pcount; 8819 fmt++; 8820 } 8821 keylen = fmt - keystart - 1; 8822 if (fmtcnt < 0 || pcount > 0) { 8823 PyErr_SetString(PyExc_ValueError, 8824 "incomplete format key"); 8825 goto onError; 8826 } 8827#if 0 8828 /* keys are converted to strings using UTF-8 and 8829 then looked up since Python uses strings to hold 8830 variables names etc. in its namespaces and we 8831 wouldn't want to break common idioms. */ 8832 key = PyUnicode_EncodeUTF8(keystart, 8833 keylen, 8834 NULL); 8835#else 8836 key = PyUnicode_FromUnicode(keystart, keylen); 8837#endif 8838 if (key == NULL) 8839 goto onError; 8840 if (args_owned) { 8841 Py_DECREF(args); 8842 args_owned = 0; 8843 } 8844 args = PyObject_GetItem(dict, key); 8845 Py_DECREF(key); 8846 if (args == NULL) { 8847 goto onError; 8848 } 8849 args_owned = 1; 8850 arglen = -1; 8851 argidx = -2; 8852 } 8853 while (--fmtcnt >= 0) { 8854 switch (c = *fmt++) { 8855 case '-': flags |= F_LJUST; continue; 8856 case '+': flags |= F_SIGN; continue; 8857 case ' ': flags |= F_BLANK; continue; 8858 case '#': flags |= F_ALT; continue; 8859 case '0': flags |= F_ZERO; continue; 8860 } 8861 break; 8862 } 8863 if (c == '*') { 8864 v = getnextarg(args, arglen, &argidx); 8865 if (v == NULL) 8866 goto onError; 8867 if (!PyLong_Check(v)) { 8868 PyErr_SetString(PyExc_TypeError, 8869 "* wants int"); 8870 goto onError; 8871 } 8872 width = PyLong_AsLong(v); 8873 if (width == -1 && PyErr_Occurred()) 8874 goto onError; 8875 if (width < 0) { 8876 flags |= F_LJUST; 8877 width = -width; 8878 } 8879 if (--fmtcnt >= 0) 8880 c = *fmt++; 8881 } 8882 else if (c >= '0' && c <= '9') { 8883 width = c - '0'; 8884 while (--fmtcnt >= 0) { 8885 c = *fmt++; 8886 if (c < '0' || c > '9') 8887 break; 8888 if ((width*10) / 10 != width) { 8889 PyErr_SetString(PyExc_ValueError, 8890 "width too big"); 8891 goto onError; 8892 } 8893 width = width*10 + (c - '0'); 8894 } 8895 } 8896 if (c == '.') { 8897 prec = 0; 8898 if (--fmtcnt >= 0) 8899 c = *fmt++; 8900 if (c == '*') { 8901 v = getnextarg(args, arglen, &argidx); 8902 if (v == NULL) 8903 goto onError; 8904 if (!PyLong_Check(v)) { 8905 PyErr_SetString(PyExc_TypeError, 8906 "* wants int"); 8907 goto onError; 8908 } 8909 prec = PyLong_AsLong(v); 8910 if (prec == -1 && PyErr_Occurred()) 8911 goto onError; 8912 if (prec < 0) 8913 prec = 0; 8914 if (--fmtcnt >= 0) 8915 c = *fmt++; 8916 } 8917 else if (c >= '0' && c <= '9') { 8918 prec = c - '0'; 8919 while (--fmtcnt >= 0) { 8920 c = Py_CHARMASK(*fmt++); 8921 if (c < '0' || c > '9') 8922 break; 8923 if ((prec*10) / 10 != prec) { 8924 PyErr_SetString(PyExc_ValueError, 8925 "prec too big"); 8926 goto onError; 8927 } 8928 prec = prec*10 + (c - '0'); 8929 } 8930 } 8931 } /* prec */ 8932 if (fmtcnt >= 0) { 8933 if (c == 'h' || c == 'l' || c == 'L') { 8934 if (--fmtcnt >= 0) 8935 c = *fmt++; 8936 } 8937 } 8938 if (fmtcnt < 0) { 8939 PyErr_SetString(PyExc_ValueError, 8940 "incomplete format"); 8941 goto onError; 8942 } 8943 if (c != '%') { 8944 v = getnextarg(args, arglen, &argidx); 8945 if (v == NULL) 8946 goto onError; 8947 } 8948 sign = 0; 8949 fill = ' '; 8950 switch (c) { 8951 8952 case '%': 8953 pbuf = formatbuf; 8954 /* presume that buffer length is at least 1 */ 8955 pbuf[0] = '%'; 8956 len = 1; 8957 break; 8958 8959 case 's': 8960 case 'r': 8961 if (PyUnicode_Check(v) && c == 's') { 8962 temp = v; 8963 Py_INCREF(temp); 8964 } 8965 else { 8966 if (c == 's') 8967 temp = PyObject_Str(v); 8968 else 8969 temp = PyObject_Repr(v); 8970 if (temp == NULL) 8971 goto onError; 8972 if (PyUnicode_Check(temp)) 8973 /* nothing to do */; 8974 else { 8975 Py_DECREF(temp); 8976 PyErr_SetString(PyExc_TypeError, 8977 "%s argument has non-string str()"); 8978 goto onError; 8979 } 8980 } 8981 pbuf = PyUnicode_AS_UNICODE(temp); 8982 len = PyUnicode_GET_SIZE(temp); 8983 if (prec >= 0 && len > prec) 8984 len = prec; 8985 break; 8986 8987 case 'i': 8988 case 'd': 8989 case 'u': 8990 case 'o': 8991 case 'x': 8992 case 'X': 8993 if (c == 'i') 8994 c = 'd'; 8995 isnumok = 0; 8996 if (PyNumber_Check(v)) { 8997 PyObject *iobj=NULL; 8998 8999 if (PyLong_Check(v)) { 9000 iobj = v; 9001 Py_INCREF(iobj); 9002 } 9003 else { 9004 iobj = PyNumber_Long(v); 9005 } 9006 if (iobj!=NULL) { 9007 if (PyLong_Check(iobj)) { 9008 isnumok = 1; 9009 temp = formatlong(iobj, flags, prec, c); 9010 Py_DECREF(iobj); 9011 if (!temp) 9012 goto onError; 9013 pbuf = PyUnicode_AS_UNICODE(temp); 9014 len = PyUnicode_GET_SIZE(temp); 9015 sign = 1; 9016 } 9017 else { 9018 Py_DECREF(iobj); 9019 } 9020 } 9021 } 9022 if (!isnumok) { 9023 PyErr_Format(PyExc_TypeError, 9024 "%%%c format: a number is required, " 9025 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9026 goto onError; 9027 } 9028 if (flags & F_ZERO) 9029 fill = '0'; 9030 break; 9031 9032 case 'e': 9033 case 'E': 9034 case 'f': 9035 case 'F': 9036 case 'g': 9037 case 'G': 9038 if (c == 'F') 9039 c = 'f'; 9040 pbuf = formatbuf; 9041 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 9042 flags, prec, c, v); 9043 if (len < 0) 9044 goto onError; 9045 sign = 1; 9046 if (flags & F_ZERO) 9047 fill = '0'; 9048 break; 9049 9050 case 'c': 9051 pbuf = formatbuf; 9052 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9053 if (len < 0) 9054 goto onError; 9055 break; 9056 9057 default: 9058 PyErr_Format(PyExc_ValueError, 9059 "unsupported format character '%c' (0x%x) " 9060 "at index %zd", 9061 (31<=c && c<=126) ? (char)c : '?', 9062 (int)c, 9063 (Py_ssize_t)(fmt - 1 - 9064 PyUnicode_AS_UNICODE(uformat))); 9065 goto onError; 9066 } 9067 if (sign) { 9068 if (*pbuf == '-' || *pbuf == '+') { 9069 sign = *pbuf++; 9070 len--; 9071 } 9072 else if (flags & F_SIGN) 9073 sign = '+'; 9074 else if (flags & F_BLANK) 9075 sign = ' '; 9076 else 9077 sign = 0; 9078 } 9079 if (width < len) 9080 width = len; 9081 if (rescnt - (sign != 0) < width) { 9082 reslen -= rescnt; 9083 rescnt = width + fmtcnt + 100; 9084 reslen += rescnt; 9085 if (reslen < 0) { 9086 Py_XDECREF(temp); 9087 PyErr_NoMemory(); 9088 goto onError; 9089 } 9090 if (_PyUnicode_Resize(&result, reslen) < 0) { 9091 Py_XDECREF(temp); 9092 goto onError; 9093 } 9094 res = PyUnicode_AS_UNICODE(result) 9095 + reslen - rescnt; 9096 } 9097 if (sign) { 9098 if (fill != ' ') 9099 *res++ = sign; 9100 rescnt--; 9101 if (width > len) 9102 width--; 9103 } 9104 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9105 assert(pbuf[0] == '0'); 9106 assert(pbuf[1] == c); 9107 if (fill != ' ') { 9108 *res++ = *pbuf++; 9109 *res++ = *pbuf++; 9110 } 9111 rescnt -= 2; 9112 width -= 2; 9113 if (width < 0) 9114 width = 0; 9115 len -= 2; 9116 } 9117 if (width > len && !(flags & F_LJUST)) { 9118 do { 9119 --rescnt; 9120 *res++ = fill; 9121 } while (--width > len); 9122 } 9123 if (fill == ' ') { 9124 if (sign) 9125 *res++ = sign; 9126 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9127 assert(pbuf[0] == '0'); 9128 assert(pbuf[1] == c); 9129 *res++ = *pbuf++; 9130 *res++ = *pbuf++; 9131 } 9132 } 9133 Py_UNICODE_COPY(res, pbuf, len); 9134 res += len; 9135 rescnt -= len; 9136 while (--width >= len) { 9137 --rescnt; 9138 *res++ = ' '; 9139 } 9140 if (dict && (argidx < arglen) && c != '%') { 9141 PyErr_SetString(PyExc_TypeError, 9142 "not all arguments converted during string formatting"); 9143 Py_XDECREF(temp); 9144 goto onError; 9145 } 9146 Py_XDECREF(temp); 9147 } /* '%' */ 9148 } /* until end */ 9149 if (argidx < arglen && !dict) { 9150 PyErr_SetString(PyExc_TypeError, 9151 "not all arguments converted during string formatting"); 9152 goto onError; 9153 } 9154 9155 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9156 goto onError; 9157 if (args_owned) { 9158 Py_DECREF(args); 9159 } 9160 Py_DECREF(uformat); 9161 return (PyObject *)result; 9162 9163 onError: 9164 Py_XDECREF(result); 9165 Py_DECREF(uformat); 9166 if (args_owned) { 9167 Py_DECREF(args); 9168 } 9169 return NULL; 9170} 9171 9172static PyObject * 9173unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9174 9175static PyObject * 9176unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9177{ 9178 PyObject *x = NULL; 9179 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9180 char *encoding = NULL; 9181 char *errors = NULL; 9182 9183 if (type != &PyUnicode_Type) 9184 return unicode_subtype_new(type, args, kwds); 9185 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9186 kwlist, &x, &encoding, &errors)) 9187 return NULL; 9188 if (x == NULL) 9189 return (PyObject *)_PyUnicode_New(0); 9190 if (encoding == NULL && errors == NULL) 9191 return PyObject_Str(x); 9192 else 9193 return PyUnicode_FromEncodedObject(x, encoding, errors); 9194} 9195 9196static PyObject * 9197unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9198{ 9199 PyUnicodeObject *tmp, *pnew; 9200 Py_ssize_t n; 9201 9202 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9203 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9204 if (tmp == NULL) 9205 return NULL; 9206 assert(PyUnicode_Check(tmp)); 9207 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9208 if (pnew == NULL) { 9209 Py_DECREF(tmp); 9210 return NULL; 9211 } 9212 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9213 if (pnew->str == NULL) { 9214 _Py_ForgetReference((PyObject *)pnew); 9215 PyObject_Del(pnew); 9216 Py_DECREF(tmp); 9217 return PyErr_NoMemory(); 9218 } 9219 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9220 pnew->length = n; 9221 pnew->hash = tmp->hash; 9222 Py_DECREF(tmp); 9223 return (PyObject *)pnew; 9224} 9225 9226PyDoc_STRVAR(unicode_doc, 9227"str(string[, encoding[, errors]]) -> str\n\ 9228\n\ 9229Create a new string object from the given encoded string.\n\ 9230encoding defaults to the current default string encoding.\n\ 9231errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9232 9233static PyObject *unicode_iter(PyObject *seq); 9234 9235PyTypeObject PyUnicode_Type = { 9236 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9237 "str", /* tp_name */ 9238 sizeof(PyUnicodeObject), /* tp_size */ 9239 0, /* tp_itemsize */ 9240 /* Slots */ 9241 (destructor)unicode_dealloc, /* tp_dealloc */ 9242 0, /* tp_print */ 9243 0, /* tp_getattr */ 9244 0, /* tp_setattr */ 9245 0, /* tp_compare */ 9246 unicode_repr, /* tp_repr */ 9247 &unicode_as_number, /* tp_as_number */ 9248 &unicode_as_sequence, /* tp_as_sequence */ 9249 &unicode_as_mapping, /* tp_as_mapping */ 9250 (hashfunc) unicode_hash, /* tp_hash*/ 9251 0, /* tp_call*/ 9252 (reprfunc) unicode_str, /* tp_str */ 9253 PyObject_GenericGetAttr, /* tp_getattro */ 9254 0, /* tp_setattro */ 9255 0, /* tp_as_buffer */ 9256 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9257 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9258 unicode_doc, /* tp_doc */ 9259 0, /* tp_traverse */ 9260 0, /* tp_clear */ 9261 PyUnicode_RichCompare, /* tp_richcompare */ 9262 0, /* tp_weaklistoffset */ 9263 unicode_iter, /* tp_iter */ 9264 0, /* tp_iternext */ 9265 unicode_methods, /* tp_methods */ 9266 0, /* tp_members */ 9267 0, /* tp_getset */ 9268 &PyBaseObject_Type, /* tp_base */ 9269 0, /* tp_dict */ 9270 0, /* tp_descr_get */ 9271 0, /* tp_descr_set */ 9272 0, /* tp_dictoffset */ 9273 0, /* tp_init */ 9274 0, /* tp_alloc */ 9275 unicode_new, /* tp_new */ 9276 PyObject_Del, /* tp_free */ 9277}; 9278 9279/* Initialize the Unicode implementation */ 9280 9281void _PyUnicode_Init(void) 9282{ 9283 int i; 9284 9285 /* XXX - move this array to unicodectype.c ? */ 9286 Py_UNICODE linebreak[] = { 9287 0x000A, /* LINE FEED */ 9288 0x000D, /* CARRIAGE RETURN */ 9289 0x001C, /* FILE SEPARATOR */ 9290 0x001D, /* GROUP SEPARATOR */ 9291 0x001E, /* RECORD SEPARATOR */ 9292 0x0085, /* NEXT LINE */ 9293 0x2028, /* LINE SEPARATOR */ 9294 0x2029, /* PARAGRAPH SEPARATOR */ 9295 }; 9296 9297 /* Init the implementation */ 9298 free_list = NULL; 9299 numfree = 0; 9300 unicode_empty = _PyUnicode_New(0); 9301 if (!unicode_empty) 9302 return; 9303 9304 for (i = 0; i < 256; i++) 9305 unicode_latin1[i] = NULL; 9306 if (PyType_Ready(&PyUnicode_Type) < 0) 9307 Py_FatalError("Can't initialize 'unicode'"); 9308 9309 /* initialize the linebreak bloom filter */ 9310 bloom_linebreak = make_bloom_mask( 9311 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9312 ); 9313 9314 PyType_Ready(&EncodingMapType); 9315} 9316 9317/* Finalize the Unicode implementation */ 9318 9319int 9320PyUnicode_ClearFreeList(void) 9321{ 9322 int freelist_size = numfree; 9323 PyUnicodeObject *u; 9324 9325 for (u = free_list; u != NULL;) { 9326 PyUnicodeObject *v = u; 9327 u = *(PyUnicodeObject **)u; 9328 if (v->str) 9329 PyObject_DEL(v->str); 9330 Py_XDECREF(v->defenc); 9331 PyObject_Del(v); 9332 numfree--; 9333 } 9334 free_list = NULL; 9335 assert(numfree == 0); 9336 return freelist_size; 9337} 9338 9339void 9340_PyUnicode_Fini(void) 9341{ 9342 int i; 9343 9344 Py_XDECREF(unicode_empty); 9345 unicode_empty = NULL; 9346 9347 for (i = 0; i < 256; i++) { 9348 if (unicode_latin1[i]) { 9349 Py_DECREF(unicode_latin1[i]); 9350 unicode_latin1[i] = NULL; 9351 } 9352 } 9353 (void)PyUnicode_ClearFreeList(); 9354} 9355 9356void 9357PyUnicode_InternInPlace(PyObject **p) 9358{ 9359 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9360 PyObject *t; 9361 if (s == NULL || !PyUnicode_Check(s)) 9362 Py_FatalError( 9363 "PyUnicode_InternInPlace: unicode strings only please!"); 9364 /* If it's a subclass, we don't really know what putting 9365 it in the interned dict might do. */ 9366 if (!PyUnicode_CheckExact(s)) 9367 return; 9368 if (PyUnicode_CHECK_INTERNED(s)) 9369 return; 9370 if (interned == NULL) { 9371 interned = PyDict_New(); 9372 if (interned == NULL) { 9373 PyErr_Clear(); /* Don't leave an exception */ 9374 return; 9375 } 9376 } 9377 /* It might be that the GetItem call fails even 9378 though the key is present in the dictionary, 9379 namely when this happens during a stack overflow. */ 9380 Py_ALLOW_RECURSION 9381 t = PyDict_GetItem(interned, (PyObject *)s); 9382 Py_END_ALLOW_RECURSION 9383 9384 if (t) { 9385 Py_INCREF(t); 9386 Py_DECREF(*p); 9387 *p = t; 9388 return; 9389 } 9390 9391 PyThreadState_GET()->recursion_critical = 1; 9392 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9393 PyErr_Clear(); 9394 PyThreadState_GET()->recursion_critical = 0; 9395 return; 9396 } 9397 PyThreadState_GET()->recursion_critical = 0; 9398 /* The two references in interned are not counted by refcnt. 9399 The deallocator will take care of this */ 9400 Py_REFCNT(s) -= 2; 9401 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9402} 9403 9404void 9405PyUnicode_InternImmortal(PyObject **p) 9406{ 9407 PyUnicode_InternInPlace(p); 9408 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9409 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9410 Py_INCREF(*p); 9411 } 9412} 9413 9414PyObject * 9415PyUnicode_InternFromString(const char *cp) 9416{ 9417 PyObject *s = PyUnicode_FromString(cp); 9418 if (s == NULL) 9419 return NULL; 9420 PyUnicode_InternInPlace(&s); 9421 return s; 9422} 9423 9424void _Py_ReleaseInternedUnicodeStrings(void) 9425{ 9426 PyObject *keys; 9427 PyUnicodeObject *s; 9428 Py_ssize_t i, n; 9429 Py_ssize_t immortal_size = 0, mortal_size = 0; 9430 9431 if (interned == NULL || !PyDict_Check(interned)) 9432 return; 9433 keys = PyDict_Keys(interned); 9434 if (keys == NULL || !PyList_Check(keys)) { 9435 PyErr_Clear(); 9436 return; 9437 } 9438 9439 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9440 detector, interned unicode strings are not forcibly deallocated; 9441 rather, we give them their stolen references back, and then clear 9442 and DECREF the interned dict. */ 9443 9444 n = PyList_GET_SIZE(keys); 9445 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9446 n); 9447 for (i = 0; i < n; i++) { 9448 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9449 switch (s->state) { 9450 case SSTATE_NOT_INTERNED: 9451 /* XXX Shouldn't happen */ 9452 break; 9453 case SSTATE_INTERNED_IMMORTAL: 9454 Py_REFCNT(s) += 1; 9455 immortal_size += s->length; 9456 break; 9457 case SSTATE_INTERNED_MORTAL: 9458 Py_REFCNT(s) += 2; 9459 mortal_size += s->length; 9460 break; 9461 default: 9462 Py_FatalError("Inconsistent interned string state."); 9463 } 9464 s->state = SSTATE_NOT_INTERNED; 9465 } 9466 fprintf(stderr, "total size of all interned strings: " 9467 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9468 "mortal/immortal\n", mortal_size, immortal_size); 9469 Py_DECREF(keys); 9470 PyDict_Clear(interned); 9471 Py_DECREF(interned); 9472 interned = NULL; 9473} 9474 9475 9476/********************* Unicode Iterator **************************/ 9477 9478typedef struct { 9479 PyObject_HEAD 9480 Py_ssize_t it_index; 9481 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9482} unicodeiterobject; 9483 9484static void 9485unicodeiter_dealloc(unicodeiterobject *it) 9486{ 9487 _PyObject_GC_UNTRACK(it); 9488 Py_XDECREF(it->it_seq); 9489 PyObject_GC_Del(it); 9490} 9491 9492static int 9493unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9494{ 9495 Py_VISIT(it->it_seq); 9496 return 0; 9497} 9498 9499static PyObject * 9500unicodeiter_next(unicodeiterobject *it) 9501{ 9502 PyUnicodeObject *seq; 9503 PyObject *item; 9504 9505 assert(it != NULL); 9506 seq = it->it_seq; 9507 if (seq == NULL) 9508 return NULL; 9509 assert(PyUnicode_Check(seq)); 9510 9511 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9512 item = PyUnicode_FromUnicode( 9513 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9514 if (item != NULL) 9515 ++it->it_index; 9516 return item; 9517 } 9518 9519 Py_DECREF(seq); 9520 it->it_seq = NULL; 9521 return NULL; 9522} 9523 9524static PyObject * 9525unicodeiter_len(unicodeiterobject *it) 9526{ 9527 Py_ssize_t len = 0; 9528 if (it->it_seq) 9529 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9530 return PyLong_FromSsize_t(len); 9531} 9532 9533PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9534 9535static PyMethodDef unicodeiter_methods[] = { 9536 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9537 length_hint_doc}, 9538 {NULL, NULL} /* sentinel */ 9539}; 9540 9541PyTypeObject PyUnicodeIter_Type = { 9542 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9543 "str_iterator", /* tp_name */ 9544 sizeof(unicodeiterobject), /* tp_basicsize */ 9545 0, /* tp_itemsize */ 9546 /* methods */ 9547 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9548 0, /* tp_print */ 9549 0, /* tp_getattr */ 9550 0, /* tp_setattr */ 9551 0, /* tp_compare */ 9552 0, /* tp_repr */ 9553 0, /* tp_as_number */ 9554 0, /* tp_as_sequence */ 9555 0, /* tp_as_mapping */ 9556 0, /* tp_hash */ 9557 0, /* tp_call */ 9558 0, /* tp_str */ 9559 PyObject_GenericGetAttr, /* tp_getattro */ 9560 0, /* tp_setattro */ 9561 0, /* tp_as_buffer */ 9562 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9563 0, /* tp_doc */ 9564 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9565 0, /* tp_clear */ 9566 0, /* tp_richcompare */ 9567 0, /* tp_weaklistoffset */ 9568 PyObject_SelfIter, /* tp_iter */ 9569 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9570 unicodeiter_methods, /* tp_methods */ 9571 0, 9572}; 9573 9574static PyObject * 9575unicode_iter(PyObject *seq) 9576{ 9577 unicodeiterobject *it; 9578 9579 if (!PyUnicode_Check(seq)) { 9580 PyErr_BadInternalCall(); 9581 return NULL; 9582 } 9583 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9584 if (it == NULL) 9585 return NULL; 9586 it->it_index = 0; 9587 Py_INCREF(seq); 9588 it->it_seq = (PyUnicodeObject *)seq; 9589 _PyObject_GC_TRACK(it); 9590 return (PyObject *)it; 9591} 9592 9593size_t 9594Py_UNICODE_strlen(const Py_UNICODE *u) 9595{ 9596 int res = 0; 9597 while(*u++) 9598 res++; 9599 return res; 9600} 9601 9602Py_UNICODE* 9603Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9604{ 9605 Py_UNICODE *u = s1; 9606 while ((*u++ = *s2++)); 9607 return s1; 9608} 9609 9610Py_UNICODE* 9611Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9612{ 9613 Py_UNICODE *u = s1; 9614 while ((*u++ = *s2++)) 9615 if (n-- == 0) 9616 break; 9617 return s1; 9618} 9619 9620int 9621Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9622{ 9623 while (*s1 && *s2 && *s1 == *s2) 9624 s1++, s2++; 9625 if (*s1 && *s2) 9626 return (*s1 < *s2) ? -1 : +1; 9627 if (*s1) 9628 return 1; 9629 if (*s2) 9630 return -1; 9631 return 0; 9632} 9633 9634Py_UNICODE* 9635Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9636{ 9637 const Py_UNICODE *p; 9638 for (p = s; *p; p++) 9639 if (*p == c) 9640 return (Py_UNICODE*)p; 9641 return NULL; 9642} 9643 9644 9645#ifdef __cplusplus 9646} 9647#endif 9648 9649 9650/* 9651Local variables: 9652c-basic-offset: 4 9653indent-tabs-mode: nil 9654End: 9655*/ 9656