/* unicodeobject.c revision 53516a82df8db500a968451daa54fc72eaed7056 */
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "ucnhash.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Limit for the Unicode object free list */ 51 52#define PyUnicode_MAXFREELIST 1024 53 54/* Limit for the Unicode object free list stay alive optimization. 55 56 The implementation will keep allocated Unicode memory intact for 57 all objects on the free list having a size less than this 58 limit. This reduces malloc() overhead for small Unicode objects. 59 60 At worst this will result in PyUnicode_MAXFREELIST * 61 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 62 malloc()-overhead) bytes of unused garbage. 63 64 Setting the limit to 0 effectively turns the feature off. 65 66 Note: This is an experimental feature ! If you get core dumps when 67 using Unicode objects, turn this feature off. 68 69*/ 70 71#define KEEPALIVE_SIZE_LIMIT 9 72 73/* Endianness switches; defaults to little endian */ 74 75#ifdef WORDS_BIGENDIAN 76# define BYTEORDER_IS_BIG_ENDIAN 77#else 78# define BYTEORDER_IS_LITTLE_ENDIAN 79#endif 80 81/* --- Globals ------------------------------------------------------------ 82 83 The globals are initialized by the _PyUnicode_Init() API and should 84 not be used before calling that API. 85 86*/ 87 88 89#ifdef __cplusplus 90extern "C" { 91#endif 92 93/* This dictionary holds all interned unicode strings. Note that references 94 to strings in this dictionary are *not* counted in the string's ob_refcnt. 95 When the interned string reaches a refcnt of 0 the string deallocation 96 function will delete the reference from this dictionary. 97 98 Another way to look at this is that to say that the actual reference 99 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 100*/ 101static PyObject *interned; 102 103/* Free list for Unicode objects */ 104static PyUnicodeObject *free_list; 105static int numfree; 106 107/* The empty Unicode object is shared to improve performance. */ 108static PyUnicodeObject *unicode_empty; 109 110/* Single character Unicode strings in the Latin-1 range are being 111 shared as well. */ 112static PyUnicodeObject *unicode_latin1[256]; 113 114/* Fast detection of the most frequent whitespace characters */ 115const unsigned char _Py_ascii_whitespace[] = { 116 0, 0, 0, 0, 0, 0, 0, 0, 117/* case 0x0009: * CHARACTER TABULATION */ 118/* case 0x000A: * LINE FEED */ 119/* case 0x000B: * LINE TABULATION */ 120/* case 0x000C: * FORM FEED */ 121/* case 0x000D: * CARRIAGE RETURN */ 122 0, 1, 1, 1, 1, 1, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 124/* case 0x001C: * FILE SEPARATOR */ 125/* case 0x001D: * GROUP SEPARATOR */ 126/* case 0x001E: * RECORD SEPARATOR */ 127/* case 0x001F: * UNIT SEPARATOR */ 128 0, 0, 0, 0, 1, 1, 1, 1, 129/* case 0x0020: * SPACE */ 130 1, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 0, 0, 0, 0, 0, 0, 0, 134 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0 143}; 144 145static PyObject *unicode_encode_call_errorhandler(const char *errors, 146 PyObject **errorHandler,const char *encoding, const char *reason, 147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 149 150static void raise_encode_exception(PyObject **exceptionObject, 151 const char *encoding, 152 const Py_UNICODE *unicode, Py_ssize_t size, 153 Py_ssize_t startpos, Py_ssize_t endpos, 154 const char *reason); 155 156/* Same for linebreaks */ 157static unsigned char ascii_linebreak[] = { 158 0, 0, 0, 0, 0, 0, 0, 0, 159/* 0x000A, * LINE FEED 
*/ 160/* 0x000B, * LINE TABULATION */ 161/* 0x000C, * FORM FEED */ 162/* 0x000D, * CARRIAGE RETURN */ 163 0, 0, 1, 1, 1, 1, 0, 0, 164 0, 0, 0, 0, 0, 0, 0, 0, 165/* 0x001C, * FILE SEPARATOR */ 166/* 0x001D, * GROUP SEPARATOR */ 167/* 0x001E, * RECORD SEPARATOR */ 168 0, 0, 0, 0, 1, 1, 1, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0 182}; 183 184 185Py_UNICODE 186PyUnicode_GetMax(void) 187{ 188#ifdef Py_UNICODE_WIDE 189 return 0x10FFFF; 190#else 191 /* This is actually an illegal character, so it should 192 not be passed to unichr. */ 193 return 0xFFFF; 194#endif 195} 196 197/* --- Bloom Filters ----------------------------------------------------- */ 198 199/* stuff to implement simple "bloom filters" for Unicode characters. 200 to keep things simple, we use a single bitmask, using the least 5 201 bits from each unicode characters as the bit index. */ 202 203/* the linebreak mask is set up by Unicode_Init below */ 204 205#if LONG_BIT >= 128 206#define BLOOM_WIDTH 128 207#elif LONG_BIT >= 64 208#define BLOOM_WIDTH 64 209#elif LONG_BIT >= 32 210#define BLOOM_WIDTH 32 211#else 212#error "LONG_BIT is smaller than 32" 213#endif 214 215#define BLOOM_MASK unsigned long 216 217static BLOOM_MASK bloom_linebreak; 218 219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 221 222#define BLOOM_LINEBREAK(ch) \ 223 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 225 226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 227{ 228 /* calculate simple bloom-style bitmask for a given unicode string */ 229 230 BLOOM_MASK mask; 231 Py_ssize_t i; 232 233 mask = 0; 234 for (i = 0; i < len; i++) 235 BLOOM_ADD(mask, ptr[i]); 236 237 return mask; 238} 239 240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 241{ 242 Py_ssize_t i; 243 244 for (i = 0; i < setlen; i++) 245 if (set[i] == chr) 246 return 1; 247 248 return 0; 249} 250 251#define BLOOM_MEMBER(mask, chr, set, setlen) \ 252 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 253 254/* --- Unicode Object ----------------------------------------------------- */ 255 256static 257int unicode_resize(register PyUnicodeObject *unicode, 258 Py_ssize_t length) 259{ 260 void *oldstr; 261 262 /* Shortcut if there's nothing much to do. */ 263 if (unicode->length == length) 264 goto reset; 265 266 /* Resizing shared object (unicode_empty or single character 267 objects) in-place is not allowed. Use PyUnicode_Resize() 268 instead ! */ 269 270 if (unicode == unicode_empty || 271 (unicode->length == 1 && 272 unicode->str[0] < 256U && 273 unicode_latin1[unicode->str[0]] == unicode)) { 274 PyErr_SetString(PyExc_SystemError, 275 "can't resize shared str objects"); 276 return -1; 277 } 278 279 /* We allocate one more byte to make sure the string is Ux0000 terminated. 280 The overallocation is also used by fastsearch, which assumes that it's 281 safe to look at str[length] (without making any assumptions about what 282 it contains). 
*/ 283 284 oldstr = unicode->str; 285 unicode->str = PyObject_REALLOC(unicode->str, 286 sizeof(Py_UNICODE) * (length + 1)); 287 if (!unicode->str) { 288 unicode->str = (Py_UNICODE *)oldstr; 289 PyErr_NoMemory(); 290 return -1; 291 } 292 unicode->str[length] = 0; 293 unicode->length = length; 294 295 reset: 296 /* Reset the object caches */ 297 if (unicode->defenc) { 298 Py_CLEAR(unicode->defenc); 299 } 300 unicode->hash = -1; 301 302 return 0; 303} 304 305/* We allocate one more byte to make sure the string is 306 Ux0000 terminated; some code (e.g. new_identifier) 307 relies on that. 308 309 XXX This allocator could further be enhanced by assuring that the 310 free list never reduces its size below 1. 311 312*/ 313 314static 315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 316{ 317 register PyUnicodeObject *unicode; 318 319 /* Optimization for empty strings */ 320 if (length == 0 && unicode_empty != NULL) { 321 Py_INCREF(unicode_empty); 322 return unicode_empty; 323 } 324 325 /* Ensure we won't overflow the size. */ 326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 327 return (PyUnicodeObject *)PyErr_NoMemory(); 328 } 329 330 /* Unicode freelist & memory allocation */ 331 if (free_list) { 332 unicode = free_list; 333 free_list = *(PyUnicodeObject **)unicode; 334 numfree--; 335 if (unicode->str) { 336 /* Keep-Alive optimization: we only upsize the buffer, 337 never downsize it. 
*/ 338 if ((unicode->length < length) && 339 unicode_resize(unicode, length) < 0) { 340 PyObject_DEL(unicode->str); 341 unicode->str = NULL; 342 } 343 } 344 else { 345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 347 } 348 PyObject_INIT(unicode, &PyUnicode_Type); 349 } 350 else { 351 size_t new_size; 352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 353 if (unicode == NULL) 354 return NULL; 355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 357 } 358 359 if (!unicode->str) { 360 PyErr_NoMemory(); 361 goto onError; 362 } 363 /* Initialize the first element to guard against cases where 364 * the caller fails before initializing str -- unicode_resize() 365 * reads str[0], and the Keep-Alive optimization can keep memory 366 * allocated for str alive across a call to unicode_dealloc(unicode). 367 * We don't want unicode_resize to read uninitialized memory in 368 * that case. 
369 */ 370 unicode->str[0] = 0; 371 unicode->str[length] = 0; 372 unicode->length = length; 373 unicode->hash = -1; 374 unicode->state = 0; 375 unicode->defenc = NULL; 376 return unicode; 377 378 onError: 379 /* XXX UNREF/NEWREF interface should be more symmetrical */ 380 _Py_DEC_REFTOTAL; 381 _Py_ForgetReference((PyObject *)unicode); 382 PyObject_Del(unicode); 383 return NULL; 384} 385 386static 387void unicode_dealloc(register PyUnicodeObject *unicode) 388{ 389 switch (PyUnicode_CHECK_INTERNED(unicode)) { 390 case SSTATE_NOT_INTERNED: 391 break; 392 393 case SSTATE_INTERNED_MORTAL: 394 /* revive dead object temporarily for DelItem */ 395 Py_REFCNT(unicode) = 3; 396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 397 Py_FatalError( 398 "deletion of interned string failed"); 399 break; 400 401 case SSTATE_INTERNED_IMMORTAL: 402 Py_FatalError("Immortal interned string died."); 403 404 default: 405 Py_FatalError("Inconsistent interned string state."); 406 } 407 408 if (PyUnicode_CheckExact(unicode) && 409 numfree < PyUnicode_MAXFREELIST) { 410 /* Keep-Alive optimization */ 411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 412 PyObject_DEL(unicode->str); 413 unicode->str = NULL; 414 unicode->length = 0; 415 } 416 if (unicode->defenc) { 417 Py_CLEAR(unicode->defenc); 418 } 419 /* Add to free list */ 420 *(PyUnicodeObject **)unicode = free_list; 421 free_list = unicode; 422 numfree++; 423 } 424 else { 425 PyObject_DEL(unicode->str); 426 Py_XDECREF(unicode->defenc); 427 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 428 } 429} 430 431static 432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 433{ 434 register PyUnicodeObject *v; 435 436 /* Argument checks */ 437 if (unicode == NULL) { 438 PyErr_BadInternalCall(); 439 return -1; 440 } 441 v = *unicode; 442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 443 PyErr_BadInternalCall(); 444 return -1; 445 } 446 447 /* Resizing unicode_empty and single character objects is 
not 448 possible since these are being shared. We simply return a fresh 449 copy with the same Unicode content. */ 450 if (v->length != length && 451 (v == unicode_empty || v->length == 1)) { 452 PyUnicodeObject *w = _PyUnicode_New(length); 453 if (w == NULL) 454 return -1; 455 Py_UNICODE_COPY(w->str, v->str, 456 length < v->length ? length : v->length); 457 Py_DECREF(*unicode); 458 *unicode = w; 459 return 0; 460 } 461 462 /* Note that we don't have to modify *unicode for unshared Unicode 463 objects, since we can modify them in-place. */ 464 return unicode_resize(v, length); 465} 466 467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 468{ 469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 470} 471 472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 473 Py_ssize_t size) 474{ 475 PyUnicodeObject *unicode; 476 477 /* If the Unicode data is known at construction time, we can apply 478 some optimizations which share commonly used objects. */ 479 if (u != NULL) { 480 481 /* Optimization for empty strings */ 482 if (size == 0 && unicode_empty != NULL) { 483 Py_INCREF(unicode_empty); 484 return (PyObject *)unicode_empty; 485 } 486 487 /* Single character Unicode objects in the Latin-1 range are 488 shared when using this constructor */ 489 if (size == 1 && *u < 256) { 490 unicode = unicode_latin1[*u]; 491 if (!unicode) { 492 unicode = _PyUnicode_New(1); 493 if (!unicode) 494 return NULL; 495 unicode->str[0] = *u; 496 unicode_latin1[*u] = unicode; 497 } 498 Py_INCREF(unicode); 499 return (PyObject *)unicode; 500 } 501 } 502 503 unicode = _PyUnicode_New(size); 504 if (!unicode) 505 return NULL; 506 507 /* Copy the Unicode data into the new object */ 508 if (u != NULL) 509 Py_UNICODE_COPY(unicode->str, u, size); 510 511 return (PyObject *)unicode; 512} 513 514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 515{ 516 PyUnicodeObject *unicode; 517 518 if (size < 0) { 519 PyErr_SetString(PyExc_SystemError, 520 "Negative size 
passed to PyUnicode_FromStringAndSize"); 521 return NULL; 522 } 523 524 /* If the Unicode data is known at construction time, we can apply 525 some optimizations which share commonly used objects. 526 Also, this means the input must be UTF-8, so fall back to the 527 UTF-8 decoder at the end. */ 528 if (u != NULL) { 529 530 /* Optimization for empty strings */ 531 if (size == 0 && unicode_empty != NULL) { 532 Py_INCREF(unicode_empty); 533 return (PyObject *)unicode_empty; 534 } 535 536 /* Single characters are shared when using this constructor. 537 Restrict to ASCII, since the input must be UTF-8. */ 538 if (size == 1 && Py_CHARMASK(*u) < 128) { 539 unicode = unicode_latin1[Py_CHARMASK(*u)]; 540 if (!unicode) { 541 unicode = _PyUnicode_New(1); 542 if (!unicode) 543 return NULL; 544 unicode->str[0] = Py_CHARMASK(*u); 545 unicode_latin1[Py_CHARMASK(*u)] = unicode; 546 } 547 Py_INCREF(unicode); 548 return (PyObject *)unicode; 549 } 550 551 return PyUnicode_DecodeUTF8(u, size, NULL); 552 } 553 554 unicode = _PyUnicode_New(size); 555 if (!unicode) 556 return NULL; 557 558 return (PyObject *)unicode; 559} 560 561PyObject *PyUnicode_FromString(const char *u) 562{ 563 size_t size = strlen(u); 564 if (size > PY_SSIZE_T_MAX) { 565 PyErr_SetString(PyExc_OverflowError, "input too long"); 566 return NULL; 567 } 568 569 return PyUnicode_FromStringAndSize(u, size); 570} 571 572#ifdef HAVE_WCHAR_H 573 574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 575# define CONVERT_WCHAR_TO_SURROGATES 576#endif 577 578#ifdef CONVERT_WCHAR_TO_SURROGATES 579 580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 581 to convert from UTF32 to UTF16. 
*/ 582 583PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 584 Py_ssize_t size) 585{ 586 PyUnicodeObject *unicode; 587 register Py_ssize_t i; 588 Py_ssize_t alloc; 589 const wchar_t *orig_w; 590 591 if (w == NULL) { 592 if (size == 0) 593 return PyUnicode_FromStringAndSize(NULL, 0); 594 PyErr_BadInternalCall(); 595 return NULL; 596 } 597 598 if (size == -1) { 599 size = wcslen(w); 600 } 601 602 alloc = size; 603 orig_w = w; 604 for (i = size; i > 0; i--) { 605 if (*w > 0xFFFF) 606 alloc++; 607 w++; 608 } 609 w = orig_w; 610 unicode = _PyUnicode_New(alloc); 611 if (!unicode) 612 return NULL; 613 614 /* Copy the wchar_t data into the new object */ 615 { 616 register Py_UNICODE *u; 617 u = PyUnicode_AS_UNICODE(unicode); 618 for (i = size; i > 0; i--) { 619 if (*w > 0xFFFF) { 620 wchar_t ordinal = *w++; 621 ordinal -= 0x10000; 622 *u++ = 0xD800 | (ordinal >> 10); 623 *u++ = 0xDC00 | (ordinal & 0x3FF); 624 } 625 else 626 *u++ = *w++; 627 } 628 } 629 return (PyObject *)unicode; 630} 631 632#else 633 634PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 635 Py_ssize_t size) 636{ 637 PyUnicodeObject *unicode; 638 639 if (w == NULL) { 640 if (size == 0) 641 return PyUnicode_FromStringAndSize(NULL, 0); 642 PyErr_BadInternalCall(); 643 return NULL; 644 } 645 646 if (size == -1) { 647 size = wcslen(w); 648 } 649 650 unicode = _PyUnicode_New(size); 651 if (!unicode) 652 return NULL; 653 654 /* Copy the wchar_t data into the new object */ 655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 656 memcpy(unicode->str, w, size * sizeof(wchar_t)); 657#else 658 { 659 register Py_UNICODE *u; 660 register Py_ssize_t i; 661 u = PyUnicode_AS_UNICODE(unicode); 662 for (i = size; i > 0; i--) 663 *u++ = *w++; 664 } 665#endif 666 667 return (PyObject *)unicode; 668} 669 670#endif /* CONVERT_WCHAR_TO_SURROGATES */ 671 672#undef CONVERT_WCHAR_TO_SURROGATES 673 674static void 675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 676 int zeropad, int width, int precision, 
char c) 677{ 678 *fmt++ = '%'; 679 if (width) { 680 if (zeropad) 681 *fmt++ = '0'; 682 fmt += sprintf(fmt, "%d", width); 683 } 684 if (precision) 685 fmt += sprintf(fmt, ".%d", precision); 686 if (longflag) 687 *fmt++ = 'l'; 688 else if (longlongflag) { 689 /* longlongflag should only ever be nonzero on machines with 690 HAVE_LONG_LONG defined */ 691#ifdef HAVE_LONG_LONG 692 char *f = PY_FORMAT_LONG_LONG; 693 while (*f) 694 *fmt++ = *f++; 695#else 696 /* we shouldn't ever get here */ 697 assert(0); 698 *fmt++ = 'l'; 699#endif 700 } 701 else if (size_tflag) { 702 char *f = PY_FORMAT_SIZE_T; 703 while (*f) 704 *fmt++ = *f++; 705 } 706 *fmt++ = c; 707 *fmt = '\0'; 708} 709 710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 711 712/* size of fixed-size buffer for formatting single arguments */ 713#define ITEM_BUFFER_LEN 21 714/* maximum number of characters required for output of %ld. 21 characters 715 allows for 64-bit integers (in decimal) and an optional sign. */ 716#define MAX_LONG_CHARS 21 717/* maximum number of characters required for output of %lld. 718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 721 722PyObject * 723PyUnicode_FromFormatV(const char *format, va_list vargs) 724{ 725 va_list count; 726 Py_ssize_t callcount = 0; 727 PyObject **callresults = NULL; 728 PyObject **callresult = NULL; 729 Py_ssize_t n = 0; 730 int width = 0; 731 int precision = 0; 732 int zeropad; 733 const char* f; 734 Py_UNICODE *s; 735 PyObject *string; 736 /* used by sprintf */ 737 char buffer[ITEM_BUFFER_LEN+1]; 738 /* use abuffer instead of buffer, if we need more space 739 * (which can happen if there's a format specifier with width). 
*/ 740 char *abuffer = NULL; 741 char *realbuffer; 742 Py_ssize_t abuffersize = 0; 743 char fmt[61]; /* should be enough for %0width.precisionlld */ 744 const char *copy; 745 746 Py_VA_COPY(count, vargs); 747 /* step 1: count the number of %S/%R/%A/%s format specifications 748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 750 * result in an array) */ 751 for (f = format; *f; f++) { 752 if (*f == '%') { 753 if (*(f+1)=='%') 754 continue; 755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V') 756 ++callcount; 757 while (Py_ISDIGIT((unsigned)*f)) 758 width = (width*10) + *f++ - '0'; 759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) 760 ; 761 if (*f == 's') 762 ++callcount; 763 } 764 else if (128 <= (unsigned char)*f) { 765 PyErr_Format(PyExc_ValueError, 766 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 767 "string, got a non-ASCII byte: 0x%02x", 768 (unsigned char)*f); 769 return NULL; 770 } 771 } 772 /* step 2: allocate memory for the results of 773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 774 if (callcount) { 775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 776 if (!callresults) { 777 PyErr_NoMemory(); 778 return NULL; 779 } 780 callresult = callresults; 781 } 782 /* step 3: figure out how large a buffer we need */ 783 for (f = format; *f; f++) { 784 if (*f == '%') { 785#ifdef HAVE_LONG_LONG 786 int longlongflag = 0; 787#endif 788 const char* p = f; 789 width = 0; 790 while (Py_ISDIGIT((unsigned)*f)) 791 width = (width*10) + *f++ - '0'; 792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) 793 ; 794 795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 796 * they don't affect the amount of space we reserve. 
797 */ 798 if (*f == 'l') { 799 if (f[1] == 'd' || f[1] == 'u') { 800 ++f; 801 } 802#ifdef HAVE_LONG_LONG 803 else if (f[1] == 'l' && 804 (f[2] == 'd' || f[2] == 'u')) { 805 longlongflag = 1; 806 f += 2; 807 } 808#endif 809 } 810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 811 ++f; 812 } 813 814 switch (*f) { 815 case 'c': 816 { 817#ifndef Py_UNICODE_WIDE 818 int ordinal = va_arg(count, int); 819 if (ordinal > 0xffff) 820 n += 2; 821 else 822 n++; 823#else 824 (void)va_arg(count, int); 825 n++; 826#endif 827 break; 828 } 829 case '%': 830 n++; 831 break; 832 case 'd': case 'u': case 'i': case 'x': 833 (void) va_arg(count, int); 834#ifdef HAVE_LONG_LONG 835 if (longlongflag) { 836 if (width < MAX_LONG_LONG_CHARS) 837 width = MAX_LONG_LONG_CHARS; 838 } 839 else 840#endif 841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 842 including sign. Decimal takes the most space. This 843 isn't enough for octal. If a width is specified we 844 need more (which we allocate later). */ 845 if (width < MAX_LONG_CHARS) 846 width = MAX_LONG_CHARS; 847 n += width; 848 /* XXX should allow for large precision here too. 
*/ 849 if (abuffersize < width) 850 abuffersize = width; 851 break; 852 case 's': 853 { 854 /* UTF-8 */ 855 const char *s = va_arg(count, const char*); 856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 857 if (!str) 858 goto fail; 859 n += PyUnicode_GET_SIZE(str); 860 /* Remember the str and switch to the next slot */ 861 *callresult++ = str; 862 break; 863 } 864 case 'U': 865 { 866 PyObject *obj = va_arg(count, PyObject *); 867 assert(obj && PyUnicode_Check(obj)); 868 n += PyUnicode_GET_SIZE(obj); 869 break; 870 } 871 case 'V': 872 { 873 PyObject *obj = va_arg(count, PyObject *); 874 const char *str = va_arg(count, const char *); 875 PyObject *str_obj; 876 assert(obj || str); 877 assert(!obj || PyUnicode_Check(obj)); 878 if (obj) { 879 n += PyUnicode_GET_SIZE(obj); 880 *callresult++ = NULL; 881 } 882 else { 883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 884 if (!str_obj) 885 goto fail; 886 n += PyUnicode_GET_SIZE(str_obj); 887 *callresult++ = str_obj; 888 } 889 break; 890 } 891 case 'S': 892 { 893 PyObject *obj = va_arg(count, PyObject *); 894 PyObject *str; 895 assert(obj); 896 str = PyObject_Str(obj); 897 if (!str) 898 goto fail; 899 n += PyUnicode_GET_SIZE(str); 900 /* Remember the str and switch to the next slot */ 901 *callresult++ = str; 902 break; 903 } 904 case 'R': 905 { 906 PyObject *obj = va_arg(count, PyObject *); 907 PyObject *repr; 908 assert(obj); 909 repr = PyObject_Repr(obj); 910 if (!repr) 911 goto fail; 912 n += PyUnicode_GET_SIZE(repr); 913 /* Remember the repr and switch to the next slot */ 914 *callresult++ = repr; 915 break; 916 } 917 case 'A': 918 { 919 PyObject *obj = va_arg(count, PyObject *); 920 PyObject *ascii; 921 assert(obj); 922 ascii = PyObject_ASCII(obj); 923 if (!ascii) 924 goto fail; 925 n += PyUnicode_GET_SIZE(ascii); 926 /* Remember the repr and switch to the next slot */ 927 *callresult++ = ascii; 928 break; 929 } 930 case 'p': 931 (void) va_arg(count, int); 932 /* maximum 64-bit pointer 
representation: 933 * 0xffffffffffffffff 934 * so 19 characters is enough. 935 * XXX I count 18 -- what's the extra for? 936 */ 937 n += 19; 938 break; 939 default: 940 /* if we stumble upon an unknown 941 formatting code, copy the rest of 942 the format string to the output 943 string. (we cannot just skip the 944 code, since there's no way to know 945 what's in the argument list) */ 946 n += strlen(p); 947 goto expand; 948 } 949 } else 950 n++; 951 } 952 expand: 953 if (abuffersize > ITEM_BUFFER_LEN) { 954 /* add 1 for sprintf's trailing null byte */ 955 abuffer = PyObject_Malloc(abuffersize + 1); 956 if (!abuffer) { 957 PyErr_NoMemory(); 958 goto fail; 959 } 960 realbuffer = abuffer; 961 } 962 else 963 realbuffer = buffer; 964 /* step 4: fill the buffer */ 965 /* Since we've analyzed how much space we need for the worst case, 966 we don't have to resize the string. 967 There can be no errors beyond this point. */ 968 string = PyUnicode_FromUnicode(NULL, n); 969 if (!string) 970 goto fail; 971 972 s = PyUnicode_AS_UNICODE(string); 973 callresult = callresults; 974 975 for (f = format; *f; f++) { 976 if (*f == '%') { 977 const char* p = f++; 978 int longflag = 0; 979 int longlongflag = 0; 980 int size_tflag = 0; 981 zeropad = (*f == '0'); 982 /* parse the width.precision part */ 983 width = 0; 984 while (Py_ISDIGIT((unsigned)*f)) 985 width = (width*10) + *f++ - '0'; 986 precision = 0; 987 if (*f == '.') { 988 f++; 989 while (Py_ISDIGIT((unsigned)*f)) 990 precision = (precision*10) + *f++ - '0'; 991 } 992 /* Handle %ld, %lu, %lld and %llu. */ 993 if (*f == 'l') { 994 if (f[1] == 'd' || f[1] == 'u') { 995 longflag = 1; 996 ++f; 997 } 998#ifdef HAVE_LONG_LONG 999 else if (f[1] == 'l' && 1000 (f[2] == 'd' || f[2] == 'u')) { 1001 longlongflag = 1; 1002 f += 2; 1003 } 1004#endif 1005 } 1006 /* handle the size_t flag. 
*/ 1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 1008 size_tflag = 1; 1009 ++f; 1010 } 1011 1012 switch (*f) { 1013 case 'c': 1014 { 1015 int ordinal = va_arg(vargs, int); 1016#ifndef Py_UNICODE_WIDE 1017 if (ordinal > 0xffff) { 1018 ordinal -= 0x10000; 1019 *s++ = 0xD800 | (ordinal >> 10); 1020 *s++ = 0xDC00 | (ordinal & 0x3FF); 1021 } else 1022#endif 1023 *s++ = ordinal; 1024 break; 1025 } 1026 case 'd': 1027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1028 width, precision, 'd'); 1029 if (longflag) 1030 sprintf(realbuffer, fmt, va_arg(vargs, long)); 1031#ifdef HAVE_LONG_LONG 1032 else if (longlongflag) 1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1034#endif 1035 else if (size_tflag) 1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1037 else 1038 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1039 appendstring(realbuffer); 1040 break; 1041 case 'u': 1042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1043 width, precision, 'u'); 1044 if (longflag) 1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1046#ifdef HAVE_LONG_LONG 1047 else if (longlongflag) 1048 sprintf(realbuffer, fmt, va_arg(vargs, 1049 unsigned PY_LONG_LONG)); 1050#endif 1051 else if (size_tflag) 1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1053 else 1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1055 appendstring(realbuffer); 1056 break; 1057 case 'i': 1058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i'); 1059 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1060 appendstring(realbuffer); 1061 break; 1062 case 'x': 1063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1064 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1065 appendstring(realbuffer); 1066 break; 1067 case 's': 1068 { 1069 /* unused, since we already have the result */ 1070 (void) va_arg(vargs, char *); 1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1072 PyUnicode_GET_SIZE(*callresult)); 1073 s += 
PyUnicode_GET_SIZE(*callresult); 1074 /* We're done with the unicode()/repr() => forget it */ 1075 Py_DECREF(*callresult); 1076 /* switch to next unicode()/repr() result */ 1077 ++callresult; 1078 break; 1079 } 1080 case 'U': 1081 { 1082 PyObject *obj = va_arg(vargs, PyObject *); 1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1085 s += size; 1086 break; 1087 } 1088 case 'V': 1089 { 1090 PyObject *obj = va_arg(vargs, PyObject *); 1091 va_arg(vargs, const char *); 1092 if (obj) { 1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1095 s += size; 1096 } else { 1097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1098 PyUnicode_GET_SIZE(*callresult)); 1099 s += PyUnicode_GET_SIZE(*callresult); 1100 Py_DECREF(*callresult); 1101 } 1102 ++callresult; 1103 break; 1104 } 1105 case 'S': 1106 case 'R': 1107 case 'A': 1108 { 1109 Py_UNICODE *ucopy; 1110 Py_ssize_t usize; 1111 Py_ssize_t upos; 1112 /* unused, since we already have the result */ 1113 (void) va_arg(vargs, PyObject *); 1114 ucopy = PyUnicode_AS_UNICODE(*callresult); 1115 usize = PyUnicode_GET_SIZE(*callresult); 1116 for (upos = 0; upos<usize;) 1117 *s++ = ucopy[upos++]; 1118 /* We're done with the unicode()/repr() => forget it */ 1119 Py_DECREF(*callresult); 1120 /* switch to next unicode()/repr() result */ 1121 ++callresult; 1122 break; 1123 } 1124 case 'p': 1125 sprintf(buffer, "%p", va_arg(vargs, void*)); 1126 /* %p is ill-defined: ensure leading 0x. 
*/ 1127 if (buffer[1] == 'X') 1128 buffer[1] = 'x'; 1129 else if (buffer[1] != 'x') { 1130 memmove(buffer+2, buffer, strlen(buffer)+1); 1131 buffer[0] = '0'; 1132 buffer[1] = 'x'; 1133 } 1134 appendstring(buffer); 1135 break; 1136 case '%': 1137 *s++ = '%'; 1138 break; 1139 default: 1140 appendstring(p); 1141 goto end; 1142 } 1143 } 1144 else 1145 *s++ = *f; 1146 } 1147 1148 end: 1149 if (callresults) 1150 PyObject_Free(callresults); 1151 if (abuffer) 1152 PyObject_Free(abuffer); 1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1154 return string; 1155 fail: 1156 if (callresults) { 1157 PyObject **callresult2 = callresults; 1158 while (callresult2 < callresult) { 1159 Py_XDECREF(*callresult2); 1160 ++callresult2; 1161 } 1162 PyObject_Free(callresults); 1163 } 1164 if (abuffer) 1165 PyObject_Free(abuffer); 1166 return NULL; 1167} 1168 1169#undef appendstring 1170 1171PyObject * 1172PyUnicode_FromFormat(const char *format, ...) 1173{ 1174 PyObject* ret; 1175 va_list vargs; 1176 1177#ifdef HAVE_STDARG_PROTOTYPES 1178 va_start(vargs, format); 1179#else 1180 va_start(vargs); 1181#endif 1182 ret = PyUnicode_FromFormatV(format, vargs); 1183 va_end(vargs); 1184 return ret; 1185} 1186 1187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1188 convert a Unicode object to a wide character string. 1189 1190 - If w is NULL: return the number of wide characters (including the nul 1191 character) required to convert the unicode object. Ignore size argument. 1192 1193 - Otherwise: return the number of wide characters (excluding the nul 1194 character) written into w. Write at most size wide characters (including 1195 the nul character). 
*/ 1196static Py_ssize_t 1197unicode_aswidechar(PyUnicodeObject *unicode, 1198 wchar_t *w, 1199 Py_ssize_t size) 1200{ 1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1202 Py_ssize_t res; 1203 if (w != NULL) { 1204 res = PyUnicode_GET_SIZE(unicode); 1205 if (size > res) 1206 size = res + 1; 1207 else 1208 res = size; 1209 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1210 return res; 1211 } 1212 else 1213 return PyUnicode_GET_SIZE(unicode) + 1; 1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 1215 register const Py_UNICODE *u; 1216 const Py_UNICODE *uend; 1217 const wchar_t *worig, *wend; 1218 Py_ssize_t nchar; 1219 1220 u = PyUnicode_AS_UNICODE(unicode); 1221 uend = u + PyUnicode_GET_SIZE(unicode); 1222 if (w != NULL) { 1223 worig = w; 1224 wend = w + size; 1225 while (u != uend && w != wend) { 1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1228 { 1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; 1230 u += 2; 1231 } 1232 else { 1233 *w = *u; 1234 u++; 1235 } 1236 w++; 1237 } 1238 if (w != wend) 1239 *w = L'\0'; 1240 return w - worig; 1241 } 1242 else { 1243 nchar = 1; /* nul character at the end */ 1244 while (u != uend) { 1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1247 u += 2; 1248 else 1249 u++; 1250 nchar++; 1251 } 1252 } 1253 return nchar; 1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 1255 register Py_UNICODE *u, *uend, ordinal; 1256 register Py_ssize_t i; 1257 wchar_t *worig, *wend; 1258 Py_ssize_t nchar; 1259 1260 u = PyUnicode_AS_UNICODE(unicode); 1261 uend = u + PyUnicode_GET_SIZE(u); 1262 if (w != NULL) { 1263 worig = w; 1264 wend = w + size; 1265 while (u != uend && w != wend) { 1266 ordinal = *u; 1267 if (ordinal > 0xffff) { 1268 ordinal -= 0x10000; 1269 *w++ = 0xD800 | (ordinal >> 10); 1270 *w++ = 0xDC00 | (ordinal & 0x3FF); 1271 } 1272 else 1273 *w++ = ordinal; 1274 u++; 1275 } 1276 if (w != wend) 1277 *w = 0; 1278 return w - worig; 1279 } 1280 else { 
1281 nchar = 1; /* nul character */ 1282 while (u != uend) { 1283 if (*u > 0xffff) 1284 nchar += 2; 1285 else 1286 nchar++; 1287 u++; 1288 } 1289 return nchar; 1290 } 1291#else 1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" 1293#endif 1294} 1295 1296Py_ssize_t 1297PyUnicode_AsWideChar(PyObject *unicode, 1298 wchar_t *w, 1299 Py_ssize_t size) 1300{ 1301 if (unicode == NULL) { 1302 PyErr_BadInternalCall(); 1303 return -1; 1304 } 1305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 1306} 1307 1308wchar_t* 1309PyUnicode_AsWideCharString(PyObject *unicode, 1310 Py_ssize_t *size) 1311{ 1312 wchar_t* buffer; 1313 Py_ssize_t buflen; 1314 1315 if (unicode == NULL) { 1316 PyErr_BadInternalCall(); 1317 return NULL; 1318 } 1319 1320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 1321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 1322 PyErr_NoMemory(); 1323 return NULL; 1324 } 1325 1326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 1327 if (buffer == NULL) { 1328 PyErr_NoMemory(); 1329 return NULL; 1330 } 1331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 1332 if (size != NULL) 1333 *size = buflen; 1334 return buffer; 1335} 1336 1337#endif 1338 1339PyObject *PyUnicode_FromOrdinal(int ordinal) 1340{ 1341 Py_UNICODE s[2]; 1342 1343 if (ordinal < 0 || ordinal > 0x10ffff) { 1344 PyErr_SetString(PyExc_ValueError, 1345 "chr() arg not in range(0x110000)"); 1346 return NULL; 1347 } 1348 1349#ifndef Py_UNICODE_WIDE 1350 if (ordinal > 0xffff) { 1351 ordinal -= 0x10000; 1352 s[0] = 0xD800 | (ordinal >> 10); 1353 s[1] = 0xDC00 | (ordinal & 0x3FF); 1354 return PyUnicode_FromUnicode(s, 2); 1355 } 1356#endif 1357 1358 s[0] = (Py_UNICODE)ordinal; 1359 return PyUnicode_FromUnicode(s, 1); 1360} 1361 1362PyObject *PyUnicode_FromObject(register PyObject *obj) 1363{ 1364 /* XXX Perhaps we should make this API an alias of 1365 PyObject_Str() instead ?! 
*/ 1366 if (PyUnicode_CheckExact(obj)) { 1367 Py_INCREF(obj); 1368 return obj; 1369 } 1370 if (PyUnicode_Check(obj)) { 1371 /* For a Unicode subtype that's not a Unicode object, 1372 return a true Unicode object with the same data. */ 1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1374 PyUnicode_GET_SIZE(obj)); 1375 } 1376 PyErr_Format(PyExc_TypeError, 1377 "Can't convert '%.100s' object to str implicitly", 1378 Py_TYPE(obj)->tp_name); 1379 return NULL; 1380} 1381 1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1383 const char *encoding, 1384 const char *errors) 1385{ 1386 Py_buffer buffer; 1387 PyObject *v; 1388 1389 if (obj == NULL) { 1390 PyErr_BadInternalCall(); 1391 return NULL; 1392 } 1393 1394 /* Decoding bytes objects is the most common case and should be fast */ 1395 if (PyBytes_Check(obj)) { 1396 if (PyBytes_GET_SIZE(obj) == 0) { 1397 Py_INCREF(unicode_empty); 1398 v = (PyObject *) unicode_empty; 1399 } 1400 else { 1401 v = PyUnicode_Decode( 1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1403 encoding, errors); 1404 } 1405 return v; 1406 } 1407 1408 if (PyUnicode_Check(obj)) { 1409 PyErr_SetString(PyExc_TypeError, 1410 "decoding str is not supported"); 1411 return NULL; 1412 } 1413 1414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1416 PyErr_Format(PyExc_TypeError, 1417 "coercing to str: need bytes, bytearray " 1418 "or buffer-like object, %.80s found", 1419 Py_TYPE(obj)->tp_name); 1420 return NULL; 1421 } 1422 1423 if (buffer.len == 0) { 1424 Py_INCREF(unicode_empty); 1425 v = (PyObject *) unicode_empty; 1426 } 1427 else 1428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1429 1430 PyBuffer_Release(&buffer); 1431 return v; 1432} 1433 1434/* Convert encoding to lower case and replace '_' with '-' in order to 1435 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 1436 1 on success. 
*/ 1437static int 1438normalize_encoding(const char *encoding, 1439 char *lower, 1440 size_t lower_len) 1441{ 1442 const char *e; 1443 char *l; 1444 char *l_end; 1445 1446 e = encoding; 1447 l = lower; 1448 l_end = &lower[lower_len - 1]; 1449 while (*e) { 1450 if (l == l_end) 1451 return 0; 1452 if (Py_ISUPPER(*e)) { 1453 *l++ = Py_TOLOWER(*e++); 1454 } 1455 else if (*e == '_') { 1456 *l++ = '-'; 1457 e++; 1458 } 1459 else { 1460 *l++ = *e++; 1461 } 1462 } 1463 *l = '\0'; 1464 return 1; 1465} 1466 1467PyObject *PyUnicode_Decode(const char *s, 1468 Py_ssize_t size, 1469 const char *encoding, 1470 const char *errors) 1471{ 1472 PyObject *buffer = NULL, *unicode; 1473 Py_buffer info; 1474 char lower[11]; /* Enough for any encoding shortcut */ 1475 1476 if (encoding == NULL) 1477 encoding = PyUnicode_GetDefaultEncoding(); 1478 1479 /* Shortcuts for common default encodings */ 1480 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1481 if (strcmp(lower, "utf-8") == 0) 1482 return PyUnicode_DecodeUTF8(s, size, errors); 1483 else if ((strcmp(lower, "latin-1") == 0) || 1484 (strcmp(lower, "iso-8859-1") == 0)) 1485 return PyUnicode_DecodeLatin1(s, size, errors); 1486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1487 else if (strcmp(lower, "mbcs") == 0) 1488 return PyUnicode_DecodeMBCS(s, size, errors); 1489#endif 1490 else if (strcmp(lower, "ascii") == 0) 1491 return PyUnicode_DecodeASCII(s, size, errors); 1492 else if (strcmp(lower, "utf-16") == 0) 1493 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1494 else if (strcmp(lower, "utf-32") == 0) 1495 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1496 } 1497 1498 /* Decode via the codec registry */ 1499 buffer = NULL; 1500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1501 goto onError; 1502 buffer = PyMemoryView_FromBuffer(&info); 1503 if (buffer == NULL) 1504 goto onError; 1505 unicode = PyCodec_Decode(buffer, encoding, errors); 1506 if (unicode == NULL) 1507 goto 
onError; 1508 if (!PyUnicode_Check(unicode)) { 1509 PyErr_Format(PyExc_TypeError, 1510 "decoder did not return a str object (type=%.400s)", 1511 Py_TYPE(unicode)->tp_name); 1512 Py_DECREF(unicode); 1513 goto onError; 1514 } 1515 Py_DECREF(buffer); 1516 return unicode; 1517 1518 onError: 1519 Py_XDECREF(buffer); 1520 return NULL; 1521} 1522 1523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1524 const char *encoding, 1525 const char *errors) 1526{ 1527 PyObject *v; 1528 1529 if (!PyUnicode_Check(unicode)) { 1530 PyErr_BadArgument(); 1531 goto onError; 1532 } 1533 1534 if (encoding == NULL) 1535 encoding = PyUnicode_GetDefaultEncoding(); 1536 1537 /* Decode via the codec registry */ 1538 v = PyCodec_Decode(unicode, encoding, errors); 1539 if (v == NULL) 1540 goto onError; 1541 return v; 1542 1543 onError: 1544 return NULL; 1545} 1546 1547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1548 const char *encoding, 1549 const char *errors) 1550{ 1551 PyObject *v; 1552 1553 if (!PyUnicode_Check(unicode)) { 1554 PyErr_BadArgument(); 1555 goto onError; 1556 } 1557 1558 if (encoding == NULL) 1559 encoding = PyUnicode_GetDefaultEncoding(); 1560 1561 /* Decode via the codec registry */ 1562 v = PyCodec_Decode(unicode, encoding, errors); 1563 if (v == NULL) 1564 goto onError; 1565 if (!PyUnicode_Check(v)) { 1566 PyErr_Format(PyExc_TypeError, 1567 "decoder did not return a str object (type=%.400s)", 1568 Py_TYPE(v)->tp_name); 1569 Py_DECREF(v); 1570 goto onError; 1571 } 1572 return v; 1573 1574 onError: 1575 return NULL; 1576} 1577 1578PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1579 Py_ssize_t size, 1580 const char *encoding, 1581 const char *errors) 1582{ 1583 PyObject *v, *unicode; 1584 1585 unicode = PyUnicode_FromUnicode(s, size); 1586 if (unicode == NULL) 1587 return NULL; 1588 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1589 Py_DECREF(unicode); 1590 return v; 1591} 1592 1593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1594 const 
char *encoding, 1595 const char *errors) 1596{ 1597 PyObject *v; 1598 1599 if (!PyUnicode_Check(unicode)) { 1600 PyErr_BadArgument(); 1601 goto onError; 1602 } 1603 1604 if (encoding == NULL) 1605 encoding = PyUnicode_GetDefaultEncoding(); 1606 1607 /* Encode via the codec registry */ 1608 v = PyCodec_Encode(unicode, encoding, errors); 1609 if (v == NULL) 1610 goto onError; 1611 return v; 1612 1613 onError: 1614 return NULL; 1615} 1616 1617PyObject * 1618PyUnicode_EncodeFSDefault(PyObject *unicode) 1619{ 1620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1622 PyUnicode_GET_SIZE(unicode), 1623 NULL); 1624#elif defined(__APPLE__) 1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1626 PyUnicode_GET_SIZE(unicode), 1627 "surrogateescape"); 1628#else 1629 PyInterpreterState *interp = PyThreadState_GET()->interp; 1630 /* Bootstrap check: if the filesystem codec is implemented in Python, we 1631 cannot use it to encode and decode filenames before it is loaded. Load 1632 the Python codec requires to encode at least its own filename. Use the C 1633 version of the locale codec until the codec registry is initialized and 1634 the Python codec is loaded. 1635 1636 Py_FileSystemDefaultEncoding is shared between all interpreters, we 1637 cannot only rely on it: check also interp->fscodec_initialized for 1638 subinterpreters. 
*/ 1639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 1640 return PyUnicode_AsEncodedString(unicode, 1641 Py_FileSystemDefaultEncoding, 1642 "surrogateescape"); 1643 } 1644 else { 1645 /* locale encoding with surrogateescape */ 1646 wchar_t *wchar; 1647 char *bytes; 1648 PyObject *bytes_obj; 1649 size_t error_pos; 1650 1651 wchar = PyUnicode_AsWideCharString(unicode, NULL); 1652 if (wchar == NULL) 1653 return NULL; 1654 bytes = _Py_wchar2char(wchar, &error_pos); 1655 if (bytes == NULL) { 1656 if (error_pos != (size_t)-1) { 1657 char *errmsg = strerror(errno); 1658 PyObject *exc = NULL; 1659 if (errmsg == NULL) 1660 errmsg = "Py_wchar2char() failed"; 1661 raise_encode_exception(&exc, 1662 "filesystemencoding", 1663 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 1664 error_pos, error_pos+1, 1665 errmsg); 1666 Py_XDECREF(exc); 1667 } 1668 else 1669 PyErr_NoMemory(); 1670 PyMem_Free(wchar); 1671 return NULL; 1672 } 1673 PyMem_Free(wchar); 1674 1675 bytes_obj = PyBytes_FromString(bytes); 1676 PyMem_Free(bytes); 1677 return bytes_obj; 1678 } 1679#endif 1680} 1681 1682PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1683 const char *encoding, 1684 const char *errors) 1685{ 1686 PyObject *v; 1687 char lower[11]; /* Enough for any encoding shortcut */ 1688 1689 if (!PyUnicode_Check(unicode)) { 1690 PyErr_BadArgument(); 1691 return NULL; 1692 } 1693 1694 if (encoding == NULL) 1695 encoding = PyUnicode_GetDefaultEncoding(); 1696 1697 /* Shortcuts for common default encodings */ 1698 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1699 if (strcmp(lower, "utf-8") == 0) 1700 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1701 PyUnicode_GET_SIZE(unicode), 1702 errors); 1703 else if ((strcmp(lower, "latin-1") == 0) || 1704 (strcmp(lower, "iso-8859-1") == 0)) 1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1706 PyUnicode_GET_SIZE(unicode), 1707 errors); 1708#if defined(MS_WINDOWS) && 
defined(HAVE_USABLE_WCHAR_T) 1709 else if (strcmp(lower, "mbcs") == 0) 1710 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1711 PyUnicode_GET_SIZE(unicode), 1712 errors); 1713#endif 1714 else if (strcmp(lower, "ascii") == 0) 1715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1716 PyUnicode_GET_SIZE(unicode), 1717 errors); 1718 } 1719 /* During bootstrap, we may need to find the encodings 1720 package, to load the file system encoding, and require the 1721 file system encoding in order to load the encodings 1722 package. 1723 1724 Break out of this dependency by assuming that the path to 1725 the encodings module is ASCII-only. XXX could try wcstombs 1726 instead, if the file system encoding is the locale's 1727 encoding. */ 1728 if (Py_FileSystemDefaultEncoding && 1729 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && 1730 !PyThreadState_GET()->interp->codecs_initialized) 1731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1732 PyUnicode_GET_SIZE(unicode), 1733 errors); 1734 1735 /* Encode via the codec registry */ 1736 v = PyCodec_Encode(unicode, encoding, errors); 1737 if (v == NULL) 1738 return NULL; 1739 1740 /* The normal path */ 1741 if (PyBytes_Check(v)) 1742 return v; 1743 1744 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1745 if (PyByteArray_Check(v)) { 1746 int error; 1747 PyObject *b; 1748 1749 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1750 "encoder %s returned bytearray instead of bytes", 1751 encoding); 1752 if (error) { 1753 Py_DECREF(v); 1754 return NULL; 1755 } 1756 1757 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1758 Py_DECREF(v); 1759 return b; 1760 } 1761 1762 PyErr_Format(PyExc_TypeError, 1763 "encoder did not return a bytes object (type=%.400s)", 1764 Py_TYPE(v)->tp_name); 1765 Py_DECREF(v); 1766 return NULL; 1767} 1768 1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1770 const char *encoding, 1771 const char *errors) 1772{ 
1773 PyObject *v; 1774 1775 if (!PyUnicode_Check(unicode)) { 1776 PyErr_BadArgument(); 1777 goto onError; 1778 } 1779 1780 if (encoding == NULL) 1781 encoding = PyUnicode_GetDefaultEncoding(); 1782 1783 /* Encode via the codec registry */ 1784 v = PyCodec_Encode(unicode, encoding, errors); 1785 if (v == NULL) 1786 goto onError; 1787 if (!PyUnicode_Check(v)) { 1788 PyErr_Format(PyExc_TypeError, 1789 "encoder did not return an str object (type=%.400s)", 1790 Py_TYPE(v)->tp_name); 1791 Py_DECREF(v); 1792 goto onError; 1793 } 1794 return v; 1795 1796 onError: 1797 return NULL; 1798} 1799 1800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1801 const char *errors) 1802{ 1803 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1804 if (v) 1805 return v; 1806 if (errors != NULL) 1807 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1808 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1809 PyUnicode_GET_SIZE(unicode), 1810 NULL); 1811 if (!v) 1812 return NULL; 1813 ((PyUnicodeObject *)unicode)->defenc = v; 1814 return v; 1815} 1816 1817PyObject* 1818PyUnicode_DecodeFSDefault(const char *s) { 1819 Py_ssize_t size = (Py_ssize_t)strlen(s); 1820 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1821} 1822 1823PyObject* 1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1825{ 1826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1827 return PyUnicode_DecodeMBCS(s, size, NULL); 1828#elif defined(__APPLE__) 1829 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1830#else 1831 PyInterpreterState *interp = PyThreadState_GET()->interp; 1832 /* Bootstrap check: if the filesystem codec is implemented in Python, we 1833 cannot use it to encode and decode filenames before it is loaded. Load 1834 the Python codec requires to encode at least its own filename. Use the C 1835 version of the locale codec until the codec registry is initialized and 1836 the Python codec is loaded. 
1837 1838 Py_FileSystemDefaultEncoding is shared between all interpreters, we 1839 cannot only rely on it: check also interp->fscodec_initialized for 1840 subinterpreters. */ 1841 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 1842 return PyUnicode_Decode(s, size, 1843 Py_FileSystemDefaultEncoding, 1844 "surrogateescape"); 1845 } 1846 else { 1847 /* locale encoding with surrogateescape */ 1848 wchar_t *wchar; 1849 PyObject *unicode; 1850 size_t len; 1851 1852 if (s[size] != '\0' || size != strlen(s)) { 1853 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1854 return NULL; 1855 } 1856 1857 wchar = _Py_char2wchar(s, &len); 1858 if (wchar == NULL) 1859 return PyErr_NoMemory(); 1860 1861 unicode = PyUnicode_FromWideChar(wchar, len); 1862 PyMem_Free(wchar); 1863 return unicode; 1864 } 1865#endif 1866} 1867 1868 1869int 1870PyUnicode_FSConverter(PyObject* arg, void* addr) 1871{ 1872 PyObject *output = NULL; 1873 Py_ssize_t size; 1874 void *data; 1875 if (arg == NULL) { 1876 Py_DECREF(*(PyObject**)addr); 1877 return 1; 1878 } 1879 if (PyBytes_Check(arg)) { 1880 output = arg; 1881 Py_INCREF(output); 1882 } 1883 else { 1884 arg = PyUnicode_FromObject(arg); 1885 if (!arg) 1886 return 0; 1887 output = PyUnicode_EncodeFSDefault(arg); 1888 Py_DECREF(arg); 1889 if (!output) 1890 return 0; 1891 if (!PyBytes_Check(output)) { 1892 Py_DECREF(output); 1893 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1894 return 0; 1895 } 1896 } 1897 size = PyBytes_GET_SIZE(output); 1898 data = PyBytes_AS_STRING(output); 1899 if (size != strlen(data)) { 1900 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1901 Py_DECREF(output); 1902 return 0; 1903 } 1904 *(PyObject**)addr = output; 1905 return Py_CLEANUP_SUPPORTED; 1906} 1907 1908 1909int 1910PyUnicode_FSDecoder(PyObject* arg, void* addr) 1911{ 1912 PyObject *output = NULL; 1913 Py_ssize_t size; 1914 void *data; 1915 if (arg == NULL) { 1916 Py_DECREF(*(PyObject**)addr); 1917 return 
1; 1918 } 1919 if (PyUnicode_Check(arg)) { 1920 output = arg; 1921 Py_INCREF(output); 1922 } 1923 else { 1924 arg = PyBytes_FromObject(arg); 1925 if (!arg) 1926 return 0; 1927 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1928 PyBytes_GET_SIZE(arg)); 1929 Py_DECREF(arg); 1930 if (!output) 1931 return 0; 1932 if (!PyUnicode_Check(output)) { 1933 Py_DECREF(output); 1934 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1935 return 0; 1936 } 1937 } 1938 size = PyUnicode_GET_SIZE(output); 1939 data = PyUnicode_AS_UNICODE(output); 1940 if (size != Py_UNICODE_strlen(data)) { 1941 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1942 Py_DECREF(output); 1943 return 0; 1944 } 1945 *(PyObject**)addr = output; 1946 return Py_CLEANUP_SUPPORTED; 1947} 1948 1949 1950char* 1951_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1952{ 1953 PyObject *bytes; 1954 if (!PyUnicode_Check(unicode)) { 1955 PyErr_BadArgument(); 1956 return NULL; 1957 } 1958 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1959 if (bytes == NULL) 1960 return NULL; 1961 if (psize != NULL) 1962 *psize = PyBytes_GET_SIZE(bytes); 1963 return PyBytes_AS_STRING(bytes); 1964} 1965 1966char* 1967_PyUnicode_AsString(PyObject *unicode) 1968{ 1969 return _PyUnicode_AsStringAndSize(unicode, NULL); 1970} 1971 1972Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1973{ 1974 if (!PyUnicode_Check(unicode)) { 1975 PyErr_BadArgument(); 1976 goto onError; 1977 } 1978 return PyUnicode_AS_UNICODE(unicode); 1979 1980 onError: 1981 return NULL; 1982} 1983 1984Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1985{ 1986 if (!PyUnicode_Check(unicode)) { 1987 PyErr_BadArgument(); 1988 goto onError; 1989 } 1990 return PyUnicode_GET_SIZE(unicode); 1991 1992 onError: 1993 return -1; 1994} 1995 1996const char *PyUnicode_GetDefaultEncoding(void) 1997{ 1998 return "utf-8"; 1999} 2000 2001/* create or adjust a UnicodeDecodeError */ 2002static void 
2003make_decode_exception(PyObject **exceptionObject, 2004 const char *encoding, 2005 const char *input, Py_ssize_t length, 2006 Py_ssize_t startpos, Py_ssize_t endpos, 2007 const char *reason) 2008{ 2009 if (*exceptionObject == NULL) { 2010 *exceptionObject = PyUnicodeDecodeError_Create( 2011 encoding, input, length, startpos, endpos, reason); 2012 } 2013 else { 2014 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 2015 goto onError; 2016 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 2017 goto onError; 2018 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 2019 goto onError; 2020 } 2021 return; 2022 2023onError: 2024 Py_DECREF(*exceptionObject); 2025 *exceptionObject = NULL; 2026} 2027 2028/* error handling callback helper: 2029 build arguments, call the callback and check the arguments, 2030 if no exception occurred, copy the replacement to the output 2031 and adjust various state variables. 2032 return 0 on success, -1 on error 2033*/ 2034 2035static 2036int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 2037 const char *encoding, const char *reason, 2038 const char **input, const char **inend, Py_ssize_t *startinpos, 2039 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 2040 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 2041{ 2042 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 2043 2044 PyObject *restuple = NULL; 2045 PyObject *repunicode = NULL; 2046 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 2047 Py_ssize_t insize; 2048 Py_ssize_t requiredsize; 2049 Py_ssize_t newpos; 2050 Py_UNICODE *repptr; 2051 PyObject *inputobj = NULL; 2052 Py_ssize_t repsize; 2053 int res = -1; 2054 2055 if (*errorHandler == NULL) { 2056 *errorHandler = PyCodec_LookupError(errors); 2057 if (*errorHandler == NULL) 2058 goto onError; 2059 } 2060 2061 make_decode_exception(exceptionObject, 2062 encoding, 2063 *input, *inend - *input, 
2064 *startinpos, *endinpos, 2065 reason); 2066 if (*exceptionObject == NULL) 2067 goto onError; 2068 2069 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 2070 if (restuple == NULL) 2071 goto onError; 2072 if (!PyTuple_Check(restuple)) { 2073 PyErr_SetString(PyExc_TypeError, &argparse[4]); 2074 goto onError; 2075 } 2076 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 2077 goto onError; 2078 2079 /* Copy back the bytes variables, which might have been modified by the 2080 callback */ 2081 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 2082 if (!inputobj) 2083 goto onError; 2084 if (!PyBytes_Check(inputobj)) { 2085 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 2086 } 2087 *input = PyBytes_AS_STRING(inputobj); 2088 insize = PyBytes_GET_SIZE(inputobj); 2089 *inend = *input + insize; 2090 /* we can DECREF safely, as the exception has another reference, 2091 so the object won't go away. */ 2092 Py_DECREF(inputobj); 2093 2094 if (newpos<0) 2095 newpos = insize+newpos; 2096 if (newpos<0 || newpos>insize) { 2097 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 2098 goto onError; 2099 } 2100 2101 /* need more space? 
(at least enough for what we 2102 have+the replacement+the rest of the string (starting 2103 at the new input position), so we won't have to check space 2104 when there are no errors in the rest of the string) */ 2105 repptr = PyUnicode_AS_UNICODE(repunicode); 2106 repsize = PyUnicode_GET_SIZE(repunicode); 2107 requiredsize = *outpos + repsize + insize-newpos; 2108 if (requiredsize > outsize) { 2109 if (requiredsize<2*outsize) 2110 requiredsize = 2*outsize; 2111 if (_PyUnicode_Resize(output, requiredsize) < 0) 2112 goto onError; 2113 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 2114 } 2115 *endinpos = newpos; 2116 *inptr = *input + newpos; 2117 Py_UNICODE_COPY(*outptr, repptr, repsize); 2118 *outptr += repsize; 2119 *outpos += repsize; 2120 2121 /* we made it! */ 2122 res = 0; 2123 2124 onError: 2125 Py_XDECREF(restuple); 2126 return res; 2127} 2128 2129/* --- UTF-7 Codec -------------------------------------------------------- */ 2130 2131/* See RFC2152 for details. We encode conservatively and decode liberally. */ 2132 2133/* Three simple macros defining base-64. */ 2134 2135/* Is c a base-64 character? */ 2136 2137#define IS_BASE64(c) \ 2138 (((c) >= 'A' && (c) <= 'Z') || \ 2139 ((c) >= 'a' && (c) <= 'z') || \ 2140 ((c) >= '0' && (c) <= '9') || \ 2141 (c) == '+' || (c) == '/') 2142 2143/* given that c is a base-64 character, what is its base-64 value? */ 2144 2145#define FROM_BASE64(c) \ 2146 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 2147 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 2148 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 2149 (c) == '+' ? 62 : 63) 2150 2151/* What is the base-64 character of the bottom 6 bits of n? */ 2152 2153#define TO_BASE64(n) \ 2154 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 2155 2156/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 2157 * decoded as itself. 
We are permissive on decoding; the only ASCII 2158 * byte not decoding to itself is the + which begins a base64 2159 * string. */ 2160 2161#define DECODE_DIRECT(c) \ 2162 ((c) <= 127 && (c) != '+') 2163 2164/* The UTF-7 encoder treats ASCII characters differently according to 2165 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 2166 * the above). See RFC2152. This array identifies these different 2167 * sets: 2168 * 0 : "Set D" 2169 * alphanumeric and '(),-./:? 2170 * 1 : "Set O" 2171 * !"#$%&*;<=>@[]^_`{|} 2172 * 2 : "whitespace" 2173 * ht nl cr sp 2174 * 3 : special (must be base64 encoded) 2175 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 2176 */ 2177 2178static 2179char utf7_category[128] = { 2180/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 2181 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2182/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 2183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2184/* sp ! " # $ % & ' ( ) * + , - . / */ 2185 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 2186/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 2187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2188/* @ A B C D E F G H I J K L M N O */ 2189 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2190/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 2191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 2192/* ` a b c d e f g h i j k l m n o */ 2193 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2194/* p q r s t u v w x y z { | } ~ del */ 2195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 2196}; 2197 2198/* ENCODE_DIRECT: this character should be encoded as itself. The 2199 * answer depends on whether we are encoding set O as itself, and also 2200 * on whether we are encoding whitespace as itself. RFC2152 makes it 2201 * clear that the answers to these questions vary between 2202 * applications, so this code needs to be flexible. 
*/ 2203 2204#define ENCODE_DIRECT(c, directO, directWS) \ 2205 ((c) < 128 && (c) > 0 && \ 2206 ((utf7_category[(c)] == 0) || \ 2207 (directWS && (utf7_category[(c)] == 2)) || \ 2208 (directO && (utf7_category[(c)] == 1)))) 2209 2210PyObject *PyUnicode_DecodeUTF7(const char *s, 2211 Py_ssize_t size, 2212 const char *errors) 2213{ 2214 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 2215} 2216 2217/* The decoder. The only state we preserve is our read position, 2218 * i.e. how many characters we have consumed. So if we end in the 2219 * middle of a shift sequence we have to back off the read position 2220 * and the output to the beginning of the sequence, otherwise we lose 2221 * all the shift state (seen bits, number of bits seen, high 2222 * surrogate). */ 2223 2224PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 2225 Py_ssize_t size, 2226 const char *errors, 2227 Py_ssize_t *consumed) 2228{ 2229 const char *starts = s; 2230 Py_ssize_t startinpos; 2231 Py_ssize_t endinpos; 2232 Py_ssize_t outpos; 2233 const char *e; 2234 PyUnicodeObject *unicode; 2235 Py_UNICODE *p; 2236 const char *errmsg = ""; 2237 int inShift = 0; 2238 Py_UNICODE *shiftOutStart; 2239 unsigned int base64bits = 0; 2240 unsigned long base64buffer = 0; 2241 Py_UNICODE surrogate = 0; 2242 PyObject *errorHandler = NULL; 2243 PyObject *exc = NULL; 2244 2245 unicode = _PyUnicode_New(size); 2246 if (!unicode) 2247 return NULL; 2248 if (size == 0) { 2249 if (consumed) 2250 *consumed = 0; 2251 return (PyObject *)unicode; 2252 } 2253 2254 p = unicode->str; 2255 shiftOutStart = p; 2256 e = s + size; 2257 2258 while (s < e) { 2259 Py_UNICODE ch; 2260 restart: 2261 ch = (unsigned char) *s; 2262 2263 if (inShift) { /* in a base-64 section */ 2264 if (IS_BASE64(ch)) { /* consume a base-64 character */ 2265 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 2266 base64bits += 6; 2267 s++; 2268 if (base64bits >= 16) { 2269 /* we have enough bits for a UTF-16 value */ 2270 Py_UNICODE outCh = 
(Py_UNICODE) 2271 (base64buffer >> (base64bits-16)); 2272 base64bits -= 16; 2273 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 2274 if (surrogate) { 2275 /* expecting a second surrogate */ 2276 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2277#ifdef Py_UNICODE_WIDE 2278 *p++ = (((surrogate & 0x3FF)<<10) 2279 | (outCh & 0x3FF)) + 0x10000; 2280#else 2281 *p++ = surrogate; 2282 *p++ = outCh; 2283#endif 2284 surrogate = 0; 2285 } 2286 else { 2287 surrogate = 0; 2288 errmsg = "second surrogate missing"; 2289 goto utf7Error; 2290 } 2291 } 2292 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 2293 /* first surrogate */ 2294 surrogate = outCh; 2295 } 2296 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2297 errmsg = "unexpected second surrogate"; 2298 goto utf7Error; 2299 } 2300 else { 2301 *p++ = outCh; 2302 } 2303 } 2304 } 2305 else { /* now leaving a base-64 section */ 2306 inShift = 0; 2307 s++; 2308 if (surrogate) { 2309 errmsg = "second surrogate missing at end of shift sequence"; 2310 goto utf7Error; 2311 } 2312 if (base64bits > 0) { /* left-over bits */ 2313 if (base64bits >= 6) { 2314 /* We've seen at least one base-64 character */ 2315 errmsg = "partial character in shift sequence"; 2316 goto utf7Error; 2317 } 2318 else { 2319 /* Some bits remain; they should be zero */ 2320 if (base64buffer != 0) { 2321 errmsg = "non-zero padding bits in shift sequence"; 2322 goto utf7Error; 2323 } 2324 } 2325 } 2326 if (ch != '-') { 2327 /* '-' is absorbed; other terminating 2328 characters are preserved */ 2329 *p++ = ch; 2330 } 2331 } 2332 } 2333 else if ( ch == '+' ) { 2334 startinpos = s-starts; 2335 s++; /* consume '+' */ 2336 if (s < e && *s == '-') { /* '+-' encodes '+' */ 2337 s++; 2338 *p++ = '+'; 2339 } 2340 else { /* begin base64-encoded section */ 2341 inShift = 1; 2342 shiftOutStart = p; 2343 base64bits = 0; 2344 } 2345 } 2346 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 2347 *p++ = ch; 2348 s++; 2349 } 2350 else { 2351 startinpos = 
s-starts; 2352 s++; 2353 errmsg = "unexpected special character"; 2354 goto utf7Error; 2355 } 2356 continue; 2357utf7Error: 2358 outpos = p-PyUnicode_AS_UNICODE(unicode); 2359 endinpos = s-starts; 2360 if (unicode_decode_call_errorhandler( 2361 errors, &errorHandler, 2362 "utf7", errmsg, 2363 &starts, &e, &startinpos, &endinpos, &exc, &s, 2364 &unicode, &outpos, &p)) 2365 goto onError; 2366 } 2367 2368 /* end of string */ 2369 2370 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 2371 /* if we're in an inconsistent state, that's an error */ 2372 if (surrogate || 2373 (base64bits >= 6) || 2374 (base64bits > 0 && base64buffer != 0)) { 2375 outpos = p-PyUnicode_AS_UNICODE(unicode); 2376 endinpos = size; 2377 if (unicode_decode_call_errorhandler( 2378 errors, &errorHandler, 2379 "utf7", "unterminated shift sequence", 2380 &starts, &e, &startinpos, &endinpos, &exc, &s, 2381 &unicode, &outpos, &p)) 2382 goto onError; 2383 if (s < e) 2384 goto restart; 2385 } 2386 } 2387 2388 /* return state */ 2389 if (consumed) { 2390 if (inShift) { 2391 p = shiftOutStart; /* back off output */ 2392 *consumed = startinpos; 2393 } 2394 else { 2395 *consumed = s-starts; 2396 } 2397 } 2398 2399 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 2400 goto onError; 2401 2402 Py_XDECREF(errorHandler); 2403 Py_XDECREF(exc); 2404 return (PyObject *)unicode; 2405 2406 onError: 2407 Py_XDECREF(errorHandler); 2408 Py_XDECREF(exc); 2409 Py_DECREF(unicode); 2410 return NULL; 2411} 2412 2413 2414PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2415 Py_ssize_t size, 2416 int base64SetO, 2417 int base64WhiteSpace, 2418 const char *errors) 2419{ 2420 PyObject *v; 2421 /* It might be possible to tighten this worst case */ 2422 Py_ssize_t allocated = 8 * size; 2423 int inShift = 0; 2424 Py_ssize_t i = 0; 2425 unsigned int base64bits = 0; 2426 unsigned long base64buffer = 0; 2427 char * out; 2428 char * start; 2429 2430 if (size == 0) 2431 return 
PyBytes_FromStringAndSize(NULL, 0); 2432 2433 if (allocated / 8 != size) 2434 return PyErr_NoMemory(); 2435 2436 v = PyBytes_FromStringAndSize(NULL, allocated); 2437 if (v == NULL) 2438 return NULL; 2439 2440 start = out = PyBytes_AS_STRING(v); 2441 for (;i < size; ++i) { 2442 Py_UNICODE ch = s[i]; 2443 2444 if (inShift) { 2445 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2446 /* shifting out */ 2447 if (base64bits) { /* output remaining bits */ 2448 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2449 base64buffer = 0; 2450 base64bits = 0; 2451 } 2452 inShift = 0; 2453 /* Characters not in the BASE64 set implicitly unshift the sequence 2454 so no '-' is required, except if the character is itself a '-' */ 2455 if (IS_BASE64(ch) || ch == '-') { 2456 *out++ = '-'; 2457 } 2458 *out++ = (char) ch; 2459 } 2460 else { 2461 goto encode_char; 2462 } 2463 } 2464 else { /* not in a shift sequence */ 2465 if (ch == '+') { 2466 *out++ = '+'; 2467 *out++ = '-'; 2468 } 2469 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2470 *out++ = (char) ch; 2471 } 2472 else { 2473 *out++ = '+'; 2474 inShift = 1; 2475 goto encode_char; 2476 } 2477 } 2478 continue; 2479encode_char: 2480#ifdef Py_UNICODE_WIDE 2481 if (ch >= 0x10000) { 2482 /* code first surrogate */ 2483 base64bits += 16; 2484 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2485 while (base64bits >= 6) { 2486 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2487 base64bits -= 6; 2488 } 2489 /* prepare second surrogate */ 2490 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2491 } 2492#endif 2493 base64bits += 16; 2494 base64buffer = (base64buffer << 16) | ch; 2495 while (base64bits >= 6) { 2496 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2497 base64bits -= 6; 2498 } 2499 } 2500 if (base64bits) 2501 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2502 if (inShift) 2503 *out++ = '-'; 2504 if (_PyBytes_Resize(&v, out - start) < 0) 2505 return NULL; 2506 return v; 2507} 2508 
2509#undef IS_BASE64 2510#undef FROM_BASE64 2511#undef TO_BASE64 2512#undef DECODE_DIRECT 2513#undef ENCODE_DIRECT 2514 2515/* --- UTF-8 Codec -------------------------------------------------------- */ 2516 2517static 2518char utf8_code_length[256] = { 2519 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2520 illegal prefix. See RFC 3629 for details */ 2521 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2522 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2523 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2524 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2525 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2526 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2527 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2528 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2529 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2530 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2531 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2532 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2533 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2534 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2535 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2536 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2537}; 2538 2539PyObject *PyUnicode_DecodeUTF8(const char *s, 2540 Py_ssize_t size, 2541 const char *errors) 2542{ 2543 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2544} 2545 2546/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2547#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2548 2549/* Mask to quickly check whether a C 'long' contains a 2550 non-ASCII, UTF8-encoded char. */ 2551#if (SIZEOF_LONG == 8) 2552# define ASCII_CHAR_MASK 0x8080808080808080L 2553#elif (SIZEOF_LONG == 4) 2554# define ASCII_CHAR_MASK 0x80808080L 2555#else 2556# error C 'long' size should be either 4 or 8! 
2557#endif 2558 2559PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2560 Py_ssize_t size, 2561 const char *errors, 2562 Py_ssize_t *consumed) 2563{ 2564 const char *starts = s; 2565 int n; 2566 int k; 2567 Py_ssize_t startinpos; 2568 Py_ssize_t endinpos; 2569 Py_ssize_t outpos; 2570 const char *e, *aligned_end; 2571 PyUnicodeObject *unicode; 2572 Py_UNICODE *p; 2573 const char *errmsg = ""; 2574 PyObject *errorHandler = NULL; 2575 PyObject *exc = NULL; 2576 2577 /* Note: size will always be longer than the resulting Unicode 2578 character count */ 2579 unicode = _PyUnicode_New(size); 2580 if (!unicode) 2581 return NULL; 2582 if (size == 0) { 2583 if (consumed) 2584 *consumed = 0; 2585 return (PyObject *)unicode; 2586 } 2587 2588 /* Unpack UTF-8 encoded data */ 2589 p = unicode->str; 2590 e = s + size; 2591 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2592 2593 while (s < e) { 2594 Py_UCS4 ch = (unsigned char)*s; 2595 2596 if (ch < 0x80) { 2597 /* Fast path for runs of ASCII characters. Given that common UTF-8 2598 input will consist of an overwhelming majority of ASCII 2599 characters, we try to optimize for this case by checking 2600 as many characters as a C 'long' can contain. 2601 First, check if we can do an aligned read, as most CPUs have 2602 a penalty for unaligned reads. 2603 */ 2604 if (!((size_t) s & LONG_PTR_MASK)) { 2605 /* Help register allocation */ 2606 register const char *_s = s; 2607 register Py_UNICODE *_p = p; 2608 while (_s < aligned_end) { 2609 /* Read a whole long at a time (either 4 or 8 bytes), 2610 and do a fast unrolled copy if it only contains ASCII 2611 characters. 
*/ 2612 unsigned long data = *(unsigned long *) _s; 2613 if (data & ASCII_CHAR_MASK) 2614 break; 2615 _p[0] = (unsigned char) _s[0]; 2616 _p[1] = (unsigned char) _s[1]; 2617 _p[2] = (unsigned char) _s[2]; 2618 _p[3] = (unsigned char) _s[3]; 2619#if (SIZEOF_LONG == 8) 2620 _p[4] = (unsigned char) _s[4]; 2621 _p[5] = (unsigned char) _s[5]; 2622 _p[6] = (unsigned char) _s[6]; 2623 _p[7] = (unsigned char) _s[7]; 2624#endif 2625 _s += SIZEOF_LONG; 2626 _p += SIZEOF_LONG; 2627 } 2628 s = _s; 2629 p = _p; 2630 if (s == e) 2631 break; 2632 ch = (unsigned char)*s; 2633 } 2634 } 2635 2636 if (ch < 0x80) { 2637 *p++ = (Py_UNICODE)ch; 2638 s++; 2639 continue; 2640 } 2641 2642 n = utf8_code_length[ch]; 2643 2644 if (s + n > e) { 2645 if (consumed) 2646 break; 2647 else { 2648 errmsg = "unexpected end of data"; 2649 startinpos = s-starts; 2650 endinpos = startinpos+1; 2651 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2652 endinpos++; 2653 goto utf8Error; 2654 } 2655 } 2656 2657 switch (n) { 2658 2659 case 0: 2660 errmsg = "invalid start byte"; 2661 startinpos = s-starts; 2662 endinpos = startinpos+1; 2663 goto utf8Error; 2664 2665 case 1: 2666 errmsg = "internal error"; 2667 startinpos = s-starts; 2668 endinpos = startinpos+1; 2669 goto utf8Error; 2670 2671 case 2: 2672 if ((s[1] & 0xc0) != 0x80) { 2673 errmsg = "invalid continuation byte"; 2674 startinpos = s-starts; 2675 endinpos = startinpos + 1; 2676 goto utf8Error; 2677 } 2678 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2679 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2680 *p++ = (Py_UNICODE)ch; 2681 break; 2682 2683 case 3: 2684 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2685 will result in surrogates in range d800-dfff. Surrogates are 2686 not valid UTF-8 so they are rejected. 
2687 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2688 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2689 if ((s[1] & 0xc0) != 0x80 || 2690 (s[2] & 0xc0) != 0x80 || 2691 ((unsigned char)s[0] == 0xE0 && 2692 (unsigned char)s[1] < 0xA0) || 2693 ((unsigned char)s[0] == 0xED && 2694 (unsigned char)s[1] > 0x9F)) { 2695 errmsg = "invalid continuation byte"; 2696 startinpos = s-starts; 2697 endinpos = startinpos + 1; 2698 2699 /* if s[1] first two bits are 1 and 0, then the invalid 2700 continuation byte is s[2], so increment endinpos by 1, 2701 if not, s[1] is invalid and endinpos doesn't need to 2702 be incremented. */ 2703 if ((s[1] & 0xC0) == 0x80) 2704 endinpos++; 2705 goto utf8Error; 2706 } 2707 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2708 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2709 *p++ = (Py_UNICODE)ch; 2710 break; 2711 2712 case 4: 2713 if ((s[1] & 0xc0) != 0x80 || 2714 (s[2] & 0xc0) != 0x80 || 2715 (s[3] & 0xc0) != 0x80 || 2716 ((unsigned char)s[0] == 0xF0 && 2717 (unsigned char)s[1] < 0x90) || 2718 ((unsigned char)s[0] == 0xF4 && 2719 (unsigned char)s[1] > 0x8F)) { 2720 errmsg = "invalid continuation byte"; 2721 startinpos = s-starts; 2722 endinpos = startinpos + 1; 2723 if ((s[1] & 0xC0) == 0x80) { 2724 endinpos++; 2725 if ((s[2] & 0xC0) == 0x80) 2726 endinpos++; 2727 } 2728 goto utf8Error; 2729 } 2730 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2731 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2732 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2733 2734#ifdef Py_UNICODE_WIDE 2735 *p++ = (Py_UNICODE)ch; 2736#else 2737 /* compute and append the two surrogates: */ 2738 2739 /* translate from 10000..10FFFF to 0..FFFF */ 2740 ch -= 0x10000; 2741 2742 /* high surrogate = top 10 bits added to D800 */ 2743 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2744 2745 /* low surrogate = bottom 10 bits added to DC00 */ 2746 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2747#endif 2748 break; 2749 } 2750 s += n; 2751 
continue; 2752 2753 utf8Error: 2754 outpos = p-PyUnicode_AS_UNICODE(unicode); 2755 if (unicode_decode_call_errorhandler( 2756 errors, &errorHandler, 2757 "utf8", errmsg, 2758 &starts, &e, &startinpos, &endinpos, &exc, &s, 2759 &unicode, &outpos, &p)) 2760 goto onError; 2761 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2762 } 2763 if (consumed) 2764 *consumed = s-starts; 2765 2766 /* Adjust length */ 2767 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2768 goto onError; 2769 2770 Py_XDECREF(errorHandler); 2771 Py_XDECREF(exc); 2772 return (PyObject *)unicode; 2773 2774 onError: 2775 Py_XDECREF(errorHandler); 2776 Py_XDECREF(exc); 2777 Py_DECREF(unicode); 2778 return NULL; 2779} 2780 2781#undef ASCII_CHAR_MASK 2782 2783#ifdef __APPLE__ 2784 2785/* Simplified UTF-8 decoder using surrogateescape error handler, 2786 used to decode the command line arguments on Mac OS X. */ 2787 2788wchar_t* 2789_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 2790{ 2791 int n; 2792 const char *e; 2793 wchar_t *unicode, *p; 2794 2795 /* Note: size will always be longer than the resulting Unicode 2796 character count */ 2797 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 2798 PyErr_NoMemory(); 2799 return NULL; 2800 } 2801 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 2802 if (!unicode) 2803 return NULL; 2804 2805 /* Unpack UTF-8 encoded data */ 2806 p = unicode; 2807 e = s + size; 2808 while (s < e) { 2809 Py_UCS4 ch = (unsigned char)*s; 2810 2811 if (ch < 0x80) { 2812 *p++ = (wchar_t)ch; 2813 s++; 2814 continue; 2815 } 2816 2817 n = utf8_code_length[ch]; 2818 if (s + n > e) { 2819 goto surrogateescape; 2820 } 2821 2822 switch (n) { 2823 case 0: 2824 case 1: 2825 goto surrogateescape; 2826 2827 case 2: 2828 if ((s[1] & 0xc0) != 0x80) 2829 goto surrogateescape; 2830 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2831 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2832 *p++ = (wchar_t)ch; 2833 break; 2834 2835 case 3: 2836 /* Decoding UTF-8 
sequences in range \xed\xa0\x80-\xed\xbf\xbf 2837 will result in surrogates in range d800-dfff. Surrogates are 2838 not valid UTF-8 so they are rejected. 2839 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2840 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2841 if ((s[1] & 0xc0) != 0x80 || 2842 (s[2] & 0xc0) != 0x80 || 2843 ((unsigned char)s[0] == 0xE0 && 2844 (unsigned char)s[1] < 0xA0) || 2845 ((unsigned char)s[0] == 0xED && 2846 (unsigned char)s[1] > 0x9F)) { 2847 2848 goto surrogateescape; 2849 } 2850 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2851 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2852 *p++ = (Py_UNICODE)ch; 2853 break; 2854 2855 case 4: 2856 if ((s[1] & 0xc0) != 0x80 || 2857 (s[2] & 0xc0) != 0x80 || 2858 (s[3] & 0xc0) != 0x80 || 2859 ((unsigned char)s[0] == 0xF0 && 2860 (unsigned char)s[1] < 0x90) || 2861 ((unsigned char)s[0] == 0xF4 && 2862 (unsigned char)s[1] > 0x8F)) { 2863 goto surrogateescape; 2864 } 2865 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2866 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2867 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2868 2869#if SIZEOF_WCHAR_T == 4 2870 *p++ = (wchar_t)ch; 2871#else 2872 /* compute and append the two surrogates: */ 2873 2874 /* translate from 10000..10FFFF to 0..FFFF */ 2875 ch -= 0x10000; 2876 2877 /* high surrogate = top 10 bits added to D800 */ 2878 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 2879 2880 /* low surrogate = bottom 10 bits added to DC00 */ 2881 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 2882#endif 2883 break; 2884 } 2885 s += n; 2886 continue; 2887 2888 surrogateescape: 2889 *p++ = 0xDC00 + ch; 2890 s++; 2891 } 2892 *p = L'\0'; 2893 return unicode; 2894} 2895 2896#endif /* __APPLE__ */ 2897 2898/* Allocation strategy: if the string is short, convert into a stack buffer 2899 and allocate exactly as much space needed at the end. 
Else allocate the 2900 maximum possible needed (4 result bytes per Unicode character), and return 2901 the excess memory at the end. 2902*/ 2903PyObject * 2904PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2905 Py_ssize_t size, 2906 const char *errors) 2907{ 2908#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2909 2910 Py_ssize_t i; /* index into s of next input byte */ 2911 PyObject *result; /* result string object */ 2912 char *p; /* next free byte in output buffer */ 2913 Py_ssize_t nallocated; /* number of result bytes allocated */ 2914 Py_ssize_t nneeded; /* number of result bytes needed */ 2915 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2916 PyObject *errorHandler = NULL; 2917 PyObject *exc = NULL; 2918 2919 assert(s != NULL); 2920 assert(size >= 0); 2921 2922 if (size <= MAX_SHORT_UNICHARS) { 2923 /* Write into the stack buffer; nallocated can't overflow. 2924 * At the end, we'll allocate exactly as much heap space as it 2925 * turns out we need. 2926 */ 2927 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2928 result = NULL; /* will allocate after we're done */ 2929 p = stackbuf; 2930 } 2931 else { 2932 /* Overallocate on the heap, and give the excess back at the end. */ 2933 nallocated = size * 4; 2934 if (nallocated / 4 != size) /* overflow! 
*/ 2935 return PyErr_NoMemory(); 2936 result = PyBytes_FromStringAndSize(NULL, nallocated); 2937 if (result == NULL) 2938 return NULL; 2939 p = PyBytes_AS_STRING(result); 2940 } 2941 2942 for (i = 0; i < size;) { 2943 Py_UCS4 ch = s[i++]; 2944 2945 if (ch < 0x80) 2946 /* Encode ASCII */ 2947 *p++ = (char) ch; 2948 2949 else if (ch < 0x0800) { 2950 /* Encode Latin-1 */ 2951 *p++ = (char)(0xc0 | (ch >> 6)); 2952 *p++ = (char)(0x80 | (ch & 0x3f)); 2953 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2954#ifndef Py_UNICODE_WIDE 2955 /* Special case: check for high and low surrogate */ 2956 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2957 Py_UCS4 ch2 = s[i]; 2958 /* Combine the two surrogates to form a UCS4 value */ 2959 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2960 i++; 2961 2962 /* Encode UCS4 Unicode ordinals */ 2963 *p++ = (char)(0xf0 | (ch >> 18)); 2964 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2965 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2966 *p++ = (char)(0x80 | (ch & 0x3f)); 2967 } else { 2968#endif 2969 Py_ssize_t newpos; 2970 PyObject *rep; 2971 Py_ssize_t repsize, k; 2972 rep = unicode_encode_call_errorhandler 2973 (errors, &errorHandler, "utf-8", "surrogates not allowed", 2974 s, size, &exc, i-1, i, &newpos); 2975 if (!rep) 2976 goto error; 2977 2978 if (PyBytes_Check(rep)) 2979 repsize = PyBytes_GET_SIZE(rep); 2980 else 2981 repsize = PyUnicode_GET_SIZE(rep); 2982 2983 if (repsize > 4) { 2984 Py_ssize_t offset; 2985 2986 if (result == NULL) 2987 offset = p - stackbuf; 2988 else 2989 offset = p - PyBytes_AS_STRING(result); 2990 2991 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 2992 /* integer overflow */ 2993 PyErr_NoMemory(); 2994 goto error; 2995 } 2996 nallocated += repsize - 4; 2997 if (result != NULL) { 2998 if (_PyBytes_Resize(&result, nallocated) < 0) 2999 goto error; 3000 } else { 3001 result = PyBytes_FromStringAndSize(NULL, nallocated); 3002 if (result == NULL) 3003 goto error; 3004 
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 3005 } 3006 p = PyBytes_AS_STRING(result) + offset; 3007 } 3008 3009 if (PyBytes_Check(rep)) { 3010 char *prep = PyBytes_AS_STRING(rep); 3011 for(k = repsize; k > 0; k--) 3012 *p++ = *prep++; 3013 } else /* rep is unicode */ { 3014 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 3015 Py_UNICODE c; 3016 3017 for(k=0; k<repsize; k++) { 3018 c = prep[k]; 3019 if (0x80 <= c) { 3020 raise_encode_exception(&exc, "utf-8", s, size, 3021 i-1, i, "surrogates not allowed"); 3022 goto error; 3023 } 3024 *p++ = (char)prep[k]; 3025 } 3026 } 3027 Py_DECREF(rep); 3028#ifndef Py_UNICODE_WIDE 3029 } 3030#endif 3031 } else if (ch < 0x10000) { 3032 *p++ = (char)(0xe0 | (ch >> 12)); 3033 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3034 *p++ = (char)(0x80 | (ch & 0x3f)); 3035 } else /* ch >= 0x10000 */ { 3036 /* Encode UCS4 Unicode ordinals */ 3037 *p++ = (char)(0xf0 | (ch >> 18)); 3038 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 3039 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3040 *p++ = (char)(0x80 | (ch & 0x3f)); 3041 } 3042 } 3043 3044 if (result == NULL) { 3045 /* This was stack allocated. */ 3046 nneeded = p - stackbuf; 3047 assert(nneeded <= nallocated); 3048 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 3049 } 3050 else { 3051 /* Cut back to size actually needed. 
*/ 3052 nneeded = p - PyBytes_AS_STRING(result); 3053 assert(nneeded <= nallocated); 3054 _PyBytes_Resize(&result, nneeded); 3055 } 3056 Py_XDECREF(errorHandler); 3057 Py_XDECREF(exc); 3058 return result; 3059 error: 3060 Py_XDECREF(errorHandler); 3061 Py_XDECREF(exc); 3062 Py_XDECREF(result); 3063 return NULL; 3064 3065#undef MAX_SHORT_UNICHARS 3066} 3067 3068PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 3069{ 3070 if (!PyUnicode_Check(unicode)) { 3071 PyErr_BadArgument(); 3072 return NULL; 3073 } 3074 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 3075 PyUnicode_GET_SIZE(unicode), 3076 NULL); 3077} 3078 3079/* --- UTF-32 Codec ------------------------------------------------------- */ 3080 3081PyObject * 3082PyUnicode_DecodeUTF32(const char *s, 3083 Py_ssize_t size, 3084 const char *errors, 3085 int *byteorder) 3086{ 3087 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 3088} 3089 3090PyObject * 3091PyUnicode_DecodeUTF32Stateful(const char *s, 3092 Py_ssize_t size, 3093 const char *errors, 3094 int *byteorder, 3095 Py_ssize_t *consumed) 3096{ 3097 const char *starts = s; 3098 Py_ssize_t startinpos; 3099 Py_ssize_t endinpos; 3100 Py_ssize_t outpos; 3101 PyUnicodeObject *unicode; 3102 Py_UNICODE *p; 3103#ifndef Py_UNICODE_WIDE 3104 int pairs = 0; 3105 const unsigned char *qq; 3106#else 3107 const int pairs = 0; 3108#endif 3109 const unsigned char *q, *e; 3110 int bo = 0; /* assume native ordering by default */ 3111 const char *errmsg = ""; 3112 /* Offsets from q for retrieving bytes in the right order. */ 3113#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3114 int iorder[] = {0, 1, 2, 3}; 3115#else 3116 int iorder[] = {3, 2, 1, 0}; 3117#endif 3118 PyObject *errorHandler = NULL; 3119 PyObject *exc = NULL; 3120 3121 q = (unsigned char *)s; 3122 e = q + size; 3123 3124 if (byteorder) 3125 bo = *byteorder; 3126 3127 /* Check for BOM marks (U+FEFF) in the input and adjust current 3128 byte order setting accordingly. 
In native mode, the leading BOM 3129 mark is skipped, in all other modes, it is copied to the output 3130 stream as-is (giving a ZWNBSP character). */ 3131 if (bo == 0) { 3132 if (size >= 4) { 3133 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3134 (q[iorder[1]] << 8) | q[iorder[0]]; 3135#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3136 if (bom == 0x0000FEFF) { 3137 q += 4; 3138 bo = -1; 3139 } 3140 else if (bom == 0xFFFE0000) { 3141 q += 4; 3142 bo = 1; 3143 } 3144#else 3145 if (bom == 0x0000FEFF) { 3146 q += 4; 3147 bo = 1; 3148 } 3149 else if (bom == 0xFFFE0000) { 3150 q += 4; 3151 bo = -1; 3152 } 3153#endif 3154 } 3155 } 3156 3157 if (bo == -1) { 3158 /* force LE */ 3159 iorder[0] = 0; 3160 iorder[1] = 1; 3161 iorder[2] = 2; 3162 iorder[3] = 3; 3163 } 3164 else if (bo == 1) { 3165 /* force BE */ 3166 iorder[0] = 3; 3167 iorder[1] = 2; 3168 iorder[2] = 1; 3169 iorder[3] = 0; 3170 } 3171 3172 /* On narrow builds we split characters outside the BMP into two 3173 codepoints => count how much extra space we need. */ 3174#ifndef Py_UNICODE_WIDE 3175 for (qq = q; qq < e; qq += 4) 3176 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 3177 pairs++; 3178#endif 3179 3180 /* This might be one to much, because of a BOM */ 3181 unicode = _PyUnicode_New((size+3)/4+pairs); 3182 if (!unicode) 3183 return NULL; 3184 if (size == 0) 3185 return (PyObject *)unicode; 3186 3187 /* Unpack UTF-32 encoded data */ 3188 p = unicode->str; 3189 3190 while (q < e) { 3191 Py_UCS4 ch; 3192 /* remaining bytes at the end? 
(size should be divisible by 4) */ 3193 if (e-q<4) { 3194 if (consumed) 3195 break; 3196 errmsg = "truncated data"; 3197 startinpos = ((const char *)q)-starts; 3198 endinpos = ((const char *)e)-starts; 3199 goto utf32Error; 3200 /* The remaining input chars are ignored if the callback 3201 chooses to skip the input */ 3202 } 3203 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3204 (q[iorder[1]] << 8) | q[iorder[0]]; 3205 3206 if (ch >= 0x110000) 3207 { 3208 errmsg = "codepoint not in range(0x110000)"; 3209 startinpos = ((const char *)q)-starts; 3210 endinpos = startinpos+4; 3211 goto utf32Error; 3212 } 3213#ifndef Py_UNICODE_WIDE 3214 if (ch >= 0x10000) 3215 { 3216 *p++ = 0xD800 | ((ch-0x10000) >> 10); 3217 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 3218 } 3219 else 3220#endif 3221 *p++ = ch; 3222 q += 4; 3223 continue; 3224 utf32Error: 3225 outpos = p-PyUnicode_AS_UNICODE(unicode); 3226 if (unicode_decode_call_errorhandler( 3227 errors, &errorHandler, 3228 "utf32", errmsg, 3229 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 3230 &unicode, &outpos, &p)) 3231 goto onError; 3232 } 3233 3234 if (byteorder) 3235 *byteorder = bo; 3236 3237 if (consumed) 3238 *consumed = (const char *)q-starts; 3239 3240 /* Adjust length */ 3241 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3242 goto onError; 3243 3244 Py_XDECREF(errorHandler); 3245 Py_XDECREF(exc); 3246 return (PyObject *)unicode; 3247 3248 onError: 3249 Py_DECREF(unicode); 3250 Py_XDECREF(errorHandler); 3251 Py_XDECREF(exc); 3252 return NULL; 3253} 3254 3255PyObject * 3256PyUnicode_EncodeUTF32(const Py_UNICODE *s, 3257 Py_ssize_t size, 3258 const char *errors, 3259 int byteorder) 3260{ 3261 PyObject *v; 3262 unsigned char *p; 3263 Py_ssize_t nsize, bytesize; 3264#ifndef Py_UNICODE_WIDE 3265 Py_ssize_t i, pairs; 3266#else 3267 const int pairs = 0; 3268#endif 3269 /* Offsets from p for storing byte pairs in the right order. 
*/ 3270#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3271 int iorder[] = {0, 1, 2, 3}; 3272#else 3273 int iorder[] = {3, 2, 1, 0}; 3274#endif 3275 3276#define STORECHAR(CH) \ 3277 do { \ 3278 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 3279 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 3280 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 3281 p[iorder[0]] = (CH) & 0xff; \ 3282 p += 4; \ 3283 } while(0) 3284 3285 /* In narrow builds we can output surrogate pairs as one codepoint, 3286 so we need less space. */ 3287#ifndef Py_UNICODE_WIDE 3288 for (i = pairs = 0; i < size-1; i++) 3289 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 3290 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 3291 pairs++; 3292#endif 3293 nsize = (size - pairs + (byteorder == 0)); 3294 bytesize = nsize * 4; 3295 if (bytesize / 4 != nsize) 3296 return PyErr_NoMemory(); 3297 v = PyBytes_FromStringAndSize(NULL, bytesize); 3298 if (v == NULL) 3299 return NULL; 3300 3301 p = (unsigned char *)PyBytes_AS_STRING(v); 3302 if (byteorder == 0) 3303 STORECHAR(0xFEFF); 3304 if (size == 0) 3305 goto done; 3306 3307 if (byteorder == -1) { 3308 /* force LE */ 3309 iorder[0] = 0; 3310 iorder[1] = 1; 3311 iorder[2] = 2; 3312 iorder[3] = 3; 3313 } 3314 else if (byteorder == 1) { 3315 /* force BE */ 3316 iorder[0] = 3; 3317 iorder[1] = 2; 3318 iorder[2] = 1; 3319 iorder[3] = 0; 3320 } 3321 3322 while (size-- > 0) { 3323 Py_UCS4 ch = *s++; 3324#ifndef Py_UNICODE_WIDE 3325 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 3326 Py_UCS4 ch2 = *s; 3327 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3328 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3329 s++; 3330 size--; 3331 } 3332 } 3333#endif 3334 STORECHAR(ch); 3335 } 3336 3337 done: 3338 return v; 3339#undef STORECHAR 3340} 3341 3342PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 3343{ 3344 if (!PyUnicode_Check(unicode)) { 3345 PyErr_BadArgument(); 3346 return NULL; 3347 } 3348 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3349 PyUnicode_GET_SIZE(unicode), 3350 NULL, 3351 0); 3352} 3353 3354/* --- 
UTF-16 Codec ------------------------------------------------------- */ 3355 3356PyObject * 3357PyUnicode_DecodeUTF16(const char *s, 3358 Py_ssize_t size, 3359 const char *errors, 3360 int *byteorder) 3361{ 3362 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3363} 3364 3365/* Two masks for fast checking of whether a C 'long' may contain 3366 UTF16-encoded surrogate characters. This is an efficient heuristic, 3367 assuming that non-surrogate characters with a code point >= 0x8000 are 3368 rare in most input. 3369 FAST_CHAR_MASK is used when the input is in native byte ordering, 3370 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3371*/ 3372#if (SIZEOF_LONG == 8) 3373# define FAST_CHAR_MASK 0x8000800080008000L 3374# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3375#elif (SIZEOF_LONG == 4) 3376# define FAST_CHAR_MASK 0x80008000L 3377# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3378#else 3379# error C 'long' size should be either 4 or 8! 3380#endif 3381 3382PyObject * 3383PyUnicode_DecodeUTF16Stateful(const char *s, 3384 Py_ssize_t size, 3385 const char *errors, 3386 int *byteorder, 3387 Py_ssize_t *consumed) 3388{ 3389 const char *starts = s; 3390 Py_ssize_t startinpos; 3391 Py_ssize_t endinpos; 3392 Py_ssize_t outpos; 3393 PyUnicodeObject *unicode; 3394 Py_UNICODE *p; 3395 const unsigned char *q, *e, *aligned_end; 3396 int bo = 0; /* assume native ordering by default */ 3397 int native_ordering = 0; 3398 const char *errmsg = ""; 3399 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 3400#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3401 int ihi = 1, ilo = 0; 3402#else 3403 int ihi = 0, ilo = 1; 3404#endif 3405 PyObject *errorHandler = NULL; 3406 PyObject *exc = NULL; 3407 3408 /* Note: size will always be longer than the resulting Unicode 3409 character count */ 3410 unicode = _PyUnicode_New(size); 3411 if (!unicode) 3412 return NULL; 3413 if (size == 0) 3414 return (PyObject *)unicode; 3415 3416 /* Unpack UTF-16 encoded data */ 3417 p = unicode->str; 3418 q = (unsigned char *)s; 3419 e = q + size - 1; 3420 3421 if (byteorder) 3422 bo = *byteorder; 3423 3424 /* Check for BOM marks (U+FEFF) in the input and adjust current 3425 byte order setting accordingly. In native mode, the leading BOM 3426 mark is skipped, in all other modes, it is copied to the output 3427 stream as-is (giving a ZWNBSP character). */ 3428 if (bo == 0) { 3429 if (size >= 2) { 3430 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3431#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3432 if (bom == 0xFEFF) { 3433 q += 2; 3434 bo = -1; 3435 } 3436 else if (bom == 0xFFFE) { 3437 q += 2; 3438 bo = 1; 3439 } 3440#else 3441 if (bom == 0xFEFF) { 3442 q += 2; 3443 bo = 1; 3444 } 3445 else if (bom == 0xFFFE) { 3446 q += 2; 3447 bo = -1; 3448 } 3449#endif 3450 } 3451 } 3452 3453 if (bo == -1) { 3454 /* force LE */ 3455 ihi = 1; 3456 ilo = 0; 3457 } 3458 else if (bo == 1) { 3459 /* force BE */ 3460 ihi = 0; 3461 ilo = 1; 3462 } 3463#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3464 native_ordering = ilo < ihi; 3465#else 3466 native_ordering = ilo > ihi; 3467#endif 3468 3469 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3470 while (q < e) { 3471 Py_UNICODE ch; 3472 /* First check for possible aligned read of a C 'long'. Unaligned 3473 reads are more expensive, better to defer to another iteration. */ 3474 if (!((size_t) q & LONG_PTR_MASK)) { 3475 /* Fast path for runs of non-surrogate chars. 
*/ 3476 register const unsigned char *_q = q; 3477 Py_UNICODE *_p = p; 3478 if (native_ordering) { 3479 /* Native ordering is simple: as long as the input cannot 3480 possibly contain a surrogate char, do an unrolled copy 3481 of several 16-bit code points to the target object. 3482 The non-surrogate check is done on several input bytes 3483 at a time (as many as a C 'long' can contain). */ 3484 while (_q < aligned_end) { 3485 unsigned long data = * (unsigned long *) _q; 3486 if (data & FAST_CHAR_MASK) 3487 break; 3488 _p[0] = ((unsigned short *) _q)[0]; 3489 _p[1] = ((unsigned short *) _q)[1]; 3490#if (SIZEOF_LONG == 8) 3491 _p[2] = ((unsigned short *) _q)[2]; 3492 _p[3] = ((unsigned short *) _q)[3]; 3493#endif 3494 _q += SIZEOF_LONG; 3495 _p += SIZEOF_LONG / 2; 3496 } 3497 } 3498 else { 3499 /* Byteswapped ordering is similar, but we must decompose 3500 the copy bytewise, and take care of zero'ing out the 3501 upper bytes if the target object is in 32-bit units 3502 (that is, in UCS-4 builds). */ 3503 while (_q < aligned_end) { 3504 unsigned long data = * (unsigned long *) _q; 3505 if (data & SWAPPED_FAST_CHAR_MASK) 3506 break; 3507 /* Zero upper bytes in UCS-4 builds */ 3508#if (Py_UNICODE_SIZE > 2) 3509 _p[0] = 0; 3510 _p[1] = 0; 3511#if (SIZEOF_LONG == 8) 3512 _p[2] = 0; 3513 _p[3] = 0; 3514#endif 3515#endif 3516 /* Issue #4916; UCS-4 builds on big endian machines must 3517 fill the two last bytes of each 4-byte unit. 
*/ 3518#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3519# define OFF 2 3520#else 3521# define OFF 0 3522#endif 3523 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3524 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3525 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3526 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3527#if (SIZEOF_LONG == 8) 3528 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3529 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3530 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3531 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3532#endif 3533#undef OFF 3534 _q += SIZEOF_LONG; 3535 _p += SIZEOF_LONG / 2; 3536 } 3537 } 3538 p = _p; 3539 q = _q; 3540 if (q >= e) 3541 break; 3542 } 3543 ch = (q[ihi] << 8) | q[ilo]; 3544 3545 q += 2; 3546 3547 if (ch < 0xD800 || ch > 0xDFFF) { 3548 *p++ = ch; 3549 continue; 3550 } 3551 3552 /* UTF-16 code pair: */ 3553 if (q > e) { 3554 errmsg = "unexpected end of data"; 3555 startinpos = (((const char *)q) - 2) - starts; 3556 endinpos = ((const char *)e) + 1 - starts; 3557 goto utf16Error; 3558 } 3559 if (0xD800 <= ch && ch <= 0xDBFF) { 3560 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3561 q += 2; 3562 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3563#ifndef Py_UNICODE_WIDE 3564 *p++ = ch; 3565 *p++ = ch2; 3566#else 3567 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3568#endif 3569 continue; 3570 } 3571 else { 3572 errmsg = "illegal UTF-16 surrogate"; 3573 startinpos = (((const char *)q)-4)-starts; 3574 endinpos = startinpos+2; 3575 goto utf16Error; 3576 } 3577 3578 } 3579 errmsg = "illegal encoding"; 3580 startinpos = (((const char *)q)-2)-starts; 3581 endinpos = startinpos+2; 3582 /* Fall through to report the error */ 3583 3584 utf16Error: 3585 outpos = p - PyUnicode_AS_UNICODE(unicode); 3586 if (unicode_decode_call_errorhandler( 3587 errors, 3588 &errorHandler, 3589 "utf16", errmsg, 3590 &starts, 3591 (const char 
**)&e, 3592 &startinpos, 3593 &endinpos, 3594 &exc, 3595 (const char **)&q, 3596 &unicode, 3597 &outpos, 3598 &p)) 3599 goto onError; 3600 } 3601 /* remaining byte at the end? (size should be even) */ 3602 if (e == q) { 3603 if (!consumed) { 3604 errmsg = "truncated data"; 3605 startinpos = ((const char *)q) - starts; 3606 endinpos = ((const char *)e) + 1 - starts; 3607 outpos = p - PyUnicode_AS_UNICODE(unicode); 3608 if (unicode_decode_call_errorhandler( 3609 errors, 3610 &errorHandler, 3611 "utf16", errmsg, 3612 &starts, 3613 (const char **)&e, 3614 &startinpos, 3615 &endinpos, 3616 &exc, 3617 (const char **)&q, 3618 &unicode, 3619 &outpos, 3620 &p)) 3621 goto onError; 3622 /* The remaining input chars are ignored if the callback 3623 chooses to skip the input */ 3624 } 3625 } 3626 3627 if (byteorder) 3628 *byteorder = bo; 3629 3630 if (consumed) 3631 *consumed = (const char *)q-starts; 3632 3633 /* Adjust length */ 3634 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3635 goto onError; 3636 3637 Py_XDECREF(errorHandler); 3638 Py_XDECREF(exc); 3639 return (PyObject *)unicode; 3640 3641 onError: 3642 Py_DECREF(unicode); 3643 Py_XDECREF(errorHandler); 3644 Py_XDECREF(exc); 3645 return NULL; 3646} 3647 3648#undef FAST_CHAR_MASK 3649#undef SWAPPED_FAST_CHAR_MASK 3650 3651PyObject * 3652PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3653 Py_ssize_t size, 3654 const char *errors, 3655 int byteorder) 3656{ 3657 PyObject *v; 3658 unsigned char *p; 3659 Py_ssize_t nsize, bytesize; 3660#ifdef Py_UNICODE_WIDE 3661 Py_ssize_t i, pairs; 3662#else 3663 const int pairs = 0; 3664#endif 3665 /* Offsets from p for storing byte pairs in the right order. 
*/ 3666#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3667 int ihi = 1, ilo = 0; 3668#else 3669 int ihi = 0, ilo = 1; 3670#endif 3671 3672#define STORECHAR(CH) \ 3673 do { \ 3674 p[ihi] = ((CH) >> 8) & 0xff; \ 3675 p[ilo] = (CH) & 0xff; \ 3676 p += 2; \ 3677 } while(0) 3678 3679#ifdef Py_UNICODE_WIDE 3680 for (i = pairs = 0; i < size; i++) 3681 if (s[i] >= 0x10000) 3682 pairs++; 3683#endif 3684 /* 2 * (size + pairs + (byteorder == 0)) */ 3685 if (size > PY_SSIZE_T_MAX || 3686 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3687 return PyErr_NoMemory(); 3688 nsize = size + pairs + (byteorder == 0); 3689 bytesize = nsize * 2; 3690 if (bytesize / 2 != nsize) 3691 return PyErr_NoMemory(); 3692 v = PyBytes_FromStringAndSize(NULL, bytesize); 3693 if (v == NULL) 3694 return NULL; 3695 3696 p = (unsigned char *)PyBytes_AS_STRING(v); 3697 if (byteorder == 0) 3698 STORECHAR(0xFEFF); 3699 if (size == 0) 3700 goto done; 3701 3702 if (byteorder == -1) { 3703 /* force LE */ 3704 ihi = 1; 3705 ilo = 0; 3706 } 3707 else if (byteorder == 1) { 3708 /* force BE */ 3709 ihi = 0; 3710 ilo = 1; 3711 } 3712 3713 while (size-- > 0) { 3714 Py_UNICODE ch = *s++; 3715 Py_UNICODE ch2 = 0; 3716#ifdef Py_UNICODE_WIDE 3717 if (ch >= 0x10000) { 3718 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3719 ch = 0xD800 | ((ch-0x10000) >> 10); 3720 } 3721#endif 3722 STORECHAR(ch); 3723 if (ch2) 3724 STORECHAR(ch2); 3725 } 3726 3727 done: 3728 return v; 3729#undef STORECHAR 3730} 3731 3732PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3733{ 3734 if (!PyUnicode_Check(unicode)) { 3735 PyErr_BadArgument(); 3736 return NULL; 3737 } 3738 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3739 PyUnicode_GET_SIZE(unicode), 3740 NULL, 3741 0); 3742} 3743 3744/* --- Unicode Escape Codec ----------------------------------------------- */ 3745 3746static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3747 3748PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 3749 Py_ssize_t size, 3750 const char *errors) 3751{ 3752 
const char *starts = s; 3753 Py_ssize_t startinpos; 3754 Py_ssize_t endinpos; 3755 Py_ssize_t outpos; 3756 int i; 3757 PyUnicodeObject *v; 3758 Py_UNICODE *p; 3759 const char *end; 3760 char* message; 3761 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3762 PyObject *errorHandler = NULL; 3763 PyObject *exc = NULL; 3764 3765 /* Escaped strings will always be longer than the resulting 3766 Unicode string, so we start with size here and then reduce the 3767 length after conversion to the true value. 3768 (but if the error callback returns a long replacement string 3769 we'll have to allocate more space) */ 3770 v = _PyUnicode_New(size); 3771 if (v == NULL) 3772 goto onError; 3773 if (size == 0) 3774 return (PyObject *)v; 3775 3776 p = PyUnicode_AS_UNICODE(v); 3777 end = s + size; 3778 3779 while (s < end) { 3780 unsigned char c; 3781 Py_UNICODE x; 3782 int digits; 3783 3784 /* Non-escape characters are interpreted as Unicode ordinals */ 3785 if (*s != '\\') { 3786 *p++ = (unsigned char) *s++; 3787 continue; 3788 } 3789 3790 startinpos = s-starts; 3791 /* \ - Escapes */ 3792 s++; 3793 c = *s++; 3794 if (s > end) 3795 c = '\0'; /* Invalid after \ */ 3796 switch (c) { 3797 3798 /* \x escapes */ 3799 case '\n': break; 3800 case '\\': *p++ = '\\'; break; 3801 case '\'': *p++ = '\''; break; 3802 case '\"': *p++ = '\"'; break; 3803 case 'b': *p++ = '\b'; break; 3804 case 'f': *p++ = '\014'; break; /* FF */ 3805 case 't': *p++ = '\t'; break; 3806 case 'n': *p++ = '\n'; break; 3807 case 'r': *p++ = '\r'; break; 3808 case 'v': *p++ = '\013'; break; /* VT */ 3809 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3810 3811 /* \OOO (octal) escapes */ 3812 case '0': case '1': case '2': case '3': 3813 case '4': case '5': case '6': case '7': 3814 x = s[-1] - '0'; 3815 if (s < end && '0' <= *s && *s <= '7') { 3816 x = (x<<3) + *s++ - '0'; 3817 if (s < end && '0' <= *s && *s <= '7') 3818 x = (x<<3) + *s++ - '0'; 3819 } 3820 *p++ = x; 3821 break; 3822 3823 /* hex 
escapes */ 3824 /* \xXX */ 3825 case 'x': 3826 digits = 2; 3827 message = "truncated \\xXX escape"; 3828 goto hexescape; 3829 3830 /* \uXXXX */ 3831 case 'u': 3832 digits = 4; 3833 message = "truncated \\uXXXX escape"; 3834 goto hexescape; 3835 3836 /* \UXXXXXXXX */ 3837 case 'U': 3838 digits = 8; 3839 message = "truncated \\UXXXXXXXX escape"; 3840 hexescape: 3841 chr = 0; 3842 outpos = p-PyUnicode_AS_UNICODE(v); 3843 if (s+digits>end) { 3844 endinpos = size; 3845 if (unicode_decode_call_errorhandler( 3846 errors, &errorHandler, 3847 "unicodeescape", "end of string in escape sequence", 3848 &starts, &end, &startinpos, &endinpos, &exc, &s, 3849 &v, &outpos, &p)) 3850 goto onError; 3851 goto nextByte; 3852 } 3853 for (i = 0; i < digits; ++i) { 3854 c = (unsigned char) s[i]; 3855 if (!Py_ISXDIGIT(c)) { 3856 endinpos = (s+i+1)-starts; 3857 if (unicode_decode_call_errorhandler( 3858 errors, &errorHandler, 3859 "unicodeescape", message, 3860 &starts, &end, &startinpos, &endinpos, &exc, &s, 3861 &v, &outpos, &p)) 3862 goto onError; 3863 goto nextByte; 3864 } 3865 chr = (chr<<4) & ~0xF; 3866 if (c >= '0' && c <= '9') 3867 chr += c - '0'; 3868 else if (c >= 'a' && c <= 'f') 3869 chr += 10 + c - 'a'; 3870 else 3871 chr += 10 + c - 'A'; 3872 } 3873 s += i; 3874 if (chr == 0xffffffff && PyErr_Occurred()) 3875 /* _decoding_error will have already written into the 3876 target buffer. */ 3877 break; 3878 store: 3879 /* when we get here, chr is a 32-bit unicode character */ 3880 if (chr <= 0xffff) 3881 /* UCS-2 character */ 3882 *p++ = (Py_UNICODE) chr; 3883 else if (chr <= 0x10ffff) { 3884 /* UCS-4 character. Either store directly, or as 3885 surrogate pair. 
*/ 3886#ifdef Py_UNICODE_WIDE 3887 *p++ = chr; 3888#else 3889 chr -= 0x10000L; 3890 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3891 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3892#endif 3893 } else { 3894 endinpos = s-starts; 3895 outpos = p-PyUnicode_AS_UNICODE(v); 3896 if (unicode_decode_call_errorhandler( 3897 errors, &errorHandler, 3898 "unicodeescape", "illegal Unicode character", 3899 &starts, &end, &startinpos, &endinpos, &exc, &s, 3900 &v, &outpos, &p)) 3901 goto onError; 3902 } 3903 break; 3904 3905 /* \N{name} */ 3906 case 'N': 3907 message = "malformed \\N character escape"; 3908 if (ucnhash_CAPI == NULL) { 3909 /* load the unicode data module */ 3910 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3911 if (ucnhash_CAPI == NULL) 3912 goto ucnhashError; 3913 } 3914 if (*s == '{') { 3915 const char *start = s+1; 3916 /* look for the closing brace */ 3917 while (*s != '}' && s < end) 3918 s++; 3919 if (s > start && s < end && *s == '}') { 3920 /* found a name. 
look it up in the unicode database */ 3921 message = "unknown Unicode character name"; 3922 s++; 3923 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3924 goto store; 3925 } 3926 } 3927 endinpos = s-starts; 3928 outpos = p-PyUnicode_AS_UNICODE(v); 3929 if (unicode_decode_call_errorhandler( 3930 errors, &errorHandler, 3931 "unicodeescape", message, 3932 &starts, &end, &startinpos, &endinpos, &exc, &s, 3933 &v, &outpos, &p)) 3934 goto onError; 3935 break; 3936 3937 default: 3938 if (s > end) { 3939 message = "\\ at end of string"; 3940 s--; 3941 endinpos = s-starts; 3942 outpos = p-PyUnicode_AS_UNICODE(v); 3943 if (unicode_decode_call_errorhandler( 3944 errors, &errorHandler, 3945 "unicodeescape", message, 3946 &starts, &end, &startinpos, &endinpos, &exc, &s, 3947 &v, &outpos, &p)) 3948 goto onError; 3949 } 3950 else { 3951 *p++ = '\\'; 3952 *p++ = (unsigned char)s[-1]; 3953 } 3954 break; 3955 } 3956 nextByte: 3957 ; 3958 } 3959 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3960 goto onError; 3961 Py_XDECREF(errorHandler); 3962 Py_XDECREF(exc); 3963 return (PyObject *)v; 3964 3965 ucnhashError: 3966 PyErr_SetString( 3967 PyExc_UnicodeError, 3968 "\\N escapes not supported (can't load unicodedata module)" 3969 ); 3970 Py_XDECREF(v); 3971 Py_XDECREF(errorHandler); 3972 Py_XDECREF(exc); 3973 return NULL; 3974 3975 onError: 3976 Py_XDECREF(v); 3977 Py_XDECREF(errorHandler); 3978 Py_XDECREF(exc); 3979 return NULL; 3980} 3981 3982/* Return a Unicode-Escape string version of the Unicode object. 3983 3984 If quotes is true, the string is enclosed in u"" or u'' quotes as 3985 appropriate. 
3986 3987*/ 3988 3989Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3990 Py_ssize_t size, 3991 Py_UNICODE ch) 3992{ 3993 /* like wcschr, but doesn't stop at NULL characters */ 3994 3995 while (size-- > 0) { 3996 if (*s == ch) 3997 return s; 3998 s++; 3999 } 4000 4001 return NULL; 4002} 4003 4004static const char *hexdigits = "0123456789abcdef"; 4005 4006PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 4007 Py_ssize_t size) 4008{ 4009 PyObject *repr; 4010 char *p; 4011 4012#ifdef Py_UNICODE_WIDE 4013 const Py_ssize_t expandsize = 10; 4014#else 4015 const Py_ssize_t expandsize = 6; 4016#endif 4017 4018 /* XXX(nnorwitz): rather than over-allocating, it would be 4019 better to choose a different scheme. Perhaps scan the 4020 first N-chars of the string and allocate based on that size. 4021 */ 4022 /* Initial allocation is based on the longest-possible unichr 4023 escape. 4024 4025 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 4026 unichr, so in this case it's the longest unichr escape. In 4027 narrow (UTF-16) builds this is five chars per source unichr 4028 since there are two unichrs in the surrogate pair, so in narrow 4029 (UTF-16) builds it's not the longest unichr escape. 4030 4031 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 4032 so in the narrow (UTF-16) build case it's the longest unichr 4033 escape. 
4034 */ 4035 4036 if (size == 0) 4037 return PyBytes_FromStringAndSize(NULL, 0); 4038 4039 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 4040 return PyErr_NoMemory(); 4041 4042 repr = PyBytes_FromStringAndSize(NULL, 4043 2 4044 + expandsize*size 4045 + 1); 4046 if (repr == NULL) 4047 return NULL; 4048 4049 p = PyBytes_AS_STRING(repr); 4050 4051 while (size-- > 0) { 4052 Py_UNICODE ch = *s++; 4053 4054 /* Escape backslashes */ 4055 if (ch == '\\') { 4056 *p++ = '\\'; 4057 *p++ = (char) ch; 4058 continue; 4059 } 4060 4061#ifdef Py_UNICODE_WIDE 4062 /* Map 21-bit characters to '\U00xxxxxx' */ 4063 else if (ch >= 0x10000) { 4064 *p++ = '\\'; 4065 *p++ = 'U'; 4066 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 4067 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 4068 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 4069 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 4070 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 4071 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 4072 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 4073 *p++ = hexdigits[ch & 0x0000000F]; 4074 continue; 4075 } 4076#else 4077 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4078 else if (ch >= 0xD800 && ch < 0xDC00) { 4079 Py_UNICODE ch2; 4080 Py_UCS4 ucs; 4081 4082 ch2 = *s++; 4083 size--; 4084 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4085 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4086 *p++ = '\\'; 4087 *p++ = 'U'; 4088 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 4089 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 4090 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 4091 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 4092 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 4093 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 4094 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 4095 *p++ = hexdigits[ucs & 0x0000000F]; 4096 continue; 4097 } 4098 /* Fall through: isolated surrogates are copied as-is */ 4099 s--; 4100 size++; 4101 } 4102#endif 4103 4104 /* Map 16-bit characters to '\uxxxx' */ 4105 if (ch >= 256) { 4106 *p++ = '\\'; 4107 *p++ = 
'u'; 4108 *p++ = hexdigits[(ch >> 12) & 0x000F]; 4109 *p++ = hexdigits[(ch >> 8) & 0x000F]; 4110 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4111 *p++ = hexdigits[ch & 0x000F]; 4112 } 4113 4114 /* Map special whitespace to '\t', \n', '\r' */ 4115 else if (ch == '\t') { 4116 *p++ = '\\'; 4117 *p++ = 't'; 4118 } 4119 else if (ch == '\n') { 4120 *p++ = '\\'; 4121 *p++ = 'n'; 4122 } 4123 else if (ch == '\r') { 4124 *p++ = '\\'; 4125 *p++ = 'r'; 4126 } 4127 4128 /* Map non-printable US ASCII to '\xhh' */ 4129 else if (ch < ' ' || ch >= 0x7F) { 4130 *p++ = '\\'; 4131 *p++ = 'x'; 4132 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4133 *p++ = hexdigits[ch & 0x000F]; 4134 } 4135 4136 /* Copy everything else as-is */ 4137 else 4138 *p++ = (char) ch; 4139 } 4140 4141 assert(p - PyBytes_AS_STRING(repr) > 0); 4142 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 4143 return NULL; 4144 return repr; 4145} 4146 4147PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 4148{ 4149 PyObject *s; 4150 if (!PyUnicode_Check(unicode)) { 4151 PyErr_BadArgument(); 4152 return NULL; 4153 } 4154 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4155 PyUnicode_GET_SIZE(unicode)); 4156 return s; 4157} 4158 4159/* --- Raw Unicode Escape Codec ------------------------------------------- */ 4160 4161PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 4162 Py_ssize_t size, 4163 const char *errors) 4164{ 4165 const char *starts = s; 4166 Py_ssize_t startinpos; 4167 Py_ssize_t endinpos; 4168 Py_ssize_t outpos; 4169 PyUnicodeObject *v; 4170 Py_UNICODE *p; 4171 const char *end; 4172 const char *bs; 4173 PyObject *errorHandler = NULL; 4174 PyObject *exc = NULL; 4175 4176 /* Escaped strings will always be longer than the resulting 4177 Unicode string, so we start with size here and then reduce the 4178 length after conversion to the true value. 
(But decoding error 4179 handler might have to resize the string) */ 4180 v = _PyUnicode_New(size); 4181 if (v == NULL) 4182 goto onError; 4183 if (size == 0) 4184 return (PyObject *)v; 4185 p = PyUnicode_AS_UNICODE(v); 4186 end = s + size; 4187 while (s < end) { 4188 unsigned char c; 4189 Py_UCS4 x; 4190 int i; 4191 int count; 4192 4193 /* Non-escape characters are interpreted as Unicode ordinals */ 4194 if (*s != '\\') { 4195 *p++ = (unsigned char)*s++; 4196 continue; 4197 } 4198 startinpos = s-starts; 4199 4200 /* \u-escapes are only interpreted iff the number of leading 4201 backslashes if odd */ 4202 bs = s; 4203 for (;s < end;) { 4204 if (*s != '\\') 4205 break; 4206 *p++ = (unsigned char)*s++; 4207 } 4208 if (((s - bs) & 1) == 0 || 4209 s >= end || 4210 (*s != 'u' && *s != 'U')) { 4211 continue; 4212 } 4213 p--; 4214 count = *s=='u' ? 4 : 8; 4215 s++; 4216 4217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 4218 outpos = p-PyUnicode_AS_UNICODE(v); 4219 for (x = 0, i = 0; i < count; ++i, ++s) { 4220 c = (unsigned char)*s; 4221 if (!Py_ISXDIGIT(c)) { 4222 endinpos = s-starts; 4223 if (unicode_decode_call_errorhandler( 4224 errors, &errorHandler, 4225 "rawunicodeescape", "truncated \\uXXXX", 4226 &starts, &end, &startinpos, &endinpos, &exc, &s, 4227 &v, &outpos, &p)) 4228 goto onError; 4229 goto nextByte; 4230 } 4231 x = (x<<4) & ~0xF; 4232 if (c >= '0' && c <= '9') 4233 x += c - '0'; 4234 else if (c >= 'a' && c <= 'f') 4235 x += 10 + c - 'a'; 4236 else 4237 x += 10 + c - 'A'; 4238 } 4239 if (x <= 0xffff) 4240 /* UCS-2 character */ 4241 *p++ = (Py_UNICODE) x; 4242 else if (x <= 0x10ffff) { 4243 /* UCS-4 character. Either store directly, or as 4244 surrogate pair. 
*/ 4245#ifdef Py_UNICODE_WIDE 4246 *p++ = (Py_UNICODE) x; 4247#else 4248 x -= 0x10000L; 4249 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 4250 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 4251#endif 4252 } else { 4253 endinpos = s-starts; 4254 outpos = p-PyUnicode_AS_UNICODE(v); 4255 if (unicode_decode_call_errorhandler( 4256 errors, &errorHandler, 4257 "rawunicodeescape", "\\Uxxxxxxxx out of range", 4258 &starts, &end, &startinpos, &endinpos, &exc, &s, 4259 &v, &outpos, &p)) 4260 goto onError; 4261 } 4262 nextByte: 4263 ; 4264 } 4265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4266 goto onError; 4267 Py_XDECREF(errorHandler); 4268 Py_XDECREF(exc); 4269 return (PyObject *)v; 4270 4271 onError: 4272 Py_XDECREF(v); 4273 Py_XDECREF(errorHandler); 4274 Py_XDECREF(exc); 4275 return NULL; 4276} 4277 4278PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 4279 Py_ssize_t size) 4280{ 4281 PyObject *repr; 4282 char *p; 4283 char *q; 4284 4285#ifdef Py_UNICODE_WIDE 4286 const Py_ssize_t expandsize = 10; 4287#else 4288 const Py_ssize_t expandsize = 6; 4289#endif 4290 4291 if (size > PY_SSIZE_T_MAX / expandsize) 4292 return PyErr_NoMemory(); 4293 4294 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 4295 if (repr == NULL) 4296 return NULL; 4297 if (size == 0) 4298 return repr; 4299 4300 p = q = PyBytes_AS_STRING(repr); 4301 while (size-- > 0) { 4302 Py_UNICODE ch = *s++; 4303#ifdef Py_UNICODE_WIDE 4304 /* Map 32-bit characters to '\Uxxxxxxxx' */ 4305 if (ch >= 0x10000) { 4306 *p++ = '\\'; 4307 *p++ = 'U'; 4308 *p++ = hexdigits[(ch >> 28) & 0xf]; 4309 *p++ = hexdigits[(ch >> 24) & 0xf]; 4310 *p++ = hexdigits[(ch >> 20) & 0xf]; 4311 *p++ = hexdigits[(ch >> 16) & 0xf]; 4312 *p++ = hexdigits[(ch >> 12) & 0xf]; 4313 *p++ = hexdigits[(ch >> 8) & 0xf]; 4314 *p++ = hexdigits[(ch >> 4) & 0xf]; 4315 *p++ = hexdigits[ch & 15]; 4316 } 4317 else 4318#else 4319 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4320 if (ch >= 0xD800 && ch < 0xDC00) { 4321 
Py_UNICODE ch2; 4322 Py_UCS4 ucs; 4323 4324 ch2 = *s++; 4325 size--; 4326 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4327 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4328 *p++ = '\\'; 4329 *p++ = 'U'; 4330 *p++ = hexdigits[(ucs >> 28) & 0xf]; 4331 *p++ = hexdigits[(ucs >> 24) & 0xf]; 4332 *p++ = hexdigits[(ucs >> 20) & 0xf]; 4333 *p++ = hexdigits[(ucs >> 16) & 0xf]; 4334 *p++ = hexdigits[(ucs >> 12) & 0xf]; 4335 *p++ = hexdigits[(ucs >> 8) & 0xf]; 4336 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4337 *p++ = hexdigits[ucs & 0xf]; 4338 continue; 4339 } 4340 /* Fall through: isolated surrogates are copied as-is */ 4341 s--; 4342 size++; 4343 } 4344#endif 4345 /* Map 16-bit characters to '\uxxxx' */ 4346 if (ch >= 256) { 4347 *p++ = '\\'; 4348 *p++ = 'u'; 4349 *p++ = hexdigits[(ch >> 12) & 0xf]; 4350 *p++ = hexdigits[(ch >> 8) & 0xf]; 4351 *p++ = hexdigits[(ch >> 4) & 0xf]; 4352 *p++ = hexdigits[ch & 15]; 4353 } 4354 /* Copy everything else as-is */ 4355 else 4356 *p++ = (char) ch; 4357 } 4358 size = p - q; 4359 4360 assert(size > 0); 4361 if (_PyBytes_Resize(&repr, size) < 0) 4362 return NULL; 4363 return repr; 4364} 4365 4366PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4367{ 4368 PyObject *s; 4369 if (!PyUnicode_Check(unicode)) { 4370 PyErr_BadArgument(); 4371 return NULL; 4372 } 4373 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4374 PyUnicode_GET_SIZE(unicode)); 4375 4376 return s; 4377} 4378 4379/* --- Unicode Internal Codec ------------------------------------------- */ 4380 4381PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 4382 Py_ssize_t size, 4383 const char *errors) 4384{ 4385 const char *starts = s; 4386 Py_ssize_t startinpos; 4387 Py_ssize_t endinpos; 4388 Py_ssize_t outpos; 4389 PyUnicodeObject *v; 4390 Py_UNICODE *p; 4391 const char *end; 4392 const char *reason; 4393 PyObject *errorHandler = NULL; 4394 PyObject *exc = NULL; 4395 4396#ifdef Py_UNICODE_WIDE 4397 Py_UNICODE unimax = 
PyUnicode_GetMax(); 4398#endif 4399 4400 /* XXX overflow detection missing */ 4401 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4402 if (v == NULL) 4403 goto onError; 4404 if (PyUnicode_GetSize((PyObject *)v) == 0) 4405 return (PyObject *)v; 4406 p = PyUnicode_AS_UNICODE(v); 4407 end = s + size; 4408 4409 while (s < end) { 4410 memcpy(p, s, sizeof(Py_UNICODE)); 4411 /* We have to sanity check the raw data, otherwise doom looms for 4412 some malformed UCS-4 data. */ 4413 if ( 4414#ifdef Py_UNICODE_WIDE 4415 *p > unimax || *p < 0 || 4416#endif 4417 end-s < Py_UNICODE_SIZE 4418 ) 4419 { 4420 startinpos = s - starts; 4421 if (end-s < Py_UNICODE_SIZE) { 4422 endinpos = end-starts; 4423 reason = "truncated input"; 4424 } 4425 else { 4426 endinpos = s - starts + Py_UNICODE_SIZE; 4427 reason = "illegal code point (> 0x10FFFF)"; 4428 } 4429 outpos = p - PyUnicode_AS_UNICODE(v); 4430 if (unicode_decode_call_errorhandler( 4431 errors, &errorHandler, 4432 "unicode_internal", reason, 4433 &starts, &end, &startinpos, &endinpos, &exc, &s, 4434 &v, &outpos, &p)) { 4435 goto onError; 4436 } 4437 } 4438 else { 4439 p++; 4440 s += Py_UNICODE_SIZE; 4441 } 4442 } 4443 4444 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4445 goto onError; 4446 Py_XDECREF(errorHandler); 4447 Py_XDECREF(exc); 4448 return (PyObject *)v; 4449 4450 onError: 4451 Py_XDECREF(v); 4452 Py_XDECREF(errorHandler); 4453 Py_XDECREF(exc); 4454 return NULL; 4455} 4456 4457/* --- Latin-1 Codec ------------------------------------------------------ */ 4458 4459PyObject *PyUnicode_DecodeLatin1(const char *s, 4460 Py_ssize_t size, 4461 const char *errors) 4462{ 4463 PyUnicodeObject *v; 4464 Py_UNICODE *p; 4465 const char *e, *unrolled_end; 4466 4467 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 4468 if (size == 1) { 4469 Py_UNICODE r = *(unsigned char*)s; 4470 return PyUnicode_FromUnicode(&r, 1); 4471 } 4472 4473 v = _PyUnicode_New(size); 4474 if (v == NULL) 4475 goto onError; 4476 if (size == 0) 4477 return (PyObject *)v; 4478 p = PyUnicode_AS_UNICODE(v); 4479 e = s + size; 4480 /* Unrolling the copy makes it much faster by reducing the looping 4481 overhead. This is similar to what many memcpy() implementations do. */ 4482 unrolled_end = e - 4; 4483 while (s < unrolled_end) { 4484 p[0] = (unsigned char) s[0]; 4485 p[1] = (unsigned char) s[1]; 4486 p[2] = (unsigned char) s[2]; 4487 p[3] = (unsigned char) s[3]; 4488 s += 4; 4489 p += 4; 4490 } 4491 while (s < e) 4492 *p++ = (unsigned char) *s++; 4493 return (PyObject *)v; 4494 4495 onError: 4496 Py_XDECREF(v); 4497 return NULL; 4498} 4499 4500/* create or adjust a UnicodeEncodeError */ 4501static void make_encode_exception(PyObject **exceptionObject, 4502 const char *encoding, 4503 const Py_UNICODE *unicode, Py_ssize_t size, 4504 Py_ssize_t startpos, Py_ssize_t endpos, 4505 const char *reason) 4506{ 4507 if (*exceptionObject == NULL) { 4508 *exceptionObject = PyUnicodeEncodeError_Create( 4509 encoding, unicode, size, startpos, endpos, reason); 4510 } 4511 else { 4512 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4513 goto onError; 4514 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4515 goto onError; 4516 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4517 goto onError; 4518 return; 4519 onError: 4520 Py_DECREF(*exceptionObject); 4521 *exceptionObject = NULL; 4522 } 4523} 4524 4525/* raises a UnicodeEncodeError */ 4526static void raise_encode_exception(PyObject **exceptionObject, 4527 const char *encoding, 4528 const Py_UNICODE *unicode, Py_ssize_t size, 4529 Py_ssize_t startpos, Py_ssize_t endpos, 4530 const char *reason) 4531{ 4532 make_encode_exception(exceptionObject, 4533 encoding, unicode, size, startpos, endpos, reason); 4534 if (*exceptionObject != 
NULL) 4535 PyCodec_StrictErrors(*exceptionObject); 4536} 4537 4538/* error handling callback helper: 4539 build arguments, call the callback and check the arguments, 4540 put the result into newpos and return the replacement string, which 4541 has to be freed by the caller */ 4542static PyObject *unicode_encode_call_errorhandler(const char *errors, 4543 PyObject **errorHandler, 4544 const char *encoding, const char *reason, 4545 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4546 Py_ssize_t startpos, Py_ssize_t endpos, 4547 Py_ssize_t *newpos) 4548{ 4549 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4550 4551 PyObject *restuple; 4552 PyObject *resunicode; 4553 4554 if (*errorHandler == NULL) { 4555 *errorHandler = PyCodec_LookupError(errors); 4556 if (*errorHandler == NULL) 4557 return NULL; 4558 } 4559 4560 make_encode_exception(exceptionObject, 4561 encoding, unicode, size, startpos, endpos, reason); 4562 if (*exceptionObject == NULL) 4563 return NULL; 4564 4565 restuple = PyObject_CallFunctionObjArgs( 4566 *errorHandler, *exceptionObject, NULL); 4567 if (restuple == NULL) 4568 return NULL; 4569 if (!PyTuple_Check(restuple)) { 4570 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4571 Py_DECREF(restuple); 4572 return NULL; 4573 } 4574 if (!PyArg_ParseTuple(restuple, argparse, 4575 &resunicode, newpos)) { 4576 Py_DECREF(restuple); 4577 return NULL; 4578 } 4579 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4580 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4581 Py_DECREF(restuple); 4582 return NULL; 4583 } 4584 if (*newpos<0) 4585 *newpos = size+*newpos; 4586 if (*newpos<0 || *newpos>size) { 4587 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4588 Py_DECREF(restuple); 4589 return NULL; 4590 } 4591 Py_INCREF(resunicode); 4592 Py_DECREF(restuple); 4593 return resunicode; 4594} 4595 4596static PyObject *unicode_encode_ucs1(const Py_UNICODE 
*p,
                                     Py_ssize_t size,
                                     const char *errors,
                                     int limit)
{
    /* Shared encoder for the Latin-1 (limit == 256) and ASCII (any
       other limit) codecs: code units below limit are copied as single
       bytes, runs of larger code units go through the error handler
       named by errors. */

    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* current allocated size of the output buffer */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable code unit */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p):
                   "&#" + decimal digits + ";" for each code unit */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* bytes written so far + replacement + one byte for each
                   input unit that is still to be processed */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow at least geometrically */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    /* the resize may have moved the buffer */
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                /* any other handler name: call the registered callback */
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        respos = str - PyBytes_AS_STRING(res);
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    /* resume at the position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the error is reported at the first character
                           of the original unencodable run */
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this fails res will be NULL */
        assert(size >= 0);
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
Py_XDECREF(exc); 4783 return res; 4784 4785 onError: 4786 Py_XDECREF(res); 4787 Py_XDECREF(errorHandler); 4788 Py_XDECREF(exc); 4789 return NULL; 4790} 4791 4792PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4793 Py_ssize_t size, 4794 const char *errors) 4795{ 4796 return unicode_encode_ucs1(p, size, errors, 256); 4797} 4798 4799PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 4800{ 4801 if (!PyUnicode_Check(unicode)) { 4802 PyErr_BadArgument(); 4803 return NULL; 4804 } 4805 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4806 PyUnicode_GET_SIZE(unicode), 4807 NULL); 4808} 4809 4810/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4811 4812PyObject *PyUnicode_DecodeASCII(const char *s, 4813 Py_ssize_t size, 4814 const char *errors) 4815{ 4816 const char *starts = s; 4817 PyUnicodeObject *v; 4818 Py_UNICODE *p; 4819 Py_ssize_t startinpos; 4820 Py_ssize_t endinpos; 4821 Py_ssize_t outpos; 4822 const char *e; 4823 PyObject *errorHandler = NULL; 4824 PyObject *exc = NULL; 4825 4826 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4827 if (size == 1 && *(unsigned char*)s < 128) { 4828 Py_UNICODE r = *(unsigned char*)s; 4829 return PyUnicode_FromUnicode(&r, 1); 4830 } 4831 4832 v = _PyUnicode_New(size); 4833 if (v == NULL) 4834 goto onError; 4835 if (size == 0) 4836 return (PyObject *)v; 4837 p = PyUnicode_AS_UNICODE(v); 4838 e = s + size; 4839 while (s < e) { 4840 register unsigned char c = (unsigned char)*s; 4841 if (c < 128) { 4842 *p++ = c; 4843 ++s; 4844 } 4845 else { 4846 startinpos = s-starts; 4847 endinpos = startinpos + 1; 4848 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4849 if (unicode_decode_call_errorhandler( 4850 errors, &errorHandler, 4851 "ascii", "ordinal not in range(128)", 4852 &starts, &e, &startinpos, &endinpos, &exc, &s, 4853 &v, &outpos, &p)) 4854 goto onError; 4855 } 4856 } 4857 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4858 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4859 goto onError; 4860 Py_XDECREF(errorHandler); 4861 Py_XDECREF(exc); 4862 return (PyObject *)v; 4863 4864 onError: 4865 Py_XDECREF(v); 4866 Py_XDECREF(errorHandler); 4867 Py_XDECREF(exc); 4868 return NULL; 4869} 4870 4871PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 4872 Py_ssize_t size, 4873 const char *errors) 4874{ 4875 return unicode_encode_ucs1(p, size, errors, 128); 4876} 4877 4878PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 4879{ 4880 if (!PyUnicode_Check(unicode)) { 4881 PyErr_BadArgument(); 4882 return NULL; 4883 } 4884 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4885 PyUnicode_GET_SIZE(unicode), 4886 NULL); 4887} 4888 4889#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4890 4891/* --- MBCS codecs for Windows -------------------------------------------- */ 4892 4893#if SIZEOF_INT < SIZEOF_SIZE_T 4894#define NEED_RETRY 4895#endif 4896 4897/* XXX This code is limited to "true" double-byte encodings, as 4898 a) it assumes an incomplete character consists of a single byte, and 4899 b) IsDBCSLeadByte (probably) does 
not work for non-DBCS multi-byte 4900 encodings, see IsDBCSLeadByteEx documentation. */ 4901 4902static int is_dbcs_lead_byte(const char *s, int offset) 4903{ 4904 const char *curr = s + offset; 4905 4906 if (IsDBCSLeadByte(*curr)) { 4907 const char *prev = CharPrev(s, curr); 4908 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4909 } 4910 return 0; 4911} 4912 4913/* 4914 * Decode MBCS string into unicode object. If 'final' is set, converts 4915 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4916 */ 4917static int decode_mbcs(PyUnicodeObject **v, 4918 const char *s, /* MBCS string */ 4919 int size, /* sizeof MBCS string */ 4920 int final, 4921 const char *errors) 4922{ 4923 Py_UNICODE *p; 4924 Py_ssize_t n; 4925 DWORD usize; 4926 DWORD flags; 4927 4928 assert(size >= 0); 4929 4930 /* check and handle 'errors' arg */ 4931 if (errors==NULL || strcmp(errors, "strict")==0) 4932 flags = MB_ERR_INVALID_CHARS; 4933 else if (strcmp(errors, "ignore")==0) 4934 flags = 0; 4935 else { 4936 PyErr_Format(PyExc_ValueError, 4937 "mbcs encoding does not support errors='%s'", 4938 errors); 4939 return -1; 4940 } 4941 4942 /* Skip trailing lead-byte unless 'final' is set */ 4943 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4944 --size; 4945 4946 /* First get the size of the result */ 4947 if (size > 0) { 4948 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 4949 if (usize==0) 4950 goto mbcs_decode_error; 4951 } else 4952 usize = 0; 4953 4954 if (*v == NULL) { 4955 /* Create unicode object */ 4956 *v = _PyUnicode_New(usize); 4957 if (*v == NULL) 4958 return -1; 4959 n = 0; 4960 } 4961 else { 4962 /* Extend unicode object */ 4963 n = PyUnicode_GET_SIZE(*v); 4964 if (_PyUnicode_Resize(v, n + usize) < 0) 4965 return -1; 4966 } 4967 4968 /* Do the conversion */ 4969 if (usize > 0) { 4970 p = PyUnicode_AS_UNICODE(*v) + n; 4971 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 4972 goto 
mbcs_decode_error; 4973 } 4974 } 4975 return size; 4976 4977mbcs_decode_error: 4978 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 4979 we raise a UnicodeDecodeError - else it is a 'generic' 4980 windows error 4981 */ 4982 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 4983 /* Ideally, we should get reason from FormatMessage - this 4984 is the Windows 2000 English version of the message 4985 */ 4986 PyObject *exc = NULL; 4987 const char *reason = "No mapping for the Unicode character exists " 4988 "in the target multi-byte code page."; 4989 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 4990 if (exc != NULL) { 4991 PyCodec_StrictErrors(exc); 4992 Py_DECREF(exc); 4993 } 4994 } else { 4995 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4996 } 4997 return -1; 4998} 4999 5000PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 5001 Py_ssize_t size, 5002 const char *errors, 5003 Py_ssize_t *consumed) 5004{ 5005 PyUnicodeObject *v = NULL; 5006 int done; 5007 5008 if (consumed) 5009 *consumed = 0; 5010 5011#ifdef NEED_RETRY 5012 retry: 5013 if (size > INT_MAX) 5014 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 5015 else 5016#endif 5017 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 5018 5019 if (done < 0) { 5020 Py_XDECREF(v); 5021 return NULL; 5022 } 5023 5024 if (consumed) 5025 *consumed += done; 5026 5027#ifdef NEED_RETRY 5028 if (size > INT_MAX) { 5029 s += done; 5030 size -= done; 5031 goto retry; 5032 } 5033#endif 5034 5035 return (PyObject *)v; 5036} 5037 5038PyObject *PyUnicode_DecodeMBCS(const char *s, 5039 Py_ssize_t size, 5040 const char *errors) 5041{ 5042 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 5043} 5044 5045/* 5046 * Convert unicode into string object (MBCS). 5047 * Returns 0 if succeed, -1 otherwise. 
5048 */ 5049static int encode_mbcs(PyObject **repr, 5050 const Py_UNICODE *p, /* unicode */ 5051 int size, /* size of unicode */ 5052 const char* errors) 5053{ 5054 BOOL usedDefaultChar = FALSE; 5055 BOOL *pusedDefaultChar; 5056 int mbcssize; 5057 Py_ssize_t n; 5058 PyObject *exc = NULL; 5059 DWORD flags; 5060 5061 assert(size >= 0); 5062 5063 /* check and handle 'errors' arg */ 5064 if (errors==NULL || strcmp(errors, "strict")==0) { 5065 flags = WC_NO_BEST_FIT_CHARS; 5066 pusedDefaultChar = &usedDefaultChar; 5067 } else if (strcmp(errors, "replace")==0) { 5068 flags = 0; 5069 pusedDefaultChar = NULL; 5070 } else { 5071 PyErr_Format(PyExc_ValueError, 5072 "mbcs encoding does not support errors='%s'", 5073 errors); 5074 return -1; 5075 } 5076 5077 /* First get the size of the result */ 5078 if (size > 0) { 5079 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 5080 NULL, pusedDefaultChar); 5081 if (mbcssize == 0) { 5082 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5083 return -1; 5084 } 5085 /* If we used a default char, then we failed! 
*/ 5086 if (pusedDefaultChar && *pusedDefaultChar) 5087 goto mbcs_encode_error; 5088 } else { 5089 mbcssize = 0; 5090 } 5091 5092 if (*repr == NULL) { 5093 /* Create string object */ 5094 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 5095 if (*repr == NULL) 5096 return -1; 5097 n = 0; 5098 } 5099 else { 5100 /* Extend string object */ 5101 n = PyBytes_Size(*repr); 5102 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 5103 return -1; 5104 } 5105 5106 /* Do the conversion */ 5107 if (size > 0) { 5108 char *s = PyBytes_AS_STRING(*repr) + n; 5109 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 5110 NULL, pusedDefaultChar)) { 5111 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5112 return -1; 5113 } 5114 if (pusedDefaultChar && *pusedDefaultChar) 5115 goto mbcs_encode_error; 5116 } 5117 return 0; 5118 5119mbcs_encode_error: 5120 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 5121 Py_XDECREF(exc); 5122 return -1; 5123} 5124 5125PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 5126 Py_ssize_t size, 5127 const char *errors) 5128{ 5129 PyObject *repr = NULL; 5130 int ret; 5131 5132#ifdef NEED_RETRY 5133 retry: 5134 if (size > INT_MAX) 5135 ret = encode_mbcs(&repr, p, INT_MAX, errors); 5136 else 5137#endif 5138 ret = encode_mbcs(&repr, p, (int)size, errors); 5139 5140 if (ret < 0) { 5141 Py_XDECREF(repr); 5142 return NULL; 5143 } 5144 5145#ifdef NEED_RETRY 5146 if (size > INT_MAX) { 5147 p += INT_MAX; 5148 size -= INT_MAX; 5149 goto retry; 5150 } 5151#endif 5152 5153 return repr; 5154} 5155 5156PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 5157{ 5158 if (!PyUnicode_Check(unicode)) { 5159 PyErr_BadArgument(); 5160 return NULL; 5161 } 5162 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 5163 PyUnicode_GET_SIZE(unicode), 5164 NULL); 5165} 5166 5167#undef NEED_RETRY 5168 5169#endif /* MS_WINDOWS */ 5170 5171/* --- Character Mapping Codec -------------------------------------------- */ 5172 5173PyObject 
*PyUnicode_DecodeCharmap(const char *s, 5174 Py_ssize_t size, 5175 PyObject *mapping, 5176 const char *errors) 5177{ 5178 const char *starts = s; 5179 Py_ssize_t startinpos; 5180 Py_ssize_t endinpos; 5181 Py_ssize_t outpos; 5182 const char *e; 5183 PyUnicodeObject *v; 5184 Py_UNICODE *p; 5185 Py_ssize_t extrachars = 0; 5186 PyObject *errorHandler = NULL; 5187 PyObject *exc = NULL; 5188 Py_UNICODE *mapstring = NULL; 5189 Py_ssize_t maplen = 0; 5190 5191 /* Default to Latin-1 */ 5192 if (mapping == NULL) 5193 return PyUnicode_DecodeLatin1(s, size, errors); 5194 5195 v = _PyUnicode_New(size); 5196 if (v == NULL) 5197 goto onError; 5198 if (size == 0) 5199 return (PyObject *)v; 5200 p = PyUnicode_AS_UNICODE(v); 5201 e = s + size; 5202 if (PyUnicode_CheckExact(mapping)) { 5203 mapstring = PyUnicode_AS_UNICODE(mapping); 5204 maplen = PyUnicode_GET_SIZE(mapping); 5205 while (s < e) { 5206 unsigned char ch = *s; 5207 Py_UNICODE x = 0xfffe; /* illegal value */ 5208 5209 if (ch < maplen) 5210 x = mapstring[ch]; 5211 5212 if (x == 0xfffe) { 5213 /* undefined mapping */ 5214 outpos = p-PyUnicode_AS_UNICODE(v); 5215 startinpos = s-starts; 5216 endinpos = startinpos+1; 5217 if (unicode_decode_call_errorhandler( 5218 errors, &errorHandler, 5219 "charmap", "character maps to <undefined>", 5220 &starts, &e, &startinpos, &endinpos, &exc, &s, 5221 &v, &outpos, &p)) { 5222 goto onError; 5223 } 5224 continue; 5225 } 5226 *p++ = x; 5227 ++s; 5228 } 5229 } 5230 else { 5231 while (s < e) { 5232 unsigned char ch = *s; 5233 PyObject *w, *x; 5234 5235 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 5236 w = PyLong_FromLong((long)ch); 5237 if (w == NULL) 5238 goto onError; 5239 x = PyObject_GetItem(mapping, w); 5240 Py_DECREF(w); 5241 if (x == NULL) { 5242 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5243 /* No mapping found means: mapping is undefined. 
*/ 5244 PyErr_Clear(); 5245 x = Py_None; 5246 Py_INCREF(x); 5247 } else 5248 goto onError; 5249 } 5250 5251 /* Apply mapping */ 5252 if (PyLong_Check(x)) { 5253 long value = PyLong_AS_LONG(x); 5254 if (value < 0 || value > 65535) { 5255 PyErr_SetString(PyExc_TypeError, 5256 "character mapping must be in range(65536)"); 5257 Py_DECREF(x); 5258 goto onError; 5259 } 5260 *p++ = (Py_UNICODE)value; 5261 } 5262 else if (x == Py_None) { 5263 /* undefined mapping */ 5264 outpos = p-PyUnicode_AS_UNICODE(v); 5265 startinpos = s-starts; 5266 endinpos = startinpos+1; 5267 if (unicode_decode_call_errorhandler( 5268 errors, &errorHandler, 5269 "charmap", "character maps to <undefined>", 5270 &starts, &e, &startinpos, &endinpos, &exc, &s, 5271 &v, &outpos, &p)) { 5272 Py_DECREF(x); 5273 goto onError; 5274 } 5275 Py_DECREF(x); 5276 continue; 5277 } 5278 else if (PyUnicode_Check(x)) { 5279 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 5280 5281 if (targetsize == 1) 5282 /* 1-1 mapping */ 5283 *p++ = *PyUnicode_AS_UNICODE(x); 5284 5285 else if (targetsize > 1) { 5286 /* 1-n mapping */ 5287 if (targetsize > extrachars) { 5288 /* resize first */ 5289 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 5290 Py_ssize_t needed = (targetsize - extrachars) + \ 5291 (targetsize << 2); 5292 extrachars += needed; 5293 /* XXX overflow detection missing */ 5294 if (_PyUnicode_Resize(&v, 5295 PyUnicode_GET_SIZE(v) + needed) < 0) { 5296 Py_DECREF(x); 5297 goto onError; 5298 } 5299 p = PyUnicode_AS_UNICODE(v) + oldpos; 5300 } 5301 Py_UNICODE_COPY(p, 5302 PyUnicode_AS_UNICODE(x), 5303 targetsize); 5304 p += targetsize; 5305 extrachars -= targetsize; 5306 } 5307 /* 1-0 mapping: skip the character */ 5308 } 5309 else { 5310 /* wrong return value */ 5311 PyErr_SetString(PyExc_TypeError, 5312 "character mapping must return integer, None or str"); 5313 Py_DECREF(x); 5314 goto onError; 5315 } 5316 Py_DECREF(x); 5317 ++s; 5318 } 5319 } 5320 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 5321 if 
(_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5322 goto onError; 5323 Py_XDECREF(errorHandler); 5324 Py_XDECREF(exc); 5325 return (PyObject *)v; 5326 5327 onError: 5328 Py_XDECREF(errorHandler); 5329 Py_XDECREF(exc); 5330 Py_XDECREF(v); 5331 return NULL; 5332} 5333 5334/* Charmap encoding: the lookup table */ 5335 5336struct encoding_map{ 5337 PyObject_HEAD 5338 unsigned char level1[32]; 5339 int count2, count3; 5340 unsigned char level23[1]; 5341}; 5342 5343static PyObject* 5344encoding_map_size(PyObject *obj, PyObject* args) 5345{ 5346 struct encoding_map *map = (struct encoding_map*)obj; 5347 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5348 128*map->count3); 5349} 5350 5351static PyMethodDef encoding_map_methods[] = { 5352 {"size", encoding_map_size, METH_NOARGS, 5353 PyDoc_STR("Return the size (in bytes) of this object") }, 5354 { 0 } 5355}; 5356 5357static void 5358encoding_map_dealloc(PyObject* o) 5359{ 5360 PyObject_FREE(o); 5361} 5362 5363static PyTypeObject EncodingMapType = { 5364 PyVarObject_HEAD_INIT(NULL, 0) 5365 "EncodingMap", /*tp_name*/ 5366 sizeof(struct encoding_map), /*tp_basicsize*/ 5367 0, /*tp_itemsize*/ 5368 /* methods */ 5369 encoding_map_dealloc, /*tp_dealloc*/ 5370 0, /*tp_print*/ 5371 0, /*tp_getattr*/ 5372 0, /*tp_setattr*/ 5373 0, /*tp_reserved*/ 5374 0, /*tp_repr*/ 5375 0, /*tp_as_number*/ 5376 0, /*tp_as_sequence*/ 5377 0, /*tp_as_mapping*/ 5378 0, /*tp_hash*/ 5379 0, /*tp_call*/ 5380 0, /*tp_str*/ 5381 0, /*tp_getattro*/ 5382 0, /*tp_setattro*/ 5383 0, /*tp_as_buffer*/ 5384 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5385 0, /*tp_doc*/ 5386 0, /*tp_traverse*/ 5387 0, /*tp_clear*/ 5388 0, /*tp_richcompare*/ 5389 0, /*tp_weaklistoffset*/ 5390 0, /*tp_iter*/ 5391 0, /*tp_iternext*/ 5392 encoding_map_methods, /*tp_methods*/ 5393 0, /*tp_members*/ 5394 0, /*tp_getset*/ 5395 0, /*tp_base*/ 5396 0, /*tp_dict*/ 5397 0, /*tp_descr_get*/ 5398 0, /*tp_descr_set*/ 5399 0, /*tp_dictoffset*/ 5400 0, /*tp_init*/ 5401 0, /*tp_alloc*/ 
5402 0, /*tp_new*/ 5403 0, /*tp_free*/ 5404 0, /*tp_is_gc*/ 5405}; 5406 5407PyObject* 5408PyUnicode_BuildEncodingMap(PyObject* string) 5409{ 5410 Py_UNICODE *decode; 5411 PyObject *result; 5412 struct encoding_map *mresult; 5413 int i; 5414 int need_dict = 0; 5415 unsigned char level1[32]; 5416 unsigned char level2[512]; 5417 unsigned char *mlevel1, *mlevel2, *mlevel3; 5418 int count2 = 0, count3 = 0; 5419 5420 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5421 PyErr_BadArgument(); 5422 return NULL; 5423 } 5424 decode = PyUnicode_AS_UNICODE(string); 5425 memset(level1, 0xFF, sizeof level1); 5426 memset(level2, 0xFF, sizeof level2); 5427 5428 /* If there isn't a one-to-one mapping of NULL to \0, 5429 or if there are non-BMP characters, we need to use 5430 a mapping dictionary. */ 5431 if (decode[0] != 0) 5432 need_dict = 1; 5433 for (i = 1; i < 256; i++) { 5434 int l1, l2; 5435 if (decode[i] == 0 5436#ifdef Py_UNICODE_WIDE 5437 || decode[i] > 0xFFFF 5438#endif 5439 ) { 5440 need_dict = 1; 5441 break; 5442 } 5443 if (decode[i] == 0xFFFE) 5444 /* unmapped character */ 5445 continue; 5446 l1 = decode[i] >> 11; 5447 l2 = decode[i] >> 7; 5448 if (level1[l1] == 0xFF) 5449 level1[l1] = count2++; 5450 if (level2[l2] == 0xFF) 5451 level2[l2] = count3++; 5452 } 5453 5454 if (count2 >= 0xFF || count3 >= 0xFF) 5455 need_dict = 1; 5456 5457 if (need_dict) { 5458 PyObject *result = PyDict_New(); 5459 PyObject *key, *value; 5460 if (!result) 5461 return NULL; 5462 for (i = 0; i < 256; i++) { 5463 key = value = NULL; 5464 key = PyLong_FromLong(decode[i]); 5465 value = PyLong_FromLong(i); 5466 if (!key || !value) 5467 goto failed1; 5468 if (PyDict_SetItem(result, key, value) == -1) 5469 goto failed1; 5470 Py_DECREF(key); 5471 Py_DECREF(value); 5472 } 5473 return result; 5474 failed1: 5475 Py_XDECREF(key); 5476 Py_XDECREF(value); 5477 Py_DECREF(result); 5478 return NULL; 5479 } 5480 5481 /* Create a three-level trie */ 5482 result = PyObject_MALLOC(sizeof(struct 
encoding_map) + 5483 16*count2 + 128*count3 - 1); 5484 if (!result) 5485 return PyErr_NoMemory(); 5486 PyObject_Init(result, &EncodingMapType); 5487 mresult = (struct encoding_map*)result; 5488 mresult->count2 = count2; 5489 mresult->count3 = count3; 5490 mlevel1 = mresult->level1; 5491 mlevel2 = mresult->level23; 5492 mlevel3 = mresult->level23 + 16*count2; 5493 memcpy(mlevel1, level1, 32); 5494 memset(mlevel2, 0xFF, 16*count2); 5495 memset(mlevel3, 0, 128*count3); 5496 count3 = 0; 5497 for (i = 1; i < 256; i++) { 5498 int o1, o2, o3, i2, i3; 5499 if (decode[i] == 0xFFFE) 5500 /* unmapped character */ 5501 continue; 5502 o1 = decode[i]>>11; 5503 o2 = (decode[i]>>7) & 0xF; 5504 i2 = 16*mlevel1[o1] + o2; 5505 if (mlevel2[i2] == 0xFF) 5506 mlevel2[i2] = count3++; 5507 o3 = decode[i] & 0x7F; 5508 i3 = 128*mlevel2[i2] + o3; 5509 mlevel3[i3] = i; 5510 } 5511 return result; 5512} 5513 5514static int 5515encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5516{ 5517 struct encoding_map *map = (struct encoding_map*)mapping; 5518 int l1 = c>>11; 5519 int l2 = (c>>7) & 0xF; 5520 int l3 = c & 0x7F; 5521 int i; 5522 5523#ifdef Py_UNICODE_WIDE 5524 if (c > 0xFFFF) { 5525 return -1; 5526 } 5527#endif 5528 if (c == 0) 5529 return 0; 5530 /* level 1*/ 5531 i = map->level1[l1]; 5532 if (i == 0xFF) { 5533 return -1; 5534 } 5535 /* level 2*/ 5536 i = map->level23[16*i+l2]; 5537 if (i == 0xFF) { 5538 return -1; 5539 } 5540 /* level 3 */ 5541 i = map->level23[16*map->count2 + 128*i + l3]; 5542 if (i == 0) { 5543 return -1; 5544 } 5545 return i; 5546} 5547 5548/* Lookup the character ch in the mapping. If the character 5549 can't be found, Py_None is returned (or NULL, if another 5550 error occurred). 
*/ 5551static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5552{ 5553 PyObject *w = PyLong_FromLong((long)c); 5554 PyObject *x; 5555 5556 if (w == NULL) 5557 return NULL; 5558 x = PyObject_GetItem(mapping, w); 5559 Py_DECREF(w); 5560 if (x == NULL) { 5561 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5562 /* No mapping found means: mapping is undefined. */ 5563 PyErr_Clear(); 5564 x = Py_None; 5565 Py_INCREF(x); 5566 return x; 5567 } else 5568 return NULL; 5569 } 5570 else if (x == Py_None) 5571 return x; 5572 else if (PyLong_Check(x)) { 5573 long value = PyLong_AS_LONG(x); 5574 if (value < 0 || value > 255) { 5575 PyErr_SetString(PyExc_TypeError, 5576 "character mapping must be in range(256)"); 5577 Py_DECREF(x); 5578 return NULL; 5579 } 5580 return x; 5581 } 5582 else if (PyBytes_Check(x)) 5583 return x; 5584 else { 5585 /* wrong return value */ 5586 PyErr_Format(PyExc_TypeError, 5587 "character mapping must return integer, bytes or None, not %.400s", 5588 x->ob_type->tp_name); 5589 Py_DECREF(x); 5590 return NULL; 5591 } 5592} 5593 5594static int 5595charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5596{ 5597 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5598 /* exponentially overallocate to minimize reallocations */ 5599 if (requiredsize < 2*outsize) 5600 requiredsize = 2*outsize; 5601 if (_PyBytes_Resize(outobj, requiredsize)) 5602 return -1; 5603 return 0; 5604} 5605 5606typedef enum charmapencode_result { 5607 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5608}charmapencode_result; 5609/* lookup the character, put the result in the output string and adjust 5610 various state variables. Resize the output bytes object if not enough 5611 space is available. Return a new reference to the object that 5612 was put in the output buffer, or Py_None, if the mapping was undefined 5613 (in which case no character was written) or NULL, if a 5614 reallocation error occurred. 
The caller must decref the result */ 5615static 5616charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 5617 PyObject **outobj, Py_ssize_t *outpos) 5618{ 5619 PyObject *rep; 5620 char *outstart; 5621 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5622 5623 if (Py_TYPE(mapping) == &EncodingMapType) { 5624 int res = encoding_map_lookup(c, mapping); 5625 Py_ssize_t requiredsize = *outpos+1; 5626 if (res == -1) 5627 return enc_FAILED; 5628 if (outsize<requiredsize) 5629 if (charmapencode_resize(outobj, outpos, requiredsize)) 5630 return enc_EXCEPTION; 5631 outstart = PyBytes_AS_STRING(*outobj); 5632 outstart[(*outpos)++] = (char)res; 5633 return enc_SUCCESS; 5634 } 5635 5636 rep = charmapencode_lookup(c, mapping); 5637 if (rep==NULL) 5638 return enc_EXCEPTION; 5639 else if (rep==Py_None) { 5640 Py_DECREF(rep); 5641 return enc_FAILED; 5642 } else { 5643 if (PyLong_Check(rep)) { 5644 Py_ssize_t requiredsize = *outpos+1; 5645 if (outsize<requiredsize) 5646 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5647 Py_DECREF(rep); 5648 return enc_EXCEPTION; 5649 } 5650 outstart = PyBytes_AS_STRING(*outobj); 5651 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5652 } 5653 else { 5654 const char *repchars = PyBytes_AS_STRING(rep); 5655 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5656 Py_ssize_t requiredsize = *outpos+repsize; 5657 if (outsize<requiredsize) 5658 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5659 Py_DECREF(rep); 5660 return enc_EXCEPTION; 5661 } 5662 outstart = PyBytes_AS_STRING(*outobj); 5663 memcpy(outstart + *outpos, repchars, repsize); 5664 *outpos += repsize; 5665 } 5666 } 5667 Py_DECREF(rep); 5668 return enc_SUCCESS; 5669} 5670 5671/* handle an error in PyUnicode_EncodeCharmap 5672 Return 0 on success, -1 on error */ 5673static 5674int charmap_encoding_error( 5675 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5676 PyObject **exceptionObject, 5677 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 5678 PyObject **res, Py_ssize_t *respos) 5679{ 5680 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5681 Py_ssize_t repsize; 5682 Py_ssize_t newpos; 5683 Py_UNICODE *uni2; 5684 /* startpos for collecting unencodable chars */ 5685 Py_ssize_t collstartpos = *inpos; 5686 Py_ssize_t collendpos = *inpos+1; 5687 Py_ssize_t collpos; 5688 char *encoding = "charmap"; 5689 char *reason = "character maps to <undefined>"; 5690 charmapencode_result x; 5691 5692 /* find all unencodable characters */ 5693 while (collendpos < size) { 5694 PyObject *rep; 5695 if (Py_TYPE(mapping) == &EncodingMapType) { 5696 int res = encoding_map_lookup(p[collendpos], mapping); 5697 if (res != -1) 5698 break; 5699 ++collendpos; 5700 continue; 5701 } 5702 5703 rep = charmapencode_lookup(p[collendpos], mapping); 5704 if (rep==NULL) 5705 return -1; 5706 else if (rep!=Py_None) { 5707 Py_DECREF(rep); 5708 break; 5709 } 5710 Py_DECREF(rep); 5711 ++collendpos; 5712 } 5713 /* cache callback name lookup 5714 * (if not done yet, i.e. 
it's the first error) */ 5715 if (*known_errorHandler==-1) { 5716 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5717 *known_errorHandler = 1; 5718 else if (!strcmp(errors, "replace")) 5719 *known_errorHandler = 2; 5720 else if (!strcmp(errors, "ignore")) 5721 *known_errorHandler = 3; 5722 else if (!strcmp(errors, "xmlcharrefreplace")) 5723 *known_errorHandler = 4; 5724 else 5725 *known_errorHandler = 0; 5726 } 5727 switch (*known_errorHandler) { 5728 case 1: /* strict */ 5729 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5730 return -1; 5731 case 2: /* replace */ 5732 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5733 x = charmapencode_output('?', mapping, res, respos); 5734 if (x==enc_EXCEPTION) { 5735 return -1; 5736 } 5737 else if (x==enc_FAILED) { 5738 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5739 return -1; 5740 } 5741 } 5742 /* fall through */ 5743 case 3: /* ignore */ 5744 *inpos = collendpos; 5745 break; 5746 case 4: /* xmlcharrefreplace */ 5747 /* generate replacement (temporarily (mis)uses p) */ 5748 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5749 char buffer[2+29+1+1]; 5750 char *cp; 5751 sprintf(buffer, "&#%d;", (int)p[collpos]); 5752 for (cp = buffer; *cp; ++cp) { 5753 x = charmapencode_output(*cp, mapping, res, respos); 5754 if (x==enc_EXCEPTION) 5755 return -1; 5756 else if (x==enc_FAILED) { 5757 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5758 return -1; 5759 } 5760 } 5761 } 5762 *inpos = collendpos; 5763 break; 5764 default: 5765 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5766 encoding, reason, p, size, exceptionObject, 5767 collstartpos, collendpos, &newpos); 5768 if (repunicode == NULL) 5769 return -1; 5770 if (PyBytes_Check(repunicode)) { 5771 /* Directly copy bytes result to output. 
*/ 5772 Py_ssize_t outsize = PyBytes_Size(*res); 5773 Py_ssize_t requiredsize; 5774 repsize = PyBytes_Size(repunicode); 5775 requiredsize = *respos + repsize; 5776 if (requiredsize > outsize) 5777 /* Make room for all additional bytes. */ 5778 if (charmapencode_resize(res, respos, requiredsize)) { 5779 Py_DECREF(repunicode); 5780 return -1; 5781 } 5782 memcpy(PyBytes_AsString(*res) + *respos, 5783 PyBytes_AsString(repunicode), repsize); 5784 *respos += repsize; 5785 *inpos = newpos; 5786 Py_DECREF(repunicode); 5787 break; 5788 } 5789 /* generate replacement */ 5790 repsize = PyUnicode_GET_SIZE(repunicode); 5791 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5792 x = charmapencode_output(*uni2, mapping, res, respos); 5793 if (x==enc_EXCEPTION) { 5794 return -1; 5795 } 5796 else if (x==enc_FAILED) { 5797 Py_DECREF(repunicode); 5798 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5799 return -1; 5800 } 5801 } 5802 *inpos = newpos; 5803 Py_DECREF(repunicode); 5804 } 5805 return 0; 5806} 5807 5808PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5809 Py_ssize_t size, 5810 PyObject *mapping, 5811 const char *errors) 5812{ 5813 /* output object */ 5814 PyObject *res = NULL; 5815 /* current input position */ 5816 Py_ssize_t inpos = 0; 5817 /* current output position */ 5818 Py_ssize_t respos = 0; 5819 PyObject *errorHandler = NULL; 5820 PyObject *exc = NULL; 5821 /* the following variable is used for caching string comparisons 5822 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5823 * 3=ignore, 4=xmlcharrefreplace */ 5824 int known_errorHandler = -1; 5825 5826 /* Default to Latin-1 */ 5827 if (mapping == NULL) 5828 return PyUnicode_EncodeLatin1(p, size, errors); 5829 5830 /* allocate enough for a simple encoding without 5831 replacements, if we need more, we'll resize */ 5832 res = PyBytes_FromStringAndSize(NULL, size); 5833 if (res == NULL) 5834 goto onError; 5835 if (size == 0) 5836 return 
res; 5837 5838 while (inpos<size) { 5839 /* try to encode it */ 5840 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5841 if (x==enc_EXCEPTION) /* error */ 5842 goto onError; 5843 if (x==enc_FAILED) { /* unencodable character */ 5844 if (charmap_encoding_error(p, size, &inpos, mapping, 5845 &exc, 5846 &known_errorHandler, &errorHandler, errors, 5847 &res, &respos)) { 5848 goto onError; 5849 } 5850 } 5851 else 5852 /* done with this character => adjust input position */ 5853 ++inpos; 5854 } 5855 5856 /* Resize if we allocated to much */ 5857 if (respos<PyBytes_GET_SIZE(res)) 5858 if (_PyBytes_Resize(&res, respos) < 0) 5859 goto onError; 5860 5861 Py_XDECREF(exc); 5862 Py_XDECREF(errorHandler); 5863 return res; 5864 5865 onError: 5866 Py_XDECREF(res); 5867 Py_XDECREF(exc); 5868 Py_XDECREF(errorHandler); 5869 return NULL; 5870} 5871 5872PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 5873 PyObject *mapping) 5874{ 5875 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5876 PyErr_BadArgument(); 5877 return NULL; 5878 } 5879 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5880 PyUnicode_GET_SIZE(unicode), 5881 mapping, 5882 NULL); 5883} 5884 5885/* create or adjust a UnicodeTranslateError */ 5886static void make_translate_exception(PyObject **exceptionObject, 5887 const Py_UNICODE *unicode, Py_ssize_t size, 5888 Py_ssize_t startpos, Py_ssize_t endpos, 5889 const char *reason) 5890{ 5891 if (*exceptionObject == NULL) { 5892 *exceptionObject = PyUnicodeTranslateError_Create( 5893 unicode, size, startpos, endpos, reason); 5894 } 5895 else { 5896 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5897 goto onError; 5898 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5899 goto onError; 5900 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5901 goto onError; 5902 return; 5903 onError: 5904 Py_DECREF(*exceptionObject); 5905 *exceptionObject = NULL; 5906 } 5907} 5908 5909/* 
raises a UnicodeTranslateError */ 5910static void raise_translate_exception(PyObject **exceptionObject, 5911 const Py_UNICODE *unicode, Py_ssize_t size, 5912 Py_ssize_t startpos, Py_ssize_t endpos, 5913 const char *reason) 5914{ 5915 make_translate_exception(exceptionObject, 5916 unicode, size, startpos, endpos, reason); 5917 if (*exceptionObject != NULL) 5918 PyCodec_StrictErrors(*exceptionObject); 5919} 5920 5921/* error handling callback helper: 5922 build arguments, call the callback and check the arguments, 5923 put the result into newpos and return the replacement string, which 5924 has to be freed by the caller */ 5925static PyObject *unicode_translate_call_errorhandler(const char *errors, 5926 PyObject **errorHandler, 5927 const char *reason, 5928 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5929 Py_ssize_t startpos, Py_ssize_t endpos, 5930 Py_ssize_t *newpos) 5931{ 5932 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5933 5934 Py_ssize_t i_newpos; 5935 PyObject *restuple; 5936 PyObject *resunicode; 5937 5938 if (*errorHandler == NULL) { 5939 *errorHandler = PyCodec_LookupError(errors); 5940 if (*errorHandler == NULL) 5941 return NULL; 5942 } 5943 5944 make_translate_exception(exceptionObject, 5945 unicode, size, startpos, endpos, reason); 5946 if (*exceptionObject == NULL) 5947 return NULL; 5948 5949 restuple = PyObject_CallFunctionObjArgs( 5950 *errorHandler, *exceptionObject, NULL); 5951 if (restuple == NULL) 5952 return NULL; 5953 if (!PyTuple_Check(restuple)) { 5954 PyErr_SetString(PyExc_TypeError, &argparse[4]); 5955 Py_DECREF(restuple); 5956 return NULL; 5957 } 5958 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 5959 &resunicode, &i_newpos)) { 5960 Py_DECREF(restuple); 5961 return NULL; 5962 } 5963 if (i_newpos<0) 5964 *newpos = size+i_newpos; 5965 else 5966 *newpos = i_newpos; 5967 if (*newpos<0 || *newpos>size) { 5968 PyErr_Format(PyExc_IndexError, "position %zd from error 
handler out of bounds", *newpos); 5969 Py_DECREF(restuple); 5970 return NULL; 5971 } 5972 Py_INCREF(resunicode); 5973 Py_DECREF(restuple); 5974 return resunicode; 5975} 5976 5977/* Lookup the character ch in the mapping and put the result in result, 5978 which must be decrefed by the caller. 5979 Return 0 on success, -1 on error */ 5980static 5981int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5982{ 5983 PyObject *w = PyLong_FromLong((long)c); 5984 PyObject *x; 5985 5986 if (w == NULL) 5987 return -1; 5988 x = PyObject_GetItem(mapping, w); 5989 Py_DECREF(w); 5990 if (x == NULL) { 5991 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5992 /* No mapping found means: use 1:1 mapping. */ 5993 PyErr_Clear(); 5994 *result = NULL; 5995 return 0; 5996 } else 5997 return -1; 5998 } 5999 else if (x == Py_None) { 6000 *result = x; 6001 return 0; 6002 } 6003 else if (PyLong_Check(x)) { 6004 long value = PyLong_AS_LONG(x); 6005 long max = PyUnicode_GetMax(); 6006 if (value < 0 || value > max) { 6007 PyErr_Format(PyExc_TypeError, 6008 "character mapping must be in range(0x%x)", max+1); 6009 Py_DECREF(x); 6010 return -1; 6011 } 6012 *result = x; 6013 return 0; 6014 } 6015 else if (PyUnicode_Check(x)) { 6016 *result = x; 6017 return 0; 6018 } 6019 else { 6020 /* wrong return value */ 6021 PyErr_SetString(PyExc_TypeError, 6022 "character mapping must return integer, None or str"); 6023 Py_DECREF(x); 6024 return -1; 6025 } 6026} 6027/* ensure that *outobj is at least requiredsize characters long, 6028 if not reallocate and adjust various state variables. 
6029 Return 0 on success, -1 on error */ 6030static 6031int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 6032 Py_ssize_t requiredsize) 6033{ 6034 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 6035 if (requiredsize > oldsize) { 6036 /* remember old output position */ 6037 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 6038 /* exponentially overallocate to minimize reallocations */ 6039 if (requiredsize < 2 * oldsize) 6040 requiredsize = 2 * oldsize; 6041 if (PyUnicode_Resize(outobj, requiredsize) < 0) 6042 return -1; 6043 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 6044 } 6045 return 0; 6046} 6047/* lookup the character, put the result in the output string and adjust 6048 various state variables. Return a new reference to the object that 6049 was put in the output buffer in *result, or Py_None, if the mapping was 6050 undefined (in which case no character was written). 6051 The called must decref result. 6052 Return 0 on success, -1 on error. */ 6053static 6054int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 6055 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 6056 PyObject **res) 6057{ 6058 if (charmaptranslate_lookup(*curinp, mapping, res)) 6059 return -1; 6060 if (*res==NULL) { 6061 /* not found => default to 1:1 mapping */ 6062 *(*outp)++ = *curinp; 6063 } 6064 else if (*res==Py_None) 6065 ; 6066 else if (PyLong_Check(*res)) { 6067 /* no overflow check, because we know that the space is enough */ 6068 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 6069 } 6070 else if (PyUnicode_Check(*res)) { 6071 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 6072 if (repsize==1) { 6073 /* no overflow check, because we know that the space is enough */ 6074 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 6075 } 6076 else if (repsize!=0) { 6077 /* more than one character */ 6078 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 6079 (insize - (curinp-startinp)) + 6080 
repsize - 1; 6081 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 6082 return -1; 6083 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 6084 *outp += repsize; 6085 } 6086 } 6087 else 6088 return -1; 6089 return 0; 6090} 6091 6092PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 6093 Py_ssize_t size, 6094 PyObject *mapping, 6095 const char *errors) 6096{ 6097 /* output object */ 6098 PyObject *res = NULL; 6099 /* pointers to the beginning and end+1 of input */ 6100 const Py_UNICODE *startp = p; 6101 const Py_UNICODE *endp = p + size; 6102 /* pointer into the output */ 6103 Py_UNICODE *str; 6104 /* current output position */ 6105 Py_ssize_t respos = 0; 6106 char *reason = "character maps to <undefined>"; 6107 PyObject *errorHandler = NULL; 6108 PyObject *exc = NULL; 6109 /* the following variable is used for caching string comparisons 6110 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 6111 * 3=ignore, 4=xmlcharrefreplace */ 6112 int known_errorHandler = -1; 6113 6114 if (mapping == NULL) { 6115 PyErr_BadArgument(); 6116 return NULL; 6117 } 6118 6119 /* allocate enough for a simple 1:1 translation without 6120 replacements, if we need more, we'll resize */ 6121 res = PyUnicode_FromUnicode(NULL, size); 6122 if (res == NULL) 6123 goto onError; 6124 if (size == 0) 6125 return res; 6126 str = PyUnicode_AS_UNICODE(res); 6127 6128 while (p<endp) { 6129 /* try to encode it */ 6130 PyObject *x = NULL; 6131 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 6132 Py_XDECREF(x); 6133 goto onError; 6134 } 6135 Py_XDECREF(x); 6136 if (x!=Py_None) /* it worked => adjust input pointer */ 6137 ++p; 6138 else { /* untranslatable character */ 6139 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 6140 Py_ssize_t repsize; 6141 Py_ssize_t newpos; 6142 Py_UNICODE *uni2; 6143 /* startpos for collecting untranslatable chars */ 6144 const Py_UNICODE *collstart = p; 6145 const Py_UNICODE *collend = p+1; 
6146 const Py_UNICODE *coll; 6147 6148 /* find all untranslatable characters */ 6149 while (collend < endp) { 6150 if (charmaptranslate_lookup(*collend, mapping, &x)) 6151 goto onError; 6152 Py_XDECREF(x); 6153 if (x!=Py_None) 6154 break; 6155 ++collend; 6156 } 6157 /* cache callback name lookup 6158 * (if not done yet, i.e. it's the first error) */ 6159 if (known_errorHandler==-1) { 6160 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6161 known_errorHandler = 1; 6162 else if (!strcmp(errors, "replace")) 6163 known_errorHandler = 2; 6164 else if (!strcmp(errors, "ignore")) 6165 known_errorHandler = 3; 6166 else if (!strcmp(errors, "xmlcharrefreplace")) 6167 known_errorHandler = 4; 6168 else 6169 known_errorHandler = 0; 6170 } 6171 switch (known_errorHandler) { 6172 case 1: /* strict */ 6173 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 6174 goto onError; 6175 case 2: /* replace */ 6176 /* No need to check for space, this is a 1:1 replacement */ 6177 for (coll = collstart; coll<collend; ++coll) 6178 *str++ = '?'; 6179 /* fall through */ 6180 case 3: /* ignore */ 6181 p = collend; 6182 break; 6183 case 4: /* xmlcharrefreplace */ 6184 /* generate replacement (temporarily (mis)uses p) */ 6185 for (p = collstart; p < collend; ++p) { 6186 char buffer[2+29+1+1]; 6187 char *cp; 6188 sprintf(buffer, "&#%d;", (int)*p); 6189 if (charmaptranslate_makespace(&res, &str, 6190 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 6191 goto onError; 6192 for (cp = buffer; *cp; ++cp) 6193 *str++ = *cp; 6194 } 6195 p = collend; 6196 break; 6197 default: 6198 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 6199 reason, startp, size, &exc, 6200 collstart-startp, collend-startp, &newpos); 6201 if (repunicode == NULL) 6202 goto onError; 6203 /* generate replacement */ 6204 repsize = PyUnicode_GET_SIZE(repunicode); 6205 if (charmaptranslate_makespace(&res, &str, 6206 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 6207 Py_DECREF(repunicode); 6208 goto onError; 6209 } 6210 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 6211 *str++ = *uni2; 6212 p = startp + newpos; 6213 Py_DECREF(repunicode); 6214 } 6215 } 6216 } 6217 /* Resize if we allocated to much */ 6218 respos = str-PyUnicode_AS_UNICODE(res); 6219 if (respos<PyUnicode_GET_SIZE(res)) { 6220 if (PyUnicode_Resize(&res, respos) < 0) 6221 goto onError; 6222 } 6223 Py_XDECREF(exc); 6224 Py_XDECREF(errorHandler); 6225 return res; 6226 6227 onError: 6228 Py_XDECREF(res); 6229 Py_XDECREF(exc); 6230 Py_XDECREF(errorHandler); 6231 return NULL; 6232} 6233 6234PyObject *PyUnicode_Translate(PyObject *str, 6235 PyObject *mapping, 6236 const char *errors) 6237{ 6238 PyObject *result; 6239 6240 str = PyUnicode_FromObject(str); 6241 if (str == NULL) 6242 goto onError; 6243 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 6244 PyUnicode_GET_SIZE(str), 6245 mapping, 6246 errors); 6247 Py_DECREF(str); 6248 return result; 6249 6250 onError: 6251 Py_XDECREF(str); 6252 return NULL; 6253} 6254 6255PyObject * 6256PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 6257 Py_ssize_t length) 6258{ 6259 PyObject *result; 6260 Py_UNICODE *p; /* write pointer into result */ 6261 Py_ssize_t i; 6262 /* Copy to a new string */ 6263 result = (PyObject *)_PyUnicode_New(length); 6264 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 6265 if (result == NULL) 6266 return result; 6267 p = PyUnicode_AS_UNICODE(result); 6268 /* Iterate over code points */ 6269 for (i = 0; i < length; i++) { 6270 Py_UNICODE ch =s[i]; 6271 if (ch > 127) { 6272 int decimal = Py_UNICODE_TODECIMAL(ch); 6273 if (decimal >= 0) 6274 p[i] = '0' + decimal; 6275 } 6276 } 6277 return result; 6278} 6279/* --- Decimal Encoder ---------------------------------------------------- */ 6280 6281int PyUnicode_EncodeDecimal(Py_UNICODE *s, 6282 Py_ssize_t length, 6283 char *output, 6284 const char 
*errors) 6285{ 6286 Py_UNICODE *p, *end; 6287 PyObject *errorHandler = NULL; 6288 PyObject *exc = NULL; 6289 const char *encoding = "decimal"; 6290 const char *reason = "invalid decimal Unicode string"; 6291 /* the following variable is used for caching string comparisons 6292 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6293 int known_errorHandler = -1; 6294 6295 if (output == NULL) { 6296 PyErr_BadArgument(); 6297 return -1; 6298 } 6299 6300 p = s; 6301 end = s + length; 6302 while (p < end) { 6303 register Py_UNICODE ch = *p; 6304 int decimal; 6305 PyObject *repunicode; 6306 Py_ssize_t repsize; 6307 Py_ssize_t newpos; 6308 Py_UNICODE *uni2; 6309 Py_UNICODE *collstart; 6310 Py_UNICODE *collend; 6311 6312 if (Py_UNICODE_ISSPACE(ch)) { 6313 *output++ = ' '; 6314 ++p; 6315 continue; 6316 } 6317 decimal = Py_UNICODE_TODECIMAL(ch); 6318 if (decimal >= 0) { 6319 *output++ = '0' + decimal; 6320 ++p; 6321 continue; 6322 } 6323 if (0 < ch && ch < 256) { 6324 *output++ = (char)ch; 6325 ++p; 6326 continue; 6327 } 6328 /* All other characters are considered unencodable */ 6329 collstart = p; 6330 collend = p+1; 6331 while (collend < end) { 6332 if ((0 < *collend && *collend < 256) || 6333 !Py_UNICODE_ISSPACE(*collend) || 6334 Py_UNICODE_TODECIMAL(*collend)) 6335 break; 6336 } 6337 /* cache callback name lookup 6338 * (if not done yet, i.e. 
it's the first error) */ 6339 if (known_errorHandler==-1) { 6340 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6341 known_errorHandler = 1; 6342 else if (!strcmp(errors, "replace")) 6343 known_errorHandler = 2; 6344 else if (!strcmp(errors, "ignore")) 6345 known_errorHandler = 3; 6346 else if (!strcmp(errors, "xmlcharrefreplace")) 6347 known_errorHandler = 4; 6348 else 6349 known_errorHandler = 0; 6350 } 6351 switch (known_errorHandler) { 6352 case 1: /* strict */ 6353 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 6354 goto onError; 6355 case 2: /* replace */ 6356 for (p = collstart; p < collend; ++p) 6357 *output++ = '?'; 6358 /* fall through */ 6359 case 3: /* ignore */ 6360 p = collend; 6361 break; 6362 case 4: /* xmlcharrefreplace */ 6363 /* generate replacement (temporarily (mis)uses p) */ 6364 for (p = collstart; p < collend; ++p) 6365 output += sprintf(output, "&#%d;", (int)*p); 6366 p = collend; 6367 break; 6368 default: 6369 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6370 encoding, reason, s, length, &exc, 6371 collstart-s, collend-s, &newpos); 6372 if (repunicode == NULL) 6373 goto onError; 6374 if (!PyUnicode_Check(repunicode)) { 6375 /* Byte results not supported, since they have no decimal property. 
*/ 6376 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 6377 Py_DECREF(repunicode); 6378 goto onError; 6379 } 6380 /* generate replacement */ 6381 repsize = PyUnicode_GET_SIZE(repunicode); 6382 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 6383 Py_UNICODE ch = *uni2; 6384 if (Py_UNICODE_ISSPACE(ch)) 6385 *output++ = ' '; 6386 else { 6387 decimal = Py_UNICODE_TODECIMAL(ch); 6388 if (decimal >= 0) 6389 *output++ = '0' + decimal; 6390 else if (0 < ch && ch < 256) 6391 *output++ = (char)ch; 6392 else { 6393 Py_DECREF(repunicode); 6394 raise_encode_exception(&exc, encoding, 6395 s, length, collstart-s, collend-s, reason); 6396 goto onError; 6397 } 6398 } 6399 } 6400 p = s + newpos; 6401 Py_DECREF(repunicode); 6402 } 6403 } 6404 /* 0-terminate the output string */ 6405 *output++ = '\0'; 6406 Py_XDECREF(exc); 6407 Py_XDECREF(errorHandler); 6408 return 0; 6409 6410 onError: 6411 Py_XDECREF(exc); 6412 Py_XDECREF(errorHandler); 6413 return -1; 6414} 6415 6416/* --- Helpers ------------------------------------------------------------ */ 6417 6418#include "stringlib/unicodedefs.h" 6419#include "stringlib/fastsearch.h" 6420 6421#include "stringlib/count.h" 6422#include "stringlib/find.h" 6423#include "stringlib/partition.h" 6424#include "stringlib/split.h" 6425 6426#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 6427#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale 6428#include "stringlib/localeutil.h" 6429 6430/* helper macro to fixup start/end slice values */ 6431#define ADJUST_INDICES(start, end, len) \ 6432 if (end > len) \ 6433 end = len; \ 6434 else if (end < 0) { \ 6435 end += len; \ 6436 if (end < 0) \ 6437 end = 0; \ 6438 } \ 6439 if (start < 0) { \ 6440 start += len; \ 6441 if (start < 0) \ 6442 start = 0; \ 6443 } 6444 6445Py_ssize_t PyUnicode_Count(PyObject *str, 6446 PyObject *substr, 6447 Py_ssize_t start, 6448 Py_ssize_t end) 6449{ 6450 Py_ssize_t result; 6451 
PyUnicodeObject* str_obj; 6452 PyUnicodeObject* sub_obj; 6453 6454 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6455 if (!str_obj) 6456 return -1; 6457 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6458 if (!sub_obj) { 6459 Py_DECREF(str_obj); 6460 return -1; 6461 } 6462 6463 ADJUST_INDICES(start, end, str_obj->length); 6464 result = stringlib_count( 6465 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6466 PY_SSIZE_T_MAX 6467 ); 6468 6469 Py_DECREF(sub_obj); 6470 Py_DECREF(str_obj); 6471 6472 return result; 6473} 6474 6475Py_ssize_t PyUnicode_Find(PyObject *str, 6476 PyObject *sub, 6477 Py_ssize_t start, 6478 Py_ssize_t end, 6479 int direction) 6480{ 6481 Py_ssize_t result; 6482 6483 str = PyUnicode_FromObject(str); 6484 if (!str) 6485 return -2; 6486 sub = PyUnicode_FromObject(sub); 6487 if (!sub) { 6488 Py_DECREF(str); 6489 return -2; 6490 } 6491 6492 if (direction > 0) 6493 result = stringlib_find_slice( 6494 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6495 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6496 start, end 6497 ); 6498 else 6499 result = stringlib_rfind_slice( 6500 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6501 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6502 start, end 6503 ); 6504 6505 Py_DECREF(str); 6506 Py_DECREF(sub); 6507 6508 return result; 6509} 6510 6511static 6512int tailmatch(PyUnicodeObject *self, 6513 PyUnicodeObject *substring, 6514 Py_ssize_t start, 6515 Py_ssize_t end, 6516 int direction) 6517{ 6518 if (substring->length == 0) 6519 return 1; 6520 6521 ADJUST_INDICES(start, end, self->length); 6522 end -= substring->length; 6523 if (end < start) 6524 return 0; 6525 6526 if (direction > 0) { 6527 if (Py_UNICODE_MATCH(self, end, substring)) 6528 return 1; 6529 } else { 6530 if (Py_UNICODE_MATCH(self, start, substring)) 6531 return 1; 6532 } 6533 6534 return 0; 6535} 6536 6537Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 6538 PyObject *substr, 6539 Py_ssize_t 
start, 6540 Py_ssize_t end, 6541 int direction) 6542{ 6543 Py_ssize_t result; 6544 6545 str = PyUnicode_FromObject(str); 6546 if (str == NULL) 6547 return -1; 6548 substr = PyUnicode_FromObject(substr); 6549 if (substr == NULL) { 6550 Py_DECREF(str); 6551 return -1; 6552 } 6553 6554 result = tailmatch((PyUnicodeObject *)str, 6555 (PyUnicodeObject *)substr, 6556 start, end, direction); 6557 Py_DECREF(str); 6558 Py_DECREF(substr); 6559 return result; 6560} 6561 6562/* Apply fixfct filter to the Unicode object self and return a 6563 reference to the modified object */ 6564 6565static 6566PyObject *fixup(PyUnicodeObject *self, 6567 int (*fixfct)(PyUnicodeObject *s)) 6568{ 6569 6570 PyUnicodeObject *u; 6571 6572 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6573 if (u == NULL) 6574 return NULL; 6575 6576 Py_UNICODE_COPY(u->str, self->str, self->length); 6577 6578 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6579 /* fixfct should return TRUE if it modified the buffer. If 6580 FALSE, return a reference to the original buffer instead 6581 (to save space, not time) */ 6582 Py_INCREF(self); 6583 Py_DECREF(u); 6584 return (PyObject*) self; 6585 } 6586 return (PyObject*) u; 6587} 6588 6589static 6590int fixupper(PyUnicodeObject *self) 6591{ 6592 Py_ssize_t len = self->length; 6593 Py_UNICODE *s = self->str; 6594 int status = 0; 6595 6596 while (len-- > 0) { 6597 register Py_UNICODE ch; 6598 6599 ch = Py_UNICODE_TOUPPER(*s); 6600 if (ch != *s) { 6601 status = 1; 6602 *s = ch; 6603 } 6604 s++; 6605 } 6606 6607 return status; 6608} 6609 6610static 6611int fixlower(PyUnicodeObject *self) 6612{ 6613 Py_ssize_t len = self->length; 6614 Py_UNICODE *s = self->str; 6615 int status = 0; 6616 6617 while (len-- > 0) { 6618 register Py_UNICODE ch; 6619 6620 ch = Py_UNICODE_TOLOWER(*s); 6621 if (ch != *s) { 6622 status = 1; 6623 *s = ch; 6624 } 6625 s++; 6626 } 6627 6628 return status; 6629} 6630 6631static 6632int fixswapcase(PyUnicodeObject *self) 6633{ 6634 
Py_ssize_t len = self->length; 6635 Py_UNICODE *s = self->str; 6636 int status = 0; 6637 6638 while (len-- > 0) { 6639 if (Py_UNICODE_ISUPPER(*s)) { 6640 *s = Py_UNICODE_TOLOWER(*s); 6641 status = 1; 6642 } else if (Py_UNICODE_ISLOWER(*s)) { 6643 *s = Py_UNICODE_TOUPPER(*s); 6644 status = 1; 6645 } 6646 s++; 6647 } 6648 6649 return status; 6650} 6651 6652static 6653int fixcapitalize(PyUnicodeObject *self) 6654{ 6655 Py_ssize_t len = self->length; 6656 Py_UNICODE *s = self->str; 6657 int status = 0; 6658 6659 if (len == 0) 6660 return 0; 6661 if (Py_UNICODE_ISLOWER(*s)) { 6662 *s = Py_UNICODE_TOUPPER(*s); 6663 status = 1; 6664 } 6665 s++; 6666 while (--len > 0) { 6667 if (Py_UNICODE_ISUPPER(*s)) { 6668 *s = Py_UNICODE_TOLOWER(*s); 6669 status = 1; 6670 } 6671 s++; 6672 } 6673 return status; 6674} 6675 6676static 6677int fixtitle(PyUnicodeObject *self) 6678{ 6679 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6680 register Py_UNICODE *e; 6681 int previous_is_cased; 6682 6683 /* Shortcut for single character strings */ 6684 if (PyUnicode_GET_SIZE(self) == 1) { 6685 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6686 if (*p != ch) { 6687 *p = ch; 6688 return 1; 6689 } 6690 else 6691 return 0; 6692 } 6693 6694 e = p + PyUnicode_GET_SIZE(self); 6695 previous_is_cased = 0; 6696 for (; p < e; p++) { 6697 register const Py_UNICODE ch = *p; 6698 6699 if (previous_is_cased) 6700 *p = Py_UNICODE_TOLOWER(ch); 6701 else 6702 *p = Py_UNICODE_TOTITLE(ch); 6703 6704 if (Py_UNICODE_ISLOWER(ch) || 6705 Py_UNICODE_ISUPPER(ch) || 6706 Py_UNICODE_ISTITLE(ch)) 6707 previous_is_cased = 1; 6708 else 6709 previous_is_cased = 0; 6710 } 6711 return 1; 6712} 6713 6714PyObject * 6715PyUnicode_Join(PyObject *separator, PyObject *seq) 6716{ 6717 const Py_UNICODE blank = ' '; 6718 const Py_UNICODE *sep = ␣ 6719 Py_ssize_t seplen = 1; 6720 PyUnicodeObject *res = NULL; /* the result */ 6721 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6722 PyObject *fseq; /* PySequence_Fast(seq) 
*/ 6723 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6724 PyObject **items; 6725 PyObject *item; 6726 Py_ssize_t sz, i; 6727 6728 fseq = PySequence_Fast(seq, ""); 6729 if (fseq == NULL) { 6730 return NULL; 6731 } 6732 6733 /* NOTE: the following code can't call back into Python code, 6734 * so we are sure that fseq won't be mutated. 6735 */ 6736 6737 seqlen = PySequence_Fast_GET_SIZE(fseq); 6738 /* If empty sequence, return u"". */ 6739 if (seqlen == 0) { 6740 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6741 goto Done; 6742 } 6743 items = PySequence_Fast_ITEMS(fseq); 6744 /* If singleton sequence with an exact Unicode, return that. */ 6745 if (seqlen == 1) { 6746 item = items[0]; 6747 if (PyUnicode_CheckExact(item)) { 6748 Py_INCREF(item); 6749 res = (PyUnicodeObject *)item; 6750 goto Done; 6751 } 6752 } 6753 else { 6754 /* Set up sep and seplen */ 6755 if (separator == NULL) { 6756 sep = ␣ 6757 seplen = 1; 6758 } 6759 else { 6760 if (!PyUnicode_Check(separator)) { 6761 PyErr_Format(PyExc_TypeError, 6762 "separator: expected str instance," 6763 " %.80s found", 6764 Py_TYPE(separator)->tp_name); 6765 goto onError; 6766 } 6767 sep = PyUnicode_AS_UNICODE(separator); 6768 seplen = PyUnicode_GET_SIZE(separator); 6769 } 6770 } 6771 6772 /* There are at least two things to join, or else we have a subclass 6773 * of str in the sequence. 6774 * Do a pre-pass to figure out the total amount of space we'll 6775 * need (sz), and see whether all argument are strings. 
6776 */ 6777 sz = 0; 6778 for (i = 0; i < seqlen; i++) { 6779 const Py_ssize_t old_sz = sz; 6780 item = items[i]; 6781 if (!PyUnicode_Check(item)) { 6782 PyErr_Format(PyExc_TypeError, 6783 "sequence item %zd: expected str instance," 6784 " %.80s found", 6785 i, Py_TYPE(item)->tp_name); 6786 goto onError; 6787 } 6788 sz += PyUnicode_GET_SIZE(item); 6789 if (i != 0) 6790 sz += seplen; 6791 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6792 PyErr_SetString(PyExc_OverflowError, 6793 "join() result is too long for a Python string"); 6794 goto onError; 6795 } 6796 } 6797 6798 res = _PyUnicode_New(sz); 6799 if (res == NULL) 6800 goto onError; 6801 6802 /* Catenate everything. */ 6803 res_p = PyUnicode_AS_UNICODE(res); 6804 for (i = 0; i < seqlen; ++i) { 6805 Py_ssize_t itemlen; 6806 item = items[i]; 6807 itemlen = PyUnicode_GET_SIZE(item); 6808 /* Copy item, and maybe the separator. */ 6809 if (i) { 6810 Py_UNICODE_COPY(res_p, sep, seplen); 6811 res_p += seplen; 6812 } 6813 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6814 res_p += itemlen; 6815 } 6816 6817 Done: 6818 Py_DECREF(fseq); 6819 return (PyObject *)res; 6820 6821 onError: 6822 Py_DECREF(fseq); 6823 Py_XDECREF(res); 6824 return NULL; 6825} 6826 6827static 6828PyUnicodeObject *pad(PyUnicodeObject *self, 6829 Py_ssize_t left, 6830 Py_ssize_t right, 6831 Py_UNICODE fill) 6832{ 6833 PyUnicodeObject *u; 6834 6835 if (left < 0) 6836 left = 0; 6837 if (right < 0) 6838 right = 0; 6839 6840 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6841 Py_INCREF(self); 6842 return self; 6843 } 6844 6845 if (left > PY_SSIZE_T_MAX - self->length || 6846 right > PY_SSIZE_T_MAX - (left + self->length)) { 6847 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6848 return NULL; 6849 } 6850 u = _PyUnicode_New(left + self->length + right); 6851 if (u) { 6852 if (left) 6853 Py_UNICODE_FILL(u->str, fill, left); 6854 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6855 if (right) 6856 
Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6857 } 6858 6859 return u; 6860} 6861 6862PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 6863{ 6864 PyObject *list; 6865 6866 string = PyUnicode_FromObject(string); 6867 if (string == NULL) 6868 return NULL; 6869 6870 list = stringlib_splitlines( 6871 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6872 PyUnicode_GET_SIZE(string), keepends); 6873 6874 Py_DECREF(string); 6875 return list; 6876} 6877 6878static 6879PyObject *split(PyUnicodeObject *self, 6880 PyUnicodeObject *substring, 6881 Py_ssize_t maxcount) 6882{ 6883 if (maxcount < 0) 6884 maxcount = PY_SSIZE_T_MAX; 6885 6886 if (substring == NULL) 6887 return stringlib_split_whitespace( 6888 (PyObject*) self, self->str, self->length, maxcount 6889 ); 6890 6891 return stringlib_split( 6892 (PyObject*) self, self->str, self->length, 6893 substring->str, substring->length, 6894 maxcount 6895 ); 6896} 6897 6898static 6899PyObject *rsplit(PyUnicodeObject *self, 6900 PyUnicodeObject *substring, 6901 Py_ssize_t maxcount) 6902{ 6903 if (maxcount < 0) 6904 maxcount = PY_SSIZE_T_MAX; 6905 6906 if (substring == NULL) 6907 return stringlib_rsplit_whitespace( 6908 (PyObject*) self, self->str, self->length, maxcount 6909 ); 6910 6911 return stringlib_rsplit( 6912 (PyObject*) self, self->str, self->length, 6913 substring->str, substring->length, 6914 maxcount 6915 ); 6916} 6917 6918static 6919PyObject *replace(PyUnicodeObject *self, 6920 PyUnicodeObject *str1, 6921 PyUnicodeObject *str2, 6922 Py_ssize_t maxcount) 6923{ 6924 PyUnicodeObject *u; 6925 6926 if (maxcount < 0) 6927 maxcount = PY_SSIZE_T_MAX; 6928 else if (maxcount == 0 || self->length == 0) 6929 goto nothing; 6930 6931 if (str1->length == str2->length) { 6932 Py_ssize_t i; 6933 /* same length */ 6934 if (str1->length == 0) 6935 goto nothing; 6936 if (str1->length == 1) { 6937 /* replace characters */ 6938 Py_UNICODE u1, u2; 6939 if (!findchar(self->str, self->length, str1->str[0])) 6940 
goto nothing; 6941 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6942 if (!u) 6943 return NULL; 6944 Py_UNICODE_COPY(u->str, self->str, self->length); 6945 u1 = str1->str[0]; 6946 u2 = str2->str[0]; 6947 for (i = 0; i < u->length; i++) 6948 if (u->str[i] == u1) { 6949 if (--maxcount < 0) 6950 break; 6951 u->str[i] = u2; 6952 } 6953 } else { 6954 i = stringlib_find( 6955 self->str, self->length, str1->str, str1->length, 0 6956 ); 6957 if (i < 0) 6958 goto nothing; 6959 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6960 if (!u) 6961 return NULL; 6962 Py_UNICODE_COPY(u->str, self->str, self->length); 6963 6964 /* change everything in-place, starting with this one */ 6965 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6966 i += str1->length; 6967 6968 while ( --maxcount > 0) { 6969 i = stringlib_find(self->str+i, self->length-i, 6970 str1->str, str1->length, 6971 i); 6972 if (i == -1) 6973 break; 6974 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6975 i += str1->length; 6976 } 6977 } 6978 } else { 6979 6980 Py_ssize_t n, i, j, e; 6981 Py_ssize_t product, new_size, delta; 6982 Py_UNICODE *p; 6983 6984 /* replace strings */ 6985 n = stringlib_count(self->str, self->length, str1->str, str1->length, 6986 maxcount); 6987 if (n == 0) 6988 goto nothing; 6989 /* new_size = self->length + n * (str2->length - str1->length)); */ 6990 delta = (str2->length - str1->length); 6991 if (delta == 0) { 6992 new_size = self->length; 6993 } else { 6994 product = n * (str2->length - str1->length); 6995 if ((product / (str2->length - str1->length)) != n) { 6996 PyErr_SetString(PyExc_OverflowError, 6997 "replace string is too long"); 6998 return NULL; 6999 } 7000 new_size = self->length + product; 7001 if (new_size < 0) { 7002 PyErr_SetString(PyExc_OverflowError, 7003 "replace string is too long"); 7004 return NULL; 7005 } 7006 } 7007 u = _PyUnicode_New(new_size); 7008 if (!u) 7009 return NULL; 7010 i = 0; 7011 p = u->str; 7012 e = 
self->length - str1->length; 7013 if (str1->length > 0) { 7014 while (n-- > 0) { 7015 /* look for next match */ 7016 j = stringlib_find(self->str+i, self->length-i, 7017 str1->str, str1->length, 7018 i); 7019 if (j == -1) 7020 break; 7021 else if (j > i) { 7022 /* copy unchanged part [i:j] */ 7023 Py_UNICODE_COPY(p, self->str+i, j-i); 7024 p += j - i; 7025 } 7026 /* copy substitution string */ 7027 if (str2->length > 0) { 7028 Py_UNICODE_COPY(p, str2->str, str2->length); 7029 p += str2->length; 7030 } 7031 i = j + str1->length; 7032 } 7033 if (i < self->length) 7034 /* copy tail [i:] */ 7035 Py_UNICODE_COPY(p, self->str+i, self->length-i); 7036 } else { 7037 /* interleave */ 7038 while (n > 0) { 7039 Py_UNICODE_COPY(p, str2->str, str2->length); 7040 p += str2->length; 7041 if (--n <= 0) 7042 break; 7043 *p++ = self->str[i++]; 7044 } 7045 Py_UNICODE_COPY(p, self->str+i, self->length-i); 7046 } 7047 } 7048 return (PyObject *) u; 7049 7050 nothing: 7051 /* nothing to replace; return original string (when possible) */ 7052 if (PyUnicode_CheckExact(self)) { 7053 Py_INCREF(self); 7054 return (PyObject *) self; 7055 } 7056 return PyUnicode_FromUnicode(self->str, self->length); 7057} 7058 7059/* --- Unicode Object Methods --------------------------------------------- */ 7060 7061PyDoc_STRVAR(title__doc__, 7062 "S.title() -> str\n\ 7063\n\ 7064Return a titlecased version of S, i.e. words start with title case\n\ 7065characters, all remaining cased characters have lower case."); 7066 7067static PyObject* 7068unicode_title(PyUnicodeObject *self) 7069{ 7070 return fixup(self, fixtitle); 7071} 7072 7073PyDoc_STRVAR(capitalize__doc__, 7074 "S.capitalize() -> str\n\ 7075\n\ 7076Return a capitalized version of S, i.e. 
/* Implementation of S.capitalize(): delegates to fixup() with the
   fixcapitalize filter and returns whatever fixup() returns. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
Coerces to a single unicode character */ 7124 7125static int 7126convert_uc(PyObject *obj, void *addr) 7127{ 7128 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 7129 PyObject *uniobj; 7130 Py_UNICODE *unistr; 7131 7132 uniobj = PyUnicode_FromObject(obj); 7133 if (uniobj == NULL) { 7134 PyErr_SetString(PyExc_TypeError, 7135 "The fill character cannot be converted to Unicode"); 7136 return 0; 7137 } 7138 if (PyUnicode_GET_SIZE(uniobj) != 1) { 7139 PyErr_SetString(PyExc_TypeError, 7140 "The fill character must be exactly one character long"); 7141 Py_DECREF(uniobj); 7142 return 0; 7143 } 7144 unistr = PyUnicode_AS_UNICODE(uniobj); 7145 *fillcharloc = unistr[0]; 7146 Py_DECREF(uniobj); 7147 return 1; 7148} 7149 7150PyDoc_STRVAR(center__doc__, 7151 "S.center(width[, fillchar]) -> str\n\ 7152\n\ 7153Return S centered in a string of length width. Padding is\n\ 7154done using the specified fill character (default is a space)"); 7155 7156static PyObject * 7157unicode_center(PyUnicodeObject *self, PyObject *args) 7158{ 7159 Py_ssize_t marg, left; 7160 Py_ssize_t width; 7161 Py_UNICODE fillchar = ' '; 7162 7163 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 7164 return NULL; 7165 7166 if (self->length >= width && PyUnicode_CheckExact(self)) { 7167 Py_INCREF(self); 7168 return (PyObject*) self; 7169 } 7170 7171 marg = width - self->length; 7172 left = marg / 2 + (marg & width & 1); 7173 7174 return (PyObject*) pad(self, left, marg - left, fillchar); 7175} 7176 7177#if 0 7178 7179/* This code should go into some future Unicode collation support 7180 module. The basic comparison should compare ordinals on a naive 7181 basis (this is what Java does and thus Jython too). 
*/ 7182 7183/* speedy UTF-16 code point order comparison */ 7184/* gleaned from: */ 7185/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 7186 7187static short utf16Fixup[32] = 7188{ 7189 0, 0, 0, 0, 0, 0, 0, 0, 7190 0, 0, 0, 0, 0, 0, 0, 0, 7191 0, 0, 0, 0, 0, 0, 0, 0, 7192 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 7193}; 7194 7195static int 7196unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7197{ 7198 Py_ssize_t len1, len2; 7199 7200 Py_UNICODE *s1 = str1->str; 7201 Py_UNICODE *s2 = str2->str; 7202 7203 len1 = str1->length; 7204 len2 = str2->length; 7205 7206 while (len1 > 0 && len2 > 0) { 7207 Py_UNICODE c1, c2; 7208 7209 c1 = *s1++; 7210 c2 = *s2++; 7211 7212 if (c1 > (1<<11) * 26) 7213 c1 += utf16Fixup[c1>>11]; 7214 if (c2 > (1<<11) * 26) 7215 c2 += utf16Fixup[c2>>11]; 7216 /* now c1 and c2 are in UTF-32-compatible order */ 7217 7218 if (c1 != c2) 7219 return (c1 < c2) ? -1 : 1; 7220 7221 len1--; len2--; 7222 } 7223 7224 return (len1 < len2) ? -1 : (len1 != len2); 7225} 7226 7227#else 7228 7229static int 7230unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7231{ 7232 register Py_ssize_t len1, len2; 7233 7234 Py_UNICODE *s1 = str1->str; 7235 Py_UNICODE *s2 = str2->str; 7236 7237 len1 = str1->length; 7238 len2 = str2->length; 7239 7240 while (len1 > 0 && len2 > 0) { 7241 Py_UNICODE c1, c2; 7242 7243 c1 = *s1++; 7244 c2 = *s2++; 7245 7246 if (c1 != c2) 7247 return (c1 < c2) ? -1 : 1; 7248 7249 len1--; len2--; 7250 } 7251 7252 return (len1 < len2) ? 
-1 : (len1 != len2); 7253} 7254 7255#endif 7256 7257int PyUnicode_Compare(PyObject *left, 7258 PyObject *right) 7259{ 7260 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 7261 return unicode_compare((PyUnicodeObject *)left, 7262 (PyUnicodeObject *)right); 7263 PyErr_Format(PyExc_TypeError, 7264 "Can't compare %.100s and %.100s", 7265 left->ob_type->tp_name, 7266 right->ob_type->tp_name); 7267 return -1; 7268} 7269 7270int 7271PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 7272{ 7273 int i; 7274 Py_UNICODE *id; 7275 assert(PyUnicode_Check(uni)); 7276 id = PyUnicode_AS_UNICODE(uni); 7277 /* Compare Unicode string and source character set string */ 7278 for (i = 0; id[i] && str[i]; i++) 7279 if (id[i] != str[i]) 7280 return ((int)id[i] < (int)str[i]) ? -1 : 1; 7281 /* This check keeps Python strings that end in '\0' from comparing equal 7282 to C strings identical up to that point. */ 7283 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 7284 return 1; /* uni is longer */ 7285 if (str[i]) 7286 return -1; /* str is longer */ 7287 return 0; 7288} 7289 7290 7291#define TEST_COND(cond) \ 7292 ((cond) ? 
Py_True : Py_False) 7293 7294PyObject *PyUnicode_RichCompare(PyObject *left, 7295 PyObject *right, 7296 int op) 7297{ 7298 int result; 7299 7300 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 7301 PyObject *v; 7302 if (((PyUnicodeObject *) left)->length != 7303 ((PyUnicodeObject *) right)->length) { 7304 if (op == Py_EQ) { 7305 Py_INCREF(Py_False); 7306 return Py_False; 7307 } 7308 if (op == Py_NE) { 7309 Py_INCREF(Py_True); 7310 return Py_True; 7311 } 7312 } 7313 if (left == right) 7314 result = 0; 7315 else 7316 result = unicode_compare((PyUnicodeObject *)left, 7317 (PyUnicodeObject *)right); 7318 7319 /* Convert the return value to a Boolean */ 7320 switch (op) { 7321 case Py_EQ: 7322 v = TEST_COND(result == 0); 7323 break; 7324 case Py_NE: 7325 v = TEST_COND(result != 0); 7326 break; 7327 case Py_LE: 7328 v = TEST_COND(result <= 0); 7329 break; 7330 case Py_GE: 7331 v = TEST_COND(result >= 0); 7332 break; 7333 case Py_LT: 7334 v = TEST_COND(result == -1); 7335 break; 7336 case Py_GT: 7337 v = TEST_COND(result == 1); 7338 break; 7339 default: 7340 PyErr_BadArgument(); 7341 return NULL; 7342 } 7343 Py_INCREF(v); 7344 return v; 7345 } 7346 7347 Py_INCREF(Py_NotImplemented); 7348 return Py_NotImplemented; 7349} 7350 7351int PyUnicode_Contains(PyObject *container, 7352 PyObject *element) 7353{ 7354 PyObject *str, *sub; 7355 int result; 7356 7357 /* Coerce the two arguments */ 7358 sub = PyUnicode_FromObject(element); 7359 if (!sub) { 7360 PyErr_Format(PyExc_TypeError, 7361 "'in <string>' requires string as left operand, not %s", 7362 element->ob_type->tp_name); 7363 return -1; 7364 } 7365 7366 str = PyUnicode_FromObject(container); 7367 if (!str) { 7368 Py_DECREF(sub); 7369 return -1; 7370 } 7371 7372 result = stringlib_contains_obj(str, sub); 7373 7374 Py_DECREF(str); 7375 Py_DECREF(sub); 7376 7377 return result; 7378} 7379 7380/* Concat to string or Unicode object giving a new Unicode object. 
*/ 7381 7382PyObject *PyUnicode_Concat(PyObject *left, 7383 PyObject *right) 7384{ 7385 PyUnicodeObject *u = NULL, *v = NULL, *w; 7386 7387 /* Coerce the two arguments */ 7388 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7389 if (u == NULL) 7390 goto onError; 7391 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7392 if (v == NULL) 7393 goto onError; 7394 7395 /* Shortcuts */ 7396 if (v == unicode_empty) { 7397 Py_DECREF(v); 7398 return (PyObject *)u; 7399 } 7400 if (u == unicode_empty) { 7401 Py_DECREF(u); 7402 return (PyObject *)v; 7403 } 7404 7405 /* Concat the two Unicode strings */ 7406 w = _PyUnicode_New(u->length + v->length); 7407 if (w == NULL) 7408 goto onError; 7409 Py_UNICODE_COPY(w->str, u->str, u->length); 7410 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7411 7412 Py_DECREF(u); 7413 Py_DECREF(v); 7414 return (PyObject *)w; 7415 7416 onError: 7417 Py_XDECREF(u); 7418 Py_XDECREF(v); 7419 return NULL; 7420} 7421 7422void 7423PyUnicode_Append(PyObject **pleft, PyObject *right) 7424{ 7425 PyObject *new; 7426 if (*pleft == NULL) 7427 return; 7428 if (right == NULL || !PyUnicode_Check(*pleft)) { 7429 Py_DECREF(*pleft); 7430 *pleft = NULL; 7431 return; 7432 } 7433 new = PyUnicode_Concat(*pleft, right); 7434 Py_DECREF(*pleft); 7435 *pleft = new; 7436} 7437 7438void 7439PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7440{ 7441 PyUnicode_Append(pleft, right); 7442 Py_XDECREF(right); 7443} 7444 7445PyDoc_STRVAR(count__doc__, 7446 "S.count(sub[, start[, end]]) -> int\n\ 7447\n\ 7448Return the number of non-overlapping occurrences of substring sub in\n\ 7449string S[start:end]. 
Optional arguments start and end are\n\ 7450interpreted as in slice notation."); 7451 7452static PyObject * 7453unicode_count(PyUnicodeObject *self, PyObject *args) 7454{ 7455 PyUnicodeObject *substring; 7456 Py_ssize_t start = 0; 7457 Py_ssize_t end = PY_SSIZE_T_MAX; 7458 PyObject *result; 7459 7460 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 7461 &start, &end)) 7462 return NULL; 7463 7464 ADJUST_INDICES(start, end, self->length); 7465 result = PyLong_FromSsize_t( 7466 stringlib_count(self->str + start, end - start, 7467 substring->str, substring->length, 7468 PY_SSIZE_T_MAX) 7469 ); 7470 7471 Py_DECREF(substring); 7472 7473 return result; 7474} 7475 7476PyDoc_STRVAR(encode__doc__, 7477 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 7478\n\ 7479Encode S using the codec registered for encoding. Default encoding\n\ 7480is 'utf-8'. errors may be given to set a different error\n\ 7481handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7482a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 7483'xmlcharrefreplace' as well as any other name registered with\n\ 7484codecs.register_error that can handle UnicodeEncodeErrors."); 7485 7486static PyObject * 7487unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7488{ 7489 static char *kwlist[] = {"encoding", "errors", 0}; 7490 char *encoding = NULL; 7491 char *errors = NULL; 7492 7493 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7494 kwlist, &encoding, &errors)) 7495 return NULL; 7496 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7497} 7498 7499PyDoc_STRVAR(expandtabs__doc__, 7500 "S.expandtabs([tabsize]) -> str\n\ 7501\n\ 7502Return a copy of S where all tab characters are expanded using spaces.\n\ 7503If tabsize is not given, a tab size of 8 characters is assumed."); 7504 7505static PyObject* 7506unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7507{ 7508 Py_UNICODE *e; 7509 Py_UNICODE *p; 7510 Py_UNICODE *q; 7511 Py_UNICODE *qe; 7512 Py_ssize_t i, j, incr; 7513 PyUnicodeObject *u; 7514 int tabsize = 8; 7515 7516 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7517 return NULL; 7518 7519 /* First pass: determine size of output string */ 7520 i = 0; /* chars up to and including most recent \n or \r */ 7521 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7522 e = self->str + self->length; /* end of input */ 7523 for (p = self->str; p < e; p++) 7524 if (*p == '\t') { 7525 if (tabsize > 0) { 7526 incr = tabsize - (j % tabsize); /* cannot overflow */ 7527 if (j > PY_SSIZE_T_MAX - incr) 7528 goto overflow1; 7529 j += incr; 7530 } 7531 } 7532 else { 7533 if (j > PY_SSIZE_T_MAX - 1) 7534 goto overflow1; 7535 j++; 7536 if (*p == '\n' || *p == '\r') { 7537 if (i > PY_SSIZE_T_MAX - j) 7538 goto overflow1; 7539 i += j; 7540 j = 0; 7541 } 7542 } 7543 7544 if (i > PY_SSIZE_T_MAX - j) 7545 goto overflow1; 7546 7547 /* Second pass: create output string and fill it */ 
7548 u = _PyUnicode_New(i + j); 7549 if (!u) 7550 return NULL; 7551 7552 j = 0; /* same as in first pass */ 7553 q = u->str; /* next output char */ 7554 qe = u->str + u->length; /* end of output */ 7555 7556 for (p = self->str; p < e; p++) 7557 if (*p == '\t') { 7558 if (tabsize > 0) { 7559 i = tabsize - (j % tabsize); 7560 j += i; 7561 while (i--) { 7562 if (q >= qe) 7563 goto overflow2; 7564 *q++ = ' '; 7565 } 7566 } 7567 } 7568 else { 7569 if (q >= qe) 7570 goto overflow2; 7571 *q++ = *p; 7572 j++; 7573 if (*p == '\n' || *p == '\r') 7574 j = 0; 7575 } 7576 7577 return (PyObject*) u; 7578 7579 overflow2: 7580 Py_DECREF(u); 7581 overflow1: 7582 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7583 return NULL; 7584} 7585 7586PyDoc_STRVAR(find__doc__, 7587 "S.find(sub[, start[, end]]) -> int\n\ 7588\n\ 7589Return the lowest index in S where substring sub is found,\n\ 7590such that sub is contained within S[start:end]. Optional\n\ 7591arguments start and end are interpreted as in slice notation.\n\ 7592\n\ 7593Return -1 on failure."); 7594 7595static PyObject * 7596unicode_find(PyUnicodeObject *self, PyObject *args) 7597{ 7598 PyUnicodeObject *substring; 7599 Py_ssize_t start; 7600 Py_ssize_t end; 7601 Py_ssize_t result; 7602 7603 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 7604 &start, &end)) 7605 return NULL; 7606 7607 result = stringlib_find_slice( 7608 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7609 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7610 start, end 7611 ); 7612 7613 Py_DECREF(substring); 7614 7615 return PyLong_FromSsize_t(result); 7616} 7617 7618static PyObject * 7619unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7620{ 7621 if (index < 0 || index >= self->length) { 7622 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7623 return NULL; 7624 } 7625 7626 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7627} 7628 7629/* Believe it or not, this 
produces the same value for ASCII strings 7630 as string_hash(). */ 7631static Py_hash_t 7632unicode_hash(PyUnicodeObject *self) 7633{ 7634 Py_ssize_t len; 7635 Py_UNICODE *p; 7636 Py_hash_t x; 7637 7638 if (self->hash != -1) 7639 return self->hash; 7640 len = Py_SIZE(self); 7641 p = self->str; 7642 x = *p << 7; 7643 while (--len >= 0) 7644 x = (1000003*x) ^ *p++; 7645 x ^= Py_SIZE(self); 7646 if (x == -1) 7647 x = -2; 7648 self->hash = x; 7649 return x; 7650} 7651 7652PyDoc_STRVAR(index__doc__, 7653 "S.index(sub[, start[, end]]) -> int\n\ 7654\n\ 7655Like S.find() but raise ValueError when the substring is not found."); 7656 7657static PyObject * 7658unicode_index(PyUnicodeObject *self, PyObject *args) 7659{ 7660 Py_ssize_t result; 7661 PyUnicodeObject *substring; 7662 Py_ssize_t start; 7663 Py_ssize_t end; 7664 7665 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 7666 &start, &end)) 7667 return NULL; 7668 7669 result = stringlib_find_slice( 7670 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7671 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7672 start, end 7673 ); 7674 7675 Py_DECREF(substring); 7676 7677 if (result < 0) { 7678 PyErr_SetString(PyExc_ValueError, "substring not found"); 7679 return NULL; 7680 } 7681 7682 return PyLong_FromSsize_t(result); 7683} 7684 7685PyDoc_STRVAR(islower__doc__, 7686 "S.islower() -> bool\n\ 7687\n\ 7688Return True if all cased characters in S are lowercase and there is\n\ 7689at least one cased character in S, False otherwise."); 7690 7691static PyObject* 7692unicode_islower(PyUnicodeObject *self) 7693{ 7694 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7695 register const Py_UNICODE *e; 7696 int cased; 7697 7698 /* Shortcut for single character strings */ 7699 if (PyUnicode_GET_SIZE(self) == 1) 7700 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7701 7702 /* Special case for empty strings */ 7703 if (PyUnicode_GET_SIZE(self) == 0) 7704 return PyBool_FromLong(0); 7705 
7706 e = p + PyUnicode_GET_SIZE(self); 7707 cased = 0; 7708 for (; p < e; p++) { 7709 register const Py_UNICODE ch = *p; 7710 7711 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7712 return PyBool_FromLong(0); 7713 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7714 cased = 1; 7715 } 7716 return PyBool_FromLong(cased); 7717} 7718 7719PyDoc_STRVAR(isupper__doc__, 7720 "S.isupper() -> bool\n\ 7721\n\ 7722Return True if all cased characters in S are uppercase and there is\n\ 7723at least one cased character in S, False otherwise."); 7724 7725static PyObject* 7726unicode_isupper(PyUnicodeObject *self) 7727{ 7728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7729 register const Py_UNICODE *e; 7730 int cased; 7731 7732 /* Shortcut for single character strings */ 7733 if (PyUnicode_GET_SIZE(self) == 1) 7734 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7735 7736 /* Special case for empty strings */ 7737 if (PyUnicode_GET_SIZE(self) == 0) 7738 return PyBool_FromLong(0); 7739 7740 e = p + PyUnicode_GET_SIZE(self); 7741 cased = 0; 7742 for (; p < e; p++) { 7743 register const Py_UNICODE ch = *p; 7744 7745 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7746 return PyBool_FromLong(0); 7747 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7748 cased = 1; 7749 } 7750 return PyBool_FromLong(cased); 7751} 7752 7753PyDoc_STRVAR(istitle__doc__, 7754 "S.istitle() -> bool\n\ 7755\n\ 7756Return True if S is a titlecased string and there is at least one\n\ 7757character in S, i.e. 
upper- and titlecase characters may only\n\ 7758follow uncased characters and lowercase characters only cased ones.\n\ 7759Return False otherwise."); 7760 7761static PyObject* 7762unicode_istitle(PyUnicodeObject *self) 7763{ 7764 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7765 register const Py_UNICODE *e; 7766 int cased, previous_is_cased; 7767 7768 /* Shortcut for single character strings */ 7769 if (PyUnicode_GET_SIZE(self) == 1) 7770 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7771 (Py_UNICODE_ISUPPER(*p) != 0)); 7772 7773 /* Special case for empty strings */ 7774 if (PyUnicode_GET_SIZE(self) == 0) 7775 return PyBool_FromLong(0); 7776 7777 e = p + PyUnicode_GET_SIZE(self); 7778 cased = 0; 7779 previous_is_cased = 0; 7780 for (; p < e; p++) { 7781 register const Py_UNICODE ch = *p; 7782 7783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7784 if (previous_is_cased) 7785 return PyBool_FromLong(0); 7786 previous_is_cased = 1; 7787 cased = 1; 7788 } 7789 else if (Py_UNICODE_ISLOWER(ch)) { 7790 if (!previous_is_cased) 7791 return PyBool_FromLong(0); 7792 previous_is_cased = 1; 7793 cased = 1; 7794 } 7795 else 7796 previous_is_cased = 0; 7797 } 7798 return PyBool_FromLong(cased); 7799} 7800 7801PyDoc_STRVAR(isspace__doc__, 7802 "S.isspace() -> bool\n\ 7803\n\ 7804Return True if all characters in S are whitespace\n\ 7805and there is at least one character in S, False otherwise."); 7806 7807static PyObject* 7808unicode_isspace(PyUnicodeObject *self) 7809{ 7810 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7811 register const Py_UNICODE *e; 7812 7813 /* Shortcut for single character strings */ 7814 if (PyUnicode_GET_SIZE(self) == 1 && 7815 Py_UNICODE_ISSPACE(*p)) 7816 return PyBool_FromLong(1); 7817 7818 /* Special case for empty strings */ 7819 if (PyUnicode_GET_SIZE(self) == 0) 7820 return PyBool_FromLong(0); 7821 7822 e = p + PyUnicode_GET_SIZE(self); 7823 for (; p < e; p++) { 7824 if (!Py_UNICODE_ISSPACE(*p)) 7825 
return PyBool_FromLong(0); 7826 } 7827 return PyBool_FromLong(1); 7828} 7829 7830PyDoc_STRVAR(isalpha__doc__, 7831 "S.isalpha() -> bool\n\ 7832\n\ 7833Return True if all characters in S are alphabetic\n\ 7834and there is at least one character in S, False otherwise."); 7835 7836static PyObject* 7837unicode_isalpha(PyUnicodeObject *self) 7838{ 7839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7840 register const Py_UNICODE *e; 7841 7842 /* Shortcut for single character strings */ 7843 if (PyUnicode_GET_SIZE(self) == 1 && 7844 Py_UNICODE_ISALPHA(*p)) 7845 return PyBool_FromLong(1); 7846 7847 /* Special case for empty strings */ 7848 if (PyUnicode_GET_SIZE(self) == 0) 7849 return PyBool_FromLong(0); 7850 7851 e = p + PyUnicode_GET_SIZE(self); 7852 for (; p < e; p++) { 7853 if (!Py_UNICODE_ISALPHA(*p)) 7854 return PyBool_FromLong(0); 7855 } 7856 return PyBool_FromLong(1); 7857} 7858 7859PyDoc_STRVAR(isalnum__doc__, 7860 "S.isalnum() -> bool\n\ 7861\n\ 7862Return True if all characters in S are alphanumeric\n\ 7863and there is at least one character in S, False otherwise."); 7864 7865static PyObject* 7866unicode_isalnum(PyUnicodeObject *self) 7867{ 7868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7869 register const Py_UNICODE *e; 7870 7871 /* Shortcut for single character strings */ 7872 if (PyUnicode_GET_SIZE(self) == 1 && 7873 Py_UNICODE_ISALNUM(*p)) 7874 return PyBool_FromLong(1); 7875 7876 /* Special case for empty strings */ 7877 if (PyUnicode_GET_SIZE(self) == 0) 7878 return PyBool_FromLong(0); 7879 7880 e = p + PyUnicode_GET_SIZE(self); 7881 for (; p < e; p++) { 7882 if (!Py_UNICODE_ISALNUM(*p)) 7883 return PyBool_FromLong(0); 7884 } 7885 return PyBool_FromLong(1); 7886} 7887 7888PyDoc_STRVAR(isdecimal__doc__, 7889 "S.isdecimal() -> bool\n\ 7890\n\ 7891Return True if there are only decimal characters in S,\n\ 7892False otherwise."); 7893 7894static PyObject* 7895unicode_isdecimal(PyUnicodeObject *self) 7896{ 7897 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7898 register const Py_UNICODE *e; 7899 7900 /* Shortcut for single character strings */ 7901 if (PyUnicode_GET_SIZE(self) == 1 && 7902 Py_UNICODE_ISDECIMAL(*p)) 7903 return PyBool_FromLong(1); 7904 7905 /* Special case for empty strings */ 7906 if (PyUnicode_GET_SIZE(self) == 0) 7907 return PyBool_FromLong(0); 7908 7909 e = p + PyUnicode_GET_SIZE(self); 7910 for (; p < e; p++) { 7911 if (!Py_UNICODE_ISDECIMAL(*p)) 7912 return PyBool_FromLong(0); 7913 } 7914 return PyBool_FromLong(1); 7915} 7916 7917PyDoc_STRVAR(isdigit__doc__, 7918 "S.isdigit() -> bool\n\ 7919\n\ 7920Return True if all characters in S are digits\n\ 7921and there is at least one character in S, False otherwise."); 7922 7923static PyObject* 7924unicode_isdigit(PyUnicodeObject *self) 7925{ 7926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7927 register const Py_UNICODE *e; 7928 7929 /* Shortcut for single character strings */ 7930 if (PyUnicode_GET_SIZE(self) == 1 && 7931 Py_UNICODE_ISDIGIT(*p)) 7932 return PyBool_FromLong(1); 7933 7934 /* Special case for empty strings */ 7935 if (PyUnicode_GET_SIZE(self) == 0) 7936 return PyBool_FromLong(0); 7937 7938 e = p + PyUnicode_GET_SIZE(self); 7939 for (; p < e; p++) { 7940 if (!Py_UNICODE_ISDIGIT(*p)) 7941 return PyBool_FromLong(0); 7942 } 7943 return PyBool_FromLong(1); 7944} 7945 7946PyDoc_STRVAR(isnumeric__doc__, 7947 "S.isnumeric() -> bool\n\ 7948\n\ 7949Return True if there are only numeric characters in S,\n\ 7950False otherwise."); 7951 7952static PyObject* 7953unicode_isnumeric(PyUnicodeObject *self) 7954{ 7955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7956 register const Py_UNICODE *e; 7957 7958 /* Shortcut for single character strings */ 7959 if (PyUnicode_GET_SIZE(self) == 1 && 7960 Py_UNICODE_ISNUMERIC(*p)) 7961 return PyBool_FromLong(1); 7962 7963 /* Special case for empty strings */ 7964 if (PyUnicode_GET_SIZE(self) == 0) 7965 return PyBool_FromLong(0); 7966 7967 e = p 
+ PyUnicode_GET_SIZE(self); 7968 for (; p < e; p++) { 7969 if (!Py_UNICODE_ISNUMERIC(*p)) 7970 return PyBool_FromLong(0); 7971 } 7972 return PyBool_FromLong(1); 7973} 7974 7975int 7976PyUnicode_IsIdentifier(PyObject *self) 7977{ 7978 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7979 register const Py_UNICODE *e; 7980 7981 /* Special case for empty strings */ 7982 if (PyUnicode_GET_SIZE(self) == 0) 7983 return 0; 7984 7985 /* PEP 3131 says that the first character must be in 7986 XID_Start and subsequent characters in XID_Continue, 7987 and for the ASCII range, the 2.x rules apply (i.e 7988 start with letters and underscore, continue with 7989 letters, digits, underscore). However, given the current 7990 definition of XID_Start and XID_Continue, it is sufficient 7991 to check just for these, except that _ must be allowed 7992 as starting an identifier. */ 7993 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7994 return 0; 7995 7996 e = p + PyUnicode_GET_SIZE(self); 7997 for (p++; p < e; p++) { 7998 if (!_PyUnicode_IsXidContinue(*p)) 7999 return 0; 8000 } 8001 return 1; 8002} 8003 8004PyDoc_STRVAR(isidentifier__doc__, 8005 "S.isidentifier() -> bool\n\ 8006\n\ 8007Return True if S is a valid identifier according\n\ 8008to the language definition."); 8009 8010static PyObject* 8011unicode_isidentifier(PyObject *self) 8012{ 8013 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 8014} 8015 8016PyDoc_STRVAR(isprintable__doc__, 8017 "S.isprintable() -> bool\n\ 8018\n\ 8019Return True if all characters in S are considered\n\ 8020printable in repr() or S is empty, False otherwise."); 8021 8022static PyObject* 8023unicode_isprintable(PyObject *self) 8024{ 8025 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 8026 register const Py_UNICODE *e; 8027 8028 /* Shortcut for single character strings */ 8029 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 8030 Py_RETURN_TRUE; 8031 } 8032 8033 e = p + 
PyUnicode_GET_SIZE(self); 8034 for (; p < e; p++) { 8035 if (!Py_UNICODE_ISPRINTABLE(*p)) { 8036 Py_RETURN_FALSE; 8037 } 8038 } 8039 Py_RETURN_TRUE; 8040} 8041 8042PyDoc_STRVAR(join__doc__, 8043 "S.join(iterable) -> str\n\ 8044\n\ 8045Return a string which is the concatenation of the strings in the\n\ 8046iterable. The separator between elements is S."); 8047 8048static PyObject* 8049unicode_join(PyObject *self, PyObject *data) 8050{ 8051 return PyUnicode_Join(self, data); 8052} 8053 8054static Py_ssize_t 8055unicode_length(PyUnicodeObject *self) 8056{ 8057 return self->length; 8058} 8059 8060PyDoc_STRVAR(ljust__doc__, 8061 "S.ljust(width[, fillchar]) -> str\n\ 8062\n\ 8063Return S left-justified in a Unicode string of length width. Padding is\n\ 8064done using the specified fill character (default is a space)."); 8065 8066static PyObject * 8067unicode_ljust(PyUnicodeObject *self, PyObject *args) 8068{ 8069 Py_ssize_t width; 8070 Py_UNICODE fillchar = ' '; 8071 8072 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 8073 return NULL; 8074 8075 if (self->length >= width && PyUnicode_CheckExact(self)) { 8076 Py_INCREF(self); 8077 return (PyObject*) self; 8078 } 8079 8080 return (PyObject*) pad(self, 0, width - self->length, fillchar); 8081} 8082 8083PyDoc_STRVAR(lower__doc__, 8084 "S.lower() -> str\n\ 8085\n\ 8086Return a copy of the string S converted to lowercase."); 8087 8088static PyObject* 8089unicode_lower(PyUnicodeObject *self) 8090{ 8091 return fixup(self, fixlower); 8092} 8093 8094#define LEFTSTRIP 0 8095#define RIGHTSTRIP 1 8096#define BOTHSTRIP 2 8097 8098/* Arrays indexed by above */ 8099static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 8100 8101#define STRIPNAME(i) (stripformat[i]+3) 8102 8103/* externally visible for str.strip(unicode) */ 8104PyObject * 8105_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 8106{ 8107 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8108 Py_ssize_t len = 
PyUnicode_GET_SIZE(self); 8109 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 8110 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 8111 Py_ssize_t i, j; 8112 8113 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 8114 8115 i = 0; 8116 if (striptype != RIGHTSTRIP) { 8117 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 8118 i++; 8119 } 8120 } 8121 8122 j = len; 8123 if (striptype != LEFTSTRIP) { 8124 do { 8125 j--; 8126 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 8127 j++; 8128 } 8129 8130 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8131 Py_INCREF(self); 8132 return (PyObject*)self; 8133 } 8134 else 8135 return PyUnicode_FromUnicode(s+i, j-i); 8136} 8137 8138 8139static PyObject * 8140do_strip(PyUnicodeObject *self, int striptype) 8141{ 8142 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8143 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 8144 8145 i = 0; 8146 if (striptype != RIGHTSTRIP) { 8147 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 8148 i++; 8149 } 8150 } 8151 8152 j = len; 8153 if (striptype != LEFTSTRIP) { 8154 do { 8155 j--; 8156 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 8157 j++; 8158 } 8159 8160 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8161 Py_INCREF(self); 8162 return (PyObject*)self; 8163 } 8164 else 8165 return PyUnicode_FromUnicode(s+i, j-i); 8166} 8167 8168 8169static PyObject * 8170do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 8171{ 8172 PyObject *sep = NULL; 8173 8174 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 8175 return NULL; 8176 8177 if (sep != NULL && sep != Py_None) { 8178 if (PyUnicode_Check(sep)) 8179 return _PyUnicode_XStrip(self, striptype, sep); 8180 else { 8181 PyErr_Format(PyExc_TypeError, 8182 "%s arg must be None or str", 8183 STRIPNAME(striptype)); 8184 return NULL; 8185 } 8186 } 8187 8188 return do_strip(self, striptype); 8189} 8190 8191 8192PyDoc_STRVAR(strip__doc__, 8193 "S.strip([chars]) -> str\n\ 8194\n\ 8195Return a 
copy of the string S with leading and trailing\n\ 8196whitespace removed.\n\ 8197If chars is given and not None, remove characters in chars instead."); 8198 8199static PyObject * 8200unicode_strip(PyUnicodeObject *self, PyObject *args) 8201{ 8202 if (PyTuple_GET_SIZE(args) == 0) 8203 return do_strip(self, BOTHSTRIP); /* Common case */ 8204 else 8205 return do_argstrip(self, BOTHSTRIP, args); 8206} 8207 8208 8209PyDoc_STRVAR(lstrip__doc__, 8210 "S.lstrip([chars]) -> str\n\ 8211\n\ 8212Return a copy of the string S with leading whitespace removed.\n\ 8213If chars is given and not None, remove characters in chars instead."); 8214 8215static PyObject * 8216unicode_lstrip(PyUnicodeObject *self, PyObject *args) 8217{ 8218 if (PyTuple_GET_SIZE(args) == 0) 8219 return do_strip(self, LEFTSTRIP); /* Common case */ 8220 else 8221 return do_argstrip(self, LEFTSTRIP, args); 8222} 8223 8224 8225PyDoc_STRVAR(rstrip__doc__, 8226 "S.rstrip([chars]) -> str\n\ 8227\n\ 8228Return a copy of the string S with trailing whitespace removed.\n\ 8229If chars is given and not None, remove characters in chars instead."); 8230 8231static PyObject * 8232unicode_rstrip(PyUnicodeObject *self, PyObject *args) 8233{ 8234 if (PyTuple_GET_SIZE(args) == 0) 8235 return do_strip(self, RIGHTSTRIP); /* Common case */ 8236 else 8237 return do_argstrip(self, RIGHTSTRIP, args); 8238} 8239 8240 8241static PyObject* 8242unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 8243{ 8244 PyUnicodeObject *u; 8245 Py_UNICODE *p; 8246 Py_ssize_t nchars; 8247 size_t nbytes; 8248 8249 if (len < 1) { 8250 Py_INCREF(unicode_empty); 8251 return (PyObject *)unicode_empty; 8252 } 8253 8254 if (len == 1 && PyUnicode_CheckExact(str)) { 8255 /* no repeat, return original string */ 8256 Py_INCREF(str); 8257 return (PyObject*) str; 8258 } 8259 8260 /* ensure # of chars needed doesn't overflow int and # of bytes 8261 * needed doesn't overflow size_t 8262 */ 8263 nchars = len * str->length; 8264 if (nchars / len != str->length) { 
8265 PyErr_SetString(PyExc_OverflowError, 8266 "repeated string is too long"); 8267 return NULL; 8268 } 8269 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 8270 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 8271 PyErr_SetString(PyExc_OverflowError, 8272 "repeated string is too long"); 8273 return NULL; 8274 } 8275 u = _PyUnicode_New(nchars); 8276 if (!u) 8277 return NULL; 8278 8279 p = u->str; 8280 8281 if (str->length == 1) { 8282 Py_UNICODE_FILL(p, str->str[0], len); 8283 } else { 8284 Py_ssize_t done = str->length; /* number of characters copied this far */ 8285 Py_UNICODE_COPY(p, str->str, str->length); 8286 while (done < nchars) { 8287 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 8288 Py_UNICODE_COPY(p+done, p, n); 8289 done += n; 8290 } 8291 } 8292 8293 return (PyObject*) u; 8294} 8295 8296PyObject *PyUnicode_Replace(PyObject *obj, 8297 PyObject *subobj, 8298 PyObject *replobj, 8299 Py_ssize_t maxcount) 8300{ 8301 PyObject *self; 8302 PyObject *str1; 8303 PyObject *str2; 8304 PyObject *result; 8305 8306 self = PyUnicode_FromObject(obj); 8307 if (self == NULL) 8308 return NULL; 8309 str1 = PyUnicode_FromObject(subobj); 8310 if (str1 == NULL) { 8311 Py_DECREF(self); 8312 return NULL; 8313 } 8314 str2 = PyUnicode_FromObject(replobj); 8315 if (str2 == NULL) { 8316 Py_DECREF(self); 8317 Py_DECREF(str1); 8318 return NULL; 8319 } 8320 result = replace((PyUnicodeObject *)self, 8321 (PyUnicodeObject *)str1, 8322 (PyUnicodeObject *)str2, 8323 maxcount); 8324 Py_DECREF(self); 8325 Py_DECREF(str1); 8326 Py_DECREF(str2); 8327 return result; 8328} 8329 8330PyDoc_STRVAR(replace__doc__, 8331 "S.replace(old, new[, count]) -> str\n\ 8332\n\ 8333Return a copy of S with all occurrences of substring\n\ 8334old replaced by new. 
If the optional argument count is\n\ 8335given, only the first count occurrences are replaced."); 8336 8337static PyObject* 8338unicode_replace(PyUnicodeObject *self, PyObject *args) 8339{ 8340 PyUnicodeObject *str1; 8341 PyUnicodeObject *str2; 8342 Py_ssize_t maxcount = -1; 8343 PyObject *result; 8344 8345 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8346 return NULL; 8347 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8348 if (str1 == NULL) 8349 return NULL; 8350 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8351 if (str2 == NULL) { 8352 Py_DECREF(str1); 8353 return NULL; 8354 } 8355 8356 result = replace(self, str1, str2, maxcount); 8357 8358 Py_DECREF(str1); 8359 Py_DECREF(str2); 8360 return result; 8361} 8362 8363static 8364PyObject *unicode_repr(PyObject *unicode) 8365{ 8366 PyObject *repr; 8367 Py_UNICODE *p; 8368 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8369 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8370 8371 /* XXX(nnorwitz): rather than over-allocating, it would be 8372 better to choose a different scheme. Perhaps scan the 8373 first N-chars of the string and allocate based on that size. 8374 */ 8375 /* Initial allocation is based on the longest-possible unichr 8376 escape. 8377 8378 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8379 unichr, so in this case it's the longest unichr escape. In 8380 narrow (UTF-16) builds this is five chars per source unichr 8381 since there are two unichrs in the surrogate pair, so in narrow 8382 (UTF-16) builds it's not the longest unichr escape. 8383 8384 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8385 so in the narrow (UTF-16) build case it's the longest unichr 8386 escape. 
8387 */ 8388 8389 repr = PyUnicode_FromUnicode(NULL, 8390 2 /* quotes */ 8391#ifdef Py_UNICODE_WIDE 8392 + 10*size 8393#else 8394 + 6*size 8395#endif 8396 + 1); 8397 if (repr == NULL) 8398 return NULL; 8399 8400 p = PyUnicode_AS_UNICODE(repr); 8401 8402 /* Add quote */ 8403 *p++ = (findchar(s, size, '\'') && 8404 !findchar(s, size, '"')) ? '"' : '\''; 8405 while (size-- > 0) { 8406 Py_UNICODE ch = *s++; 8407 8408 /* Escape quotes and backslashes */ 8409 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8410 *p++ = '\\'; 8411 *p++ = ch; 8412 continue; 8413 } 8414 8415 /* Map special whitespace to '\t', \n', '\r' */ 8416 if (ch == '\t') { 8417 *p++ = '\\'; 8418 *p++ = 't'; 8419 } 8420 else if (ch == '\n') { 8421 *p++ = '\\'; 8422 *p++ = 'n'; 8423 } 8424 else if (ch == '\r') { 8425 *p++ = '\\'; 8426 *p++ = 'r'; 8427 } 8428 8429 /* Map non-printable US ASCII to '\xhh' */ 8430 else if (ch < ' ' || ch == 0x7F) { 8431 *p++ = '\\'; 8432 *p++ = 'x'; 8433 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8434 *p++ = hexdigits[ch & 0x000F]; 8435 } 8436 8437 /* Copy ASCII characters as-is */ 8438 else if (ch < 0x7F) { 8439 *p++ = ch; 8440 } 8441 8442 /* Non-ASCII characters */ 8443 else { 8444 Py_UCS4 ucs = ch; 8445 8446#ifndef Py_UNICODE_WIDE 8447 Py_UNICODE ch2 = 0; 8448 /* Get code point from surrogate pair */ 8449 if (size > 0) { 8450 ch2 = *s; 8451 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8452 && ch2 <= 0xDFFF) { 8453 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8454 + 0x00010000; 8455 s++; 8456 size--; 8457 } 8458 } 8459#endif 8460 /* Map Unicode whitespace and control characters 8461 (categories Z* and C* except ASCII space) 8462 */ 8463 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8464 /* Map 8-bit characters to '\xhh' */ 8465 if (ucs <= 0xff) { 8466 *p++ = '\\'; 8467 *p++ = 'x'; 8468 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8469 *p++ = hexdigits[ch & 0x000F]; 8470 } 8471 /* Map 21-bit characters to '\U00xxxxxx' */ 8472 else if (ucs >= 0x10000) { 8473 *p++ = '\\'; 8474 
*p++ = 'U'; 8475 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8476 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8477 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8478 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8479 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8480 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8481 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8482 *p++ = hexdigits[ucs & 0x0000000F]; 8483 } 8484 /* Map 16-bit characters to '\uxxxx' */ 8485 else { 8486 *p++ = '\\'; 8487 *p++ = 'u'; 8488 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8489 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8490 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8491 *p++ = hexdigits[ucs & 0x000F]; 8492 } 8493 } 8494 /* Copy characters as-is */ 8495 else { 8496 *p++ = ch; 8497#ifndef Py_UNICODE_WIDE 8498 if (ucs >= 0x10000) 8499 *p++ = ch2; 8500#endif 8501 } 8502 } 8503 } 8504 /* Add quote */ 8505 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8506 8507 *p = '\0'; 8508 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8509 return repr; 8510} 8511 8512PyDoc_STRVAR(rfind__doc__, 8513 "S.rfind(sub[, start[, end]]) -> int\n\ 8514\n\ 8515Return the highest index in S where substring sub is found,\n\ 8516such that sub is contained within S[start:end]. 
Optional\n\ 8517arguments start and end are interpreted as in slice notation.\n\ 8518\n\ 8519Return -1 on failure."); 8520 8521static PyObject * 8522unicode_rfind(PyUnicodeObject *self, PyObject *args) 8523{ 8524 PyUnicodeObject *substring; 8525 Py_ssize_t start; 8526 Py_ssize_t end; 8527 Py_ssize_t result; 8528 8529 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 8530 &start, &end)) 8531 return NULL; 8532 8533 result = stringlib_rfind_slice( 8534 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8535 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8536 start, end 8537 ); 8538 8539 Py_DECREF(substring); 8540 8541 return PyLong_FromSsize_t(result); 8542} 8543 8544PyDoc_STRVAR(rindex__doc__, 8545 "S.rindex(sub[, start[, end]]) -> int\n\ 8546\n\ 8547Like S.rfind() but raise ValueError when the substring is not found."); 8548 8549static PyObject * 8550unicode_rindex(PyUnicodeObject *self, PyObject *args) 8551{ 8552 PyUnicodeObject *substring; 8553 Py_ssize_t start; 8554 Py_ssize_t end; 8555 Py_ssize_t result; 8556 8557 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 8558 &start, &end)) 8559 return NULL; 8560 8561 result = stringlib_rfind_slice( 8562 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8563 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8564 start, end 8565 ); 8566 8567 Py_DECREF(substring); 8568 8569 if (result < 0) { 8570 PyErr_SetString(PyExc_ValueError, "substring not found"); 8571 return NULL; 8572 } 8573 return PyLong_FromSsize_t(result); 8574} 8575 8576PyDoc_STRVAR(rjust__doc__, 8577 "S.rjust(width[, fillchar]) -> str\n\ 8578\n\ 8579Return S right-justified in a string of length width. 
Padding is\n\ 8580done using the specified fill character (default is a space)."); 8581 8582static PyObject * 8583unicode_rjust(PyUnicodeObject *self, PyObject *args) 8584{ 8585 Py_ssize_t width; 8586 Py_UNICODE fillchar = ' '; 8587 8588 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8589 return NULL; 8590 8591 if (self->length >= width && PyUnicode_CheckExact(self)) { 8592 Py_INCREF(self); 8593 return (PyObject*) self; 8594 } 8595 8596 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8597} 8598 8599PyObject *PyUnicode_Split(PyObject *s, 8600 PyObject *sep, 8601 Py_ssize_t maxsplit) 8602{ 8603 PyObject *result; 8604 8605 s = PyUnicode_FromObject(s); 8606 if (s == NULL) 8607 return NULL; 8608 if (sep != NULL) { 8609 sep = PyUnicode_FromObject(sep); 8610 if (sep == NULL) { 8611 Py_DECREF(s); 8612 return NULL; 8613 } 8614 } 8615 8616 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8617 8618 Py_DECREF(s); 8619 Py_XDECREF(sep); 8620 return result; 8621} 8622 8623PyDoc_STRVAR(split__doc__, 8624 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8625\n\ 8626Return a list of the words in S, using sep as the\n\ 8627delimiter string. If maxsplit is given, at most maxsplit\n\ 8628splits are done. 
If sep is not specified or is None, any\n\ 8629whitespace string is a separator and empty strings are\n\ 8630removed from the result."); 8631 8632static PyObject* 8633unicode_split(PyUnicodeObject *self, PyObject *args) 8634{ 8635 PyObject *substring = Py_None; 8636 Py_ssize_t maxcount = -1; 8637 8638 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8639 return NULL; 8640 8641 if (substring == Py_None) 8642 return split(self, NULL, maxcount); 8643 else if (PyUnicode_Check(substring)) 8644 return split(self, (PyUnicodeObject *)substring, maxcount); 8645 else 8646 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8647} 8648 8649PyObject * 8650PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8651{ 8652 PyObject* str_obj; 8653 PyObject* sep_obj; 8654 PyObject* out; 8655 8656 str_obj = PyUnicode_FromObject(str_in); 8657 if (!str_obj) 8658 return NULL; 8659 sep_obj = PyUnicode_FromObject(sep_in); 8660 if (!sep_obj) { 8661 Py_DECREF(str_obj); 8662 return NULL; 8663 } 8664 8665 out = stringlib_partition( 8666 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8667 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8668 ); 8669 8670 Py_DECREF(sep_obj); 8671 Py_DECREF(str_obj); 8672 8673 return out; 8674} 8675 8676 8677PyObject * 8678PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8679{ 8680 PyObject* str_obj; 8681 PyObject* sep_obj; 8682 PyObject* out; 8683 8684 str_obj = PyUnicode_FromObject(str_in); 8685 if (!str_obj) 8686 return NULL; 8687 sep_obj = PyUnicode_FromObject(sep_in); 8688 if (!sep_obj) { 8689 Py_DECREF(str_obj); 8690 return NULL; 8691 } 8692 8693 out = stringlib_rpartition( 8694 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8695 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8696 ); 8697 8698 Py_DECREF(sep_obj); 8699 Py_DECREF(str_obj); 8700 8701 return out; 8702} 8703 8704PyDoc_STRVAR(partition__doc__, 8705 "S.partition(sep) -> (head, 
sep, tail)\n\ 8706\n\ 8707Search for the separator sep in S, and return the part before it,\n\ 8708the separator itself, and the part after it. If the separator is not\n\ 8709found, return S and two empty strings."); 8710 8711static PyObject* 8712unicode_partition(PyUnicodeObject *self, PyObject *separator) 8713{ 8714 return PyUnicode_Partition((PyObject *)self, separator); 8715} 8716 8717PyDoc_STRVAR(rpartition__doc__, 8718 "S.rpartition(sep) -> (head, sep, tail)\n\ 8719\n\ 8720Search for the separator sep in S, starting at the end of S, and return\n\ 8721the part before it, the separator itself, and the part after it. If the\n\ 8722separator is not found, return two empty strings and S."); 8723 8724static PyObject* 8725unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8726{ 8727 return PyUnicode_RPartition((PyObject *)self, separator); 8728} 8729 8730PyObject *PyUnicode_RSplit(PyObject *s, 8731 PyObject *sep, 8732 Py_ssize_t maxsplit) 8733{ 8734 PyObject *result; 8735 8736 s = PyUnicode_FromObject(s); 8737 if (s == NULL) 8738 return NULL; 8739 if (sep != NULL) { 8740 sep = PyUnicode_FromObject(sep); 8741 if (sep == NULL) { 8742 Py_DECREF(s); 8743 return NULL; 8744 } 8745 } 8746 8747 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8748 8749 Py_DECREF(s); 8750 Py_XDECREF(sep); 8751 return result; 8752} 8753 8754PyDoc_STRVAR(rsplit__doc__, 8755 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8756\n\ 8757Return a list of the words in S, using sep as the\n\ 8758delimiter string, starting at the end of the string and\n\ 8759working to the front. If maxsplit is given, at most maxsplit\n\ 8760splits are done. 
If sep is not specified, any whitespace string\n\ 8761is a separator."); 8762 8763static PyObject* 8764unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8765{ 8766 PyObject *substring = Py_None; 8767 Py_ssize_t maxcount = -1; 8768 8769 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8770 return NULL; 8771 8772 if (substring == Py_None) 8773 return rsplit(self, NULL, maxcount); 8774 else if (PyUnicode_Check(substring)) 8775 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8776 else 8777 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8778} 8779 8780PyDoc_STRVAR(splitlines__doc__, 8781 "S.splitlines([keepends]) -> list of strings\n\ 8782\n\ 8783Return a list of the lines in S, breaking at line boundaries.\n\ 8784Line breaks are not included in the resulting list unless keepends\n\ 8785is given and true."); 8786 8787static PyObject* 8788unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8789{ 8790 int keepends = 0; 8791 8792 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8793 return NULL; 8794 8795 return PyUnicode_Splitlines((PyObject *)self, keepends); 8796} 8797 8798static 8799PyObject *unicode_str(PyObject *self) 8800{ 8801 if (PyUnicode_CheckExact(self)) { 8802 Py_INCREF(self); 8803 return self; 8804 } else 8805 /* Subtype -- return genuine unicode string with the same value. 
*/ 8806 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8807 PyUnicode_GET_SIZE(self)); 8808} 8809 8810PyDoc_STRVAR(swapcase__doc__, 8811 "S.swapcase() -> str\n\ 8812\n\ 8813Return a copy of S with uppercase characters converted to lowercase\n\ 8814and vice versa."); 8815 8816static PyObject* 8817unicode_swapcase(PyUnicodeObject *self) 8818{ 8819 return fixup(self, fixswapcase); 8820} 8821 8822PyDoc_STRVAR(maketrans__doc__, 8823 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8824\n\ 8825Return a translation table usable for str.translate().\n\ 8826If there is only one argument, it must be a dictionary mapping Unicode\n\ 8827ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8828Character keys will be then converted to ordinals.\n\ 8829If there are two arguments, they must be strings of equal length, and\n\ 8830in the resulting dictionary, each character in x will be mapped to the\n\ 8831character at the same position in y. If there is a third argument, it\n\ 8832must be a string, whose characters will be mapped to None in the result."); 8833 8834static PyObject* 8835unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8836{ 8837 PyObject *x, *y = NULL, *z = NULL; 8838 PyObject *new = NULL, *key, *value; 8839 Py_ssize_t i = 0; 8840 int res; 8841 8842 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8843 return NULL; 8844 new = PyDict_New(); 8845 if (!new) 8846 return NULL; 8847 if (y != NULL) { 8848 /* x must be a string too, of equal length */ 8849 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8850 if (!PyUnicode_Check(x)) { 8851 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8852 "be a string if there is a second argument"); 8853 goto err; 8854 } 8855 if (PyUnicode_GET_SIZE(x) != ylen) { 8856 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8857 "arguments must have equal length"); 8858 goto err; 8859 } 8860 /* create entries for translating chars in x to those in y */ 8861 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 8862 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8863 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8864 if (!key || !value) 8865 goto err; 8866 res = PyDict_SetItem(new, key, value); 8867 Py_DECREF(key); 8868 Py_DECREF(value); 8869 if (res < 0) 8870 goto err; 8871 } 8872 /* create entries for deleting chars in z */ 8873 if (z != NULL) { 8874 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8875 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8876 if (!key) 8877 goto err; 8878 res = PyDict_SetItem(new, key, Py_None); 8879 Py_DECREF(key); 8880 if (res < 0) 8881 goto err; 8882 } 8883 } 8884 } else { 8885 /* x must be a dict */ 8886 if (!PyDict_CheckExact(x)) { 8887 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8888 "to maketrans it must be a dict"); 8889 goto err; 8890 } 8891 /* copy entries into the new dict, converting string keys to int keys */ 8892 while (PyDict_Next(x, &i, &key, &value)) { 8893 if (PyUnicode_Check(key)) { 8894 /* convert string keys to integer keys */ 8895 PyObject *newkey; 8896 if (PyUnicode_GET_SIZE(key) != 1) { 8897 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8898 "table must be of length 1"); 8899 goto err; 8900 } 8901 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8902 if (!newkey) 8903 goto err; 8904 res = PyDict_SetItem(new, newkey, value); 8905 Py_DECREF(newkey); 8906 if (res < 0) 8907 goto err; 8908 } else if (PyLong_Check(key)) { 8909 /* just keep integer keys */ 8910 if (PyDict_SetItem(new, key, value) < 0) 8911 goto err; 8912 } else { 8913 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8914 "be strings or integers"); 8915 goto err; 8916 } 8917 } 8918 } 8919 return new; 8920 err: 8921 Py_DECREF(new); 8922 return NULL; 8923} 8924 8925PyDoc_STRVAR(translate__doc__, 8926 "S.translate(table) -> str\n\ 8927\n\ 8928Return a copy of the string S, where all characters have been mapped\n\ 8929through the given translation table, 
which must be a mapping of\n\ 8930Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8931Unmapped characters are left untouched. Characters mapped to None\n\ 8932are deleted."); 8933 8934static PyObject* 8935unicode_translate(PyUnicodeObject *self, PyObject *table) 8936{ 8937 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8938} 8939 8940PyDoc_STRVAR(upper__doc__, 8941 "S.upper() -> str\n\ 8942\n\ 8943Return a copy of S converted to uppercase."); 8944 8945static PyObject* 8946unicode_upper(PyUnicodeObject *self) 8947{ 8948 return fixup(self, fixupper); 8949} 8950 8951PyDoc_STRVAR(zfill__doc__, 8952 "S.zfill(width) -> str\n\ 8953\n\ 8954Pad a numeric string S with zeros on the left, to fill a field\n\ 8955of the specified width. The string S is never truncated."); 8956 8957static PyObject * 8958unicode_zfill(PyUnicodeObject *self, PyObject *args) 8959{ 8960 Py_ssize_t fill; 8961 PyUnicodeObject *u; 8962 8963 Py_ssize_t width; 8964 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8965 return NULL; 8966 8967 if (self->length >= width) { 8968 if (PyUnicode_CheckExact(self)) { 8969 Py_INCREF(self); 8970 return (PyObject*) self; 8971 } 8972 else 8973 return PyUnicode_FromUnicode( 8974 PyUnicode_AS_UNICODE(self), 8975 PyUnicode_GET_SIZE(self) 8976 ); 8977 } 8978 8979 fill = width - self->length; 8980 8981 u = pad(self, fill, 0, '0'); 8982 8983 if (u == NULL) 8984 return NULL; 8985 8986 if (u->str[fill] == '+' || u->str[fill] == '-') { 8987 /* move sign to beginning of string */ 8988 u->str[0] = u->str[fill]; 8989 u->str[fill] = '0'; 8990 } 8991 8992 return (PyObject*) u; 8993} 8994 8995#if 0 8996static PyObject* 8997unicode_freelistsize(PyUnicodeObject *self) 8998{ 8999 return PyLong_FromLong(numfree); 9000} 9001 9002static PyObject * 9003unicode__decimal2ascii(PyObject *self) 9004{ 9005 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), 9006 PyUnicode_GET_SIZE(self)); 9007} 9008#endif 9009 
9010PyDoc_STRVAR(startswith__doc__, 9011 "S.startswith(prefix[, start[, end]]) -> bool\n\ 9012\n\ 9013Return True if S starts with the specified prefix, False otherwise.\n\ 9014With optional start, test S beginning at that position.\n\ 9015With optional end, stop comparing S at that position.\n\ 9016prefix can also be a tuple of strings to try."); 9017 9018static PyObject * 9019unicode_startswith(PyUnicodeObject *self, 9020 PyObject *args) 9021{ 9022 PyObject *subobj; 9023 PyUnicodeObject *substring; 9024 Py_ssize_t start = 0; 9025 Py_ssize_t end = PY_SSIZE_T_MAX; 9026 int result; 9027 9028 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 9029 return NULL; 9030 if (PyTuple_Check(subobj)) { 9031 Py_ssize_t i; 9032 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9033 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9034 PyTuple_GET_ITEM(subobj, i)); 9035 if (substring == NULL) 9036 return NULL; 9037 result = tailmatch(self, substring, start, end, -1); 9038 Py_DECREF(substring); 9039 if (result) { 9040 Py_RETURN_TRUE; 9041 } 9042 } 9043 /* nothing matched */ 9044 Py_RETURN_FALSE; 9045 } 9046 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9047 if (substring == NULL) { 9048 if (PyErr_ExceptionMatches(PyExc_TypeError)) 9049 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 9050 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 9051 return NULL; 9052 } 9053 result = tailmatch(self, substring, start, end, -1); 9054 Py_DECREF(substring); 9055 return PyBool_FromLong(result); 9056} 9057 9058 9059PyDoc_STRVAR(endswith__doc__, 9060 "S.endswith(suffix[, start[, end]]) -> bool\n\ 9061\n\ 9062Return True if S ends with the specified suffix, False otherwise.\n\ 9063With optional start, test S beginning at that position.\n\ 9064With optional end, stop comparing S at that position.\n\ 9065suffix can also be a tuple of strings to try."); 9066 9067static PyObject * 9068unicode_endswith(PyUnicodeObject *self, 9069 
PyObject *args) 9070{ 9071 PyObject *subobj; 9072 PyUnicodeObject *substring; 9073 Py_ssize_t start = 0; 9074 Py_ssize_t end = PY_SSIZE_T_MAX; 9075 int result; 9076 9077 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 9078 return NULL; 9079 if (PyTuple_Check(subobj)) { 9080 Py_ssize_t i; 9081 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9082 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9083 PyTuple_GET_ITEM(subobj, i)); 9084 if (substring == NULL) 9085 return NULL; 9086 result = tailmatch(self, substring, start, end, +1); 9087 Py_DECREF(substring); 9088 if (result) { 9089 Py_RETURN_TRUE; 9090 } 9091 } 9092 Py_RETURN_FALSE; 9093 } 9094 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9095 if (substring == NULL) { 9096 if (PyErr_ExceptionMatches(PyExc_TypeError)) 9097 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 9098 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 9099 return NULL; 9100 } 9101 result = tailmatch(self, substring, start, end, +1); 9102 Py_DECREF(substring); 9103 return PyBool_FromLong(result); 9104} 9105 9106#include "stringlib/string_format.h" 9107 9108PyDoc_STRVAR(format__doc__, 9109 "S.format(*args, **kwargs) -> str\n\ 9110\n\ 9111Return a formatted version of S, using substitutions from args and kwargs.\n\ 9112The substitutions are identified by braces ('{' and '}')."); 9113 9114PyDoc_STRVAR(format_map__doc__, 9115 "S.format_map(mapping) -> str\n\ 9116\n\ 9117Return a formatted version of S, using substitutions from mapping.\n\ 9118The substitutions are identified by braces ('{' and '}')."); 9119 9120static PyObject * 9121unicode__format__(PyObject* self, PyObject* args) 9122{ 9123 PyObject *format_spec; 9124 9125 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 9126 return NULL; 9127 9128 return _PyUnicode_FormatAdvanced(self, 9129 PyUnicode_AS_UNICODE(format_spec), 9130 PyUnicode_GET_SIZE(format_spec)); 9131} 9132 9133PyDoc_STRVAR(p_format__doc__, 9134 
"S.__format__(format_spec) -> str\n\ 9135\n\ 9136Return a formatted version of S as described by format_spec."); 9137 9138static PyObject * 9139unicode__sizeof__(PyUnicodeObject *v) 9140{ 9141 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 9142 sizeof(Py_UNICODE) * (v->length + 1)); 9143} 9144 9145PyDoc_STRVAR(sizeof__doc__, 9146 "S.__sizeof__() -> size of S in memory, in bytes"); 9147 9148static PyObject * 9149unicode_getnewargs(PyUnicodeObject *v) 9150{ 9151 return Py_BuildValue("(u#)", v->str, v->length); 9152} 9153 9154static PyMethodDef unicode_methods[] = { 9155 9156 /* Order is according to common usage: often used methods should 9157 appear first, since lookup is done sequentially. */ 9158 9159 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 9160 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 9161 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 9162 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 9163 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 9164 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 9165 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 9166 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 9167 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 9168 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 9169 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 9170 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 9171 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 9172 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 9173 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 9174 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 9175 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, 
rfind__doc__}, 9176 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 9177 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 9178 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 9179 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 9180 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 9181 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 9182 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 9183 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 9184 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 9185 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 9186 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 9187 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 9188 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 9189 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 9190 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 9191 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 9192 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 9193 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 9194 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 9195 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 9196 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 9197 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 9198 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 9199 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 9200 {"format_map", (PyCFunction) 
do_string_format_map, METH_O, format_map__doc__}, 9201 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 9202 {"maketrans", (PyCFunction) unicode_maketrans, 9203 METH_VARARGS | METH_STATIC, maketrans__doc__}, 9204 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 9205#if 0 9206 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 9207#endif 9208 9209#if 0 9210 /* These methods are just used for debugging the implementation. */ 9211 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 9212 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 9213#endif 9214 9215 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 9216 {NULL, NULL} 9217}; 9218 9219static PyObject * 9220unicode_mod(PyObject *v, PyObject *w) 9221{ 9222 if (!PyUnicode_Check(v)) { 9223 Py_INCREF(Py_NotImplemented); 9224 return Py_NotImplemented; 9225 } 9226 return PyUnicode_Format(v, w); 9227} 9228 9229static PyNumberMethods unicode_as_number = { 9230 0, /*nb_add*/ 9231 0, /*nb_subtract*/ 9232 0, /*nb_multiply*/ 9233 unicode_mod, /*nb_remainder*/ 9234}; 9235 9236static PySequenceMethods unicode_as_sequence = { 9237 (lenfunc) unicode_length, /* sq_length */ 9238 PyUnicode_Concat, /* sq_concat */ 9239 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 9240 (ssizeargfunc) unicode_getitem, /* sq_item */ 9241 0, /* sq_slice */ 9242 0, /* sq_ass_item */ 9243 0, /* sq_ass_slice */ 9244 PyUnicode_Contains, /* sq_contains */ 9245}; 9246 9247static PyObject* 9248unicode_subscript(PyUnicodeObject* self, PyObject* item) 9249{ 9250 if (PyIndex_Check(item)) { 9251 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 9252 if (i == -1 && PyErr_Occurred()) 9253 return NULL; 9254 if (i < 0) 9255 i += PyUnicode_GET_SIZE(self); 9256 return unicode_getitem(self, i); 9257 } else if (PySlice_Check(item)) { 9258 Py_ssize_t start, stop, step, slicelength, cur, i; 9259 Py_UNICODE* source_buf; 9260 
Py_UNICODE* result_buf; 9261 PyObject* result; 9262 9263 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self), 9264 &start, &stop, &step, &slicelength) < 0) { 9265 return NULL; 9266 } 9267 9268 if (slicelength <= 0) { 9269 return PyUnicode_FromUnicode(NULL, 0); 9270 } else if (start == 0 && step == 1 && slicelength == self->length && 9271 PyUnicode_CheckExact(self)) { 9272 Py_INCREF(self); 9273 return (PyObject *)self; 9274 } else if (step == 1) { 9275 return PyUnicode_FromUnicode(self->str + start, slicelength); 9276 } else { 9277 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 9278 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 9279 sizeof(Py_UNICODE)); 9280 9281 if (result_buf == NULL) 9282 return PyErr_NoMemory(); 9283 9284 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 9285 result_buf[i] = source_buf[cur]; 9286 } 9287 9288 result = PyUnicode_FromUnicode(result_buf, slicelength); 9289 PyObject_FREE(result_buf); 9290 return result; 9291 } 9292 } else { 9293 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 9294 return NULL; 9295 } 9296} 9297 9298static PyMappingMethods unicode_as_mapping = { 9299 (lenfunc)unicode_length, /* mp_length */ 9300 (binaryfunc)unicode_subscript, /* mp_subscript */ 9301 (objobjargproc)0, /* mp_ass_subscript */ 9302}; 9303 9304 9305/* Helpers for PyUnicode_Format() */ 9306 9307static PyObject * 9308getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 9309{ 9310 Py_ssize_t argidx = *p_argidx; 9311 if (argidx < arglen) { 9312 (*p_argidx)++; 9313 if (arglen < 0) 9314 return args; 9315 else 9316 return PyTuple_GetItem(args, argidx); 9317 } 9318 PyErr_SetString(PyExc_TypeError, 9319 "not enough arguments for format string"); 9320 return NULL; 9321} 9322 9323/* Returns a new reference to a PyUnicode object, or NULL on failure. 
*/ 9324 9325static PyObject * 9326formatfloat(PyObject *v, int flags, int prec, int type) 9327{ 9328 char *p; 9329 PyObject *result; 9330 double x; 9331 9332 x = PyFloat_AsDouble(v); 9333 if (x == -1.0 && PyErr_Occurred()) 9334 return NULL; 9335 9336 if (prec < 0) 9337 prec = 6; 9338 9339 p = PyOS_double_to_string(x, type, prec, 9340 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 9341 if (p == NULL) 9342 return NULL; 9343 result = PyUnicode_FromStringAndSize(p, strlen(p)); 9344 PyMem_Free(p); 9345 return result; 9346} 9347 9348static PyObject* 9349formatlong(PyObject *val, int flags, int prec, int type) 9350{ 9351 char *buf; 9352 int len; 9353 PyObject *str; /* temporary string object. */ 9354 PyObject *result; 9355 9356 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 9357 if (!str) 9358 return NULL; 9359 result = PyUnicode_FromStringAndSize(buf, len); 9360 Py_DECREF(str); 9361 return result; 9362} 9363 9364static int 9365formatchar(Py_UNICODE *buf, 9366 size_t buflen, 9367 PyObject *v) 9368{ 9369 /* presume that the buffer is at least 3 characters long */ 9370 if (PyUnicode_Check(v)) { 9371 if (PyUnicode_GET_SIZE(v) == 1) { 9372 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9373 buf[1] = '\0'; 9374 return 1; 9375 } 9376#ifndef Py_UNICODE_WIDE 9377 if (PyUnicode_GET_SIZE(v) == 2) { 9378 /* Decode a valid surrogate pair */ 9379 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9380 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9381 if (0xD800 <= c0 && c0 <= 0xDBFF && 9382 0xDC00 <= c1 && c1 <= 0xDFFF) { 9383 buf[0] = c0; 9384 buf[1] = c1; 9385 buf[2] = '\0'; 9386 return 2; 9387 } 9388 } 9389#endif 9390 goto onError; 9391 } 9392 else { 9393 /* Integer input truncated to a character */ 9394 long x; 9395 x = PyLong_AsLong(v); 9396 if (x == -1 && PyErr_Occurred()) 9397 goto onError; 9398 9399 if (x < 0 || x > 0x10ffff) { 9400 PyErr_SetString(PyExc_OverflowError, 9401 "%c arg not in range(0x110000)"); 9402 return -1; 9403 } 9404 9405#ifndef Py_UNICODE_WIDE 9406 if (x > 0xffff) { 9407 x -= 
0x10000; 9408 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9409 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9410 return 2; 9411 } 9412#endif 9413 buf[0] = (Py_UNICODE) x; 9414 buf[1] = '\0'; 9415 return 1; 9416 } 9417 9418 onError: 9419 PyErr_SetString(PyExc_TypeError, 9420 "%c requires int or char"); 9421 return -1; 9422} 9423 9424/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 9425 FORMATBUFLEN is the length of the buffer in which chars are formatted. 9426*/ 9427#define FORMATBUFLEN (size_t)10 9428 9429PyObject *PyUnicode_Format(PyObject *format, 9430 PyObject *args) 9431{ 9432 Py_UNICODE *fmt, *res; 9433 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9434 int args_owned = 0; 9435 PyUnicodeObject *result = NULL; 9436 PyObject *dict = NULL; 9437 PyObject *uformat; 9438 9439 if (format == NULL || args == NULL) { 9440 PyErr_BadInternalCall(); 9441 return NULL; 9442 } 9443 uformat = PyUnicode_FromObject(format); 9444 if (uformat == NULL) 9445 return NULL; 9446 fmt = PyUnicode_AS_UNICODE(uformat); 9447 fmtcnt = PyUnicode_GET_SIZE(uformat); 9448 9449 reslen = rescnt = fmtcnt + 100; 9450 result = _PyUnicode_New(reslen); 9451 if (result == NULL) 9452 goto onError; 9453 res = PyUnicode_AS_UNICODE(result); 9454 9455 if (PyTuple_Check(args)) { 9456 arglen = PyTuple_Size(args); 9457 argidx = 0; 9458 } 9459 else { 9460 arglen = -1; 9461 argidx = -2; 9462 } 9463 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9464 !PyUnicode_Check(args)) 9465 dict = args; 9466 9467 while (--fmtcnt >= 0) { 9468 if (*fmt != '%') { 9469 if (--rescnt < 0) { 9470 rescnt = fmtcnt + 100; 9471 reslen += rescnt; 9472 if (_PyUnicode_Resize(&result, reslen) < 0) 9473 goto onError; 9474 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9475 --rescnt; 9476 } 9477 *res++ = *fmt++; 9478 } 9479 else { 9480 /* Got a format specifier */ 9481 int flags = 0; 9482 Py_ssize_t width = -1; 9483 int prec = -1; 9484 Py_UNICODE c = '\0'; 9485 Py_UNICODE fill; 9486 int isnumok; 
9487 PyObject *v = NULL; 9488 PyObject *temp = NULL; 9489 Py_UNICODE *pbuf; 9490 Py_UNICODE sign; 9491 Py_ssize_t len; 9492 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9493 9494 fmt++; 9495 if (*fmt == '(') { 9496 Py_UNICODE *keystart; 9497 Py_ssize_t keylen; 9498 PyObject *key; 9499 int pcount = 1; 9500 9501 if (dict == NULL) { 9502 PyErr_SetString(PyExc_TypeError, 9503 "format requires a mapping"); 9504 goto onError; 9505 } 9506 ++fmt; 9507 --fmtcnt; 9508 keystart = fmt; 9509 /* Skip over balanced parentheses */ 9510 while (pcount > 0 && --fmtcnt >= 0) { 9511 if (*fmt == ')') 9512 --pcount; 9513 else if (*fmt == '(') 9514 ++pcount; 9515 fmt++; 9516 } 9517 keylen = fmt - keystart - 1; 9518 if (fmtcnt < 0 || pcount > 0) { 9519 PyErr_SetString(PyExc_ValueError, 9520 "incomplete format key"); 9521 goto onError; 9522 } 9523#if 0 9524 /* keys are converted to strings using UTF-8 and 9525 then looked up since Python uses strings to hold 9526 variables names etc. in its namespaces and we 9527 wouldn't want to break common idioms. 
*/ 9528 key = PyUnicode_EncodeUTF8(keystart, 9529 keylen, 9530 NULL); 9531#else 9532 key = PyUnicode_FromUnicode(keystart, keylen); 9533#endif 9534 if (key == NULL) 9535 goto onError; 9536 if (args_owned) { 9537 Py_DECREF(args); 9538 args_owned = 0; 9539 } 9540 args = PyObject_GetItem(dict, key); 9541 Py_DECREF(key); 9542 if (args == NULL) { 9543 goto onError; 9544 } 9545 args_owned = 1; 9546 arglen = -1; 9547 argidx = -2; 9548 } 9549 while (--fmtcnt >= 0) { 9550 switch (c = *fmt++) { 9551 case '-': flags |= F_LJUST; continue; 9552 case '+': flags |= F_SIGN; continue; 9553 case ' ': flags |= F_BLANK; continue; 9554 case '#': flags |= F_ALT; continue; 9555 case '0': flags |= F_ZERO; continue; 9556 } 9557 break; 9558 } 9559 if (c == '*') { 9560 v = getnextarg(args, arglen, &argidx); 9561 if (v == NULL) 9562 goto onError; 9563 if (!PyLong_Check(v)) { 9564 PyErr_SetString(PyExc_TypeError, 9565 "* wants int"); 9566 goto onError; 9567 } 9568 width = PyLong_AsLong(v); 9569 if (width == -1 && PyErr_Occurred()) 9570 goto onError; 9571 if (width < 0) { 9572 flags |= F_LJUST; 9573 width = -width; 9574 } 9575 if (--fmtcnt >= 0) 9576 c = *fmt++; 9577 } 9578 else if (c >= '0' && c <= '9') { 9579 width = c - '0'; 9580 while (--fmtcnt >= 0) { 9581 c = *fmt++; 9582 if (c < '0' || c > '9') 9583 break; 9584 if ((width*10) / 10 != width) { 9585 PyErr_SetString(PyExc_ValueError, 9586 "width too big"); 9587 goto onError; 9588 } 9589 width = width*10 + (c - '0'); 9590 } 9591 } 9592 if (c == '.') { 9593 prec = 0; 9594 if (--fmtcnt >= 0) 9595 c = *fmt++; 9596 if (c == '*') { 9597 v = getnextarg(args, arglen, &argidx); 9598 if (v == NULL) 9599 goto onError; 9600 if (!PyLong_Check(v)) { 9601 PyErr_SetString(PyExc_TypeError, 9602 "* wants int"); 9603 goto onError; 9604 } 9605 prec = PyLong_AsLong(v); 9606 if (prec == -1 && PyErr_Occurred()) 9607 goto onError; 9608 if (prec < 0) 9609 prec = 0; 9610 if (--fmtcnt >= 0) 9611 c = *fmt++; 9612 } 9613 else if (c >= '0' && c <= '9') { 9614 prec = c - 
'0'; 9615 while (--fmtcnt >= 0) { 9616 c = *fmt++; 9617 if (c < '0' || c > '9') 9618 break; 9619 if ((prec*10) / 10 != prec) { 9620 PyErr_SetString(PyExc_ValueError, 9621 "prec too big"); 9622 goto onError; 9623 } 9624 prec = prec*10 + (c - '0'); 9625 } 9626 } 9627 } /* prec */ 9628 if (fmtcnt >= 0) { 9629 if (c == 'h' || c == 'l' || c == 'L') { 9630 if (--fmtcnt >= 0) 9631 c = *fmt++; 9632 } 9633 } 9634 if (fmtcnt < 0) { 9635 PyErr_SetString(PyExc_ValueError, 9636 "incomplete format"); 9637 goto onError; 9638 } 9639 if (c != '%') { 9640 v = getnextarg(args, arglen, &argidx); 9641 if (v == NULL) 9642 goto onError; 9643 } 9644 sign = 0; 9645 fill = ' '; 9646 switch (c) { 9647 9648 case '%': 9649 pbuf = formatbuf; 9650 /* presume that buffer length is at least 1 */ 9651 pbuf[0] = '%'; 9652 len = 1; 9653 break; 9654 9655 case 's': 9656 case 'r': 9657 case 'a': 9658 if (PyUnicode_CheckExact(v) && c == 's') { 9659 temp = v; 9660 Py_INCREF(temp); 9661 } 9662 else { 9663 if (c == 's') 9664 temp = PyObject_Str(v); 9665 else if (c == 'r') 9666 temp = PyObject_Repr(v); 9667 else 9668 temp = PyObject_ASCII(v); 9669 if (temp == NULL) 9670 goto onError; 9671 if (PyUnicode_Check(temp)) 9672 /* nothing to do */; 9673 else { 9674 Py_DECREF(temp); 9675 PyErr_SetString(PyExc_TypeError, 9676 "%s argument has non-string str()"); 9677 goto onError; 9678 } 9679 } 9680 pbuf = PyUnicode_AS_UNICODE(temp); 9681 len = PyUnicode_GET_SIZE(temp); 9682 if (prec >= 0 && len > prec) 9683 len = prec; 9684 break; 9685 9686 case 'i': 9687 case 'd': 9688 case 'u': 9689 case 'o': 9690 case 'x': 9691 case 'X': 9692 isnumok = 0; 9693 if (PyNumber_Check(v)) { 9694 PyObject *iobj=NULL; 9695 9696 if (PyLong_Check(v)) { 9697 iobj = v; 9698 Py_INCREF(iobj); 9699 } 9700 else { 9701 iobj = PyNumber_Long(v); 9702 } 9703 if (iobj!=NULL) { 9704 if (PyLong_Check(iobj)) { 9705 isnumok = 1; 9706 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 9707 Py_DECREF(iobj); 9708 if (!temp) 9709 goto onError; 9710 pbuf = PyUnicode_AS_UNICODE(temp); 9711 len = PyUnicode_GET_SIZE(temp); 9712 sign = 1; 9713 } 9714 else { 9715 Py_DECREF(iobj); 9716 } 9717 } 9718 } 9719 if (!isnumok) { 9720 PyErr_Format(PyExc_TypeError, 9721 "%%%c format: a number is required, " 9722 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9723 goto onError; 9724 } 9725 if (flags & F_ZERO) 9726 fill = '0'; 9727 break; 9728 9729 case 'e': 9730 case 'E': 9731 case 'f': 9732 case 'F': 9733 case 'g': 9734 case 'G': 9735 temp = formatfloat(v, flags, prec, c); 9736 if (!temp) 9737 goto onError; 9738 pbuf = PyUnicode_AS_UNICODE(temp); 9739 len = PyUnicode_GET_SIZE(temp); 9740 sign = 1; 9741 if (flags & F_ZERO) 9742 fill = '0'; 9743 break; 9744 9745 case 'c': 9746 pbuf = formatbuf; 9747 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9748 if (len < 0) 9749 goto onError; 9750 break; 9751 9752 default: 9753 PyErr_Format(PyExc_ValueError, 9754 "unsupported format character '%c' (0x%x) " 9755 "at index %zd", 9756 (31<=c && c<=126) ? 
(char)c : '?', 9757 (int)c, 9758 (Py_ssize_t)(fmt - 1 - 9759 PyUnicode_AS_UNICODE(uformat))); 9760 goto onError; 9761 } 9762 if (sign) { 9763 if (*pbuf == '-' || *pbuf == '+') { 9764 sign = *pbuf++; 9765 len--; 9766 } 9767 else if (flags & F_SIGN) 9768 sign = '+'; 9769 else if (flags & F_BLANK) 9770 sign = ' '; 9771 else 9772 sign = 0; 9773 } 9774 if (width < len) 9775 width = len; 9776 if (rescnt - (sign != 0) < width) { 9777 reslen -= rescnt; 9778 rescnt = width + fmtcnt + 100; 9779 reslen += rescnt; 9780 if (reslen < 0) { 9781 Py_XDECREF(temp); 9782 PyErr_NoMemory(); 9783 goto onError; 9784 } 9785 if (_PyUnicode_Resize(&result, reslen) < 0) { 9786 Py_XDECREF(temp); 9787 goto onError; 9788 } 9789 res = PyUnicode_AS_UNICODE(result) 9790 + reslen - rescnt; 9791 } 9792 if (sign) { 9793 if (fill != ' ') 9794 *res++ = sign; 9795 rescnt--; 9796 if (width > len) 9797 width--; 9798 } 9799 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9800 assert(pbuf[0] == '0'); 9801 assert(pbuf[1] == c); 9802 if (fill != ' ') { 9803 *res++ = *pbuf++; 9804 *res++ = *pbuf++; 9805 } 9806 rescnt -= 2; 9807 width -= 2; 9808 if (width < 0) 9809 width = 0; 9810 len -= 2; 9811 } 9812 if (width > len && !(flags & F_LJUST)) { 9813 do { 9814 --rescnt; 9815 *res++ = fill; 9816 } while (--width > len); 9817 } 9818 if (fill == ' ') { 9819 if (sign) 9820 *res++ = sign; 9821 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9822 assert(pbuf[0] == '0'); 9823 assert(pbuf[1] == c); 9824 *res++ = *pbuf++; 9825 *res++ = *pbuf++; 9826 } 9827 } 9828 Py_UNICODE_COPY(res, pbuf, len); 9829 res += len; 9830 rescnt -= len; 9831 while (--width >= len) { 9832 --rescnt; 9833 *res++ = ' '; 9834 } 9835 if (dict && (argidx < arglen) && c != '%') { 9836 PyErr_SetString(PyExc_TypeError, 9837 "not all arguments converted during string formatting"); 9838 Py_XDECREF(temp); 9839 goto onError; 9840 } 9841 Py_XDECREF(temp); 9842 } /* '%' */ 9843 } /* until end */ 9844 if (argidx < arglen && !dict) { 
9845 PyErr_SetString(PyExc_TypeError, 9846 "not all arguments converted during string formatting"); 9847 goto onError; 9848 } 9849 9850 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9851 goto onError; 9852 if (args_owned) { 9853 Py_DECREF(args); 9854 } 9855 Py_DECREF(uformat); 9856 return (PyObject *)result; 9857 9858 onError: 9859 Py_XDECREF(result); 9860 Py_DECREF(uformat); 9861 if (args_owned) { 9862 Py_DECREF(args); 9863 } 9864 return NULL; 9865} 9866 9867static PyObject * 9868unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9869 9870static PyObject * 9871unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9872{ 9873 PyObject *x = NULL; 9874 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9875 char *encoding = NULL; 9876 char *errors = NULL; 9877 9878 if (type != &PyUnicode_Type) 9879 return unicode_subtype_new(type, args, kwds); 9880 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9881 kwlist, &x, &encoding, &errors)) 9882 return NULL; 9883 if (x == NULL) 9884 return (PyObject *)_PyUnicode_New(0); 9885 if (encoding == NULL && errors == NULL) 9886 return PyObject_Str(x); 9887 else 9888 return PyUnicode_FromEncodedObject(x, encoding, errors); 9889} 9890 9891static PyObject * 9892unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9893{ 9894 PyUnicodeObject *tmp, *pnew; 9895 Py_ssize_t n; 9896 9897 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9898 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9899 if (tmp == NULL) 9900 return NULL; 9901 assert(PyUnicode_Check(tmp)); 9902 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9903 if (pnew == NULL) { 9904 Py_DECREF(tmp); 9905 return NULL; 9906 } 9907 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9908 if (pnew->str == NULL) { 9909 _Py_ForgetReference((PyObject *)pnew); 9910 PyObject_Del(pnew); 9911 Py_DECREF(tmp); 9912 return PyErr_NoMemory(); 9913 } 9914 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9915 pnew->length = n; 9916 pnew->hash = tmp->hash; 9917 Py_DECREF(tmp); 9918 return (PyObject *)pnew; 9919} 9920 9921PyDoc_STRVAR(unicode_doc, 9922 "str(string[, encoding[, errors]]) -> str\n\ 9923\n\ 9924Create a new string object from the given encoded string.\n\ 9925encoding defaults to the current default string encoding.\n\ 9926errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9927 9928static PyObject *unicode_iter(PyObject *seq); 9929 9930PyTypeObject PyUnicode_Type = { 9931 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9932 "str", /* tp_name */ 9933 sizeof(PyUnicodeObject), /* tp_size */ 9934 0, /* tp_itemsize */ 9935 /* Slots */ 9936 (destructor)unicode_dealloc, /* tp_dealloc */ 9937 0, /* tp_print */ 9938 0, /* tp_getattr */ 9939 0, /* tp_setattr */ 9940 0, /* tp_reserved */ 9941 unicode_repr, /* tp_repr */ 9942 &unicode_as_number, /* tp_as_number */ 9943 &unicode_as_sequence, /* tp_as_sequence */ 9944 &unicode_as_mapping, /* tp_as_mapping */ 9945 (hashfunc) unicode_hash, /* tp_hash*/ 9946 0, /* tp_call*/ 9947 (reprfunc) unicode_str, /* tp_str */ 9948 PyObject_GenericGetAttr, /* tp_getattro */ 9949 0, /* tp_setattro */ 9950 0, /* tp_as_buffer */ 9951 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9952 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9953 unicode_doc, /* tp_doc */ 9954 0, /* tp_traverse */ 9955 0, /* tp_clear */ 9956 PyUnicode_RichCompare, /* tp_richcompare */ 9957 0, /* tp_weaklistoffset */ 9958 unicode_iter, /* tp_iter */ 9959 0, /* tp_iternext */ 9960 unicode_methods, /* tp_methods */ 9961 0, /* tp_members */ 9962 0, /* tp_getset */ 9963 &PyBaseObject_Type, /* tp_base */ 9964 0, /* tp_dict */ 9965 0, /* tp_descr_get */ 9966 0, /* tp_descr_set */ 9967 0, /* tp_dictoffset */ 9968 0, /* tp_init */ 9969 0, /* tp_alloc */ 9970 unicode_new, /* tp_new */ 9971 PyObject_Del, /* tp_free */ 9972}; 9973 9974/* Initialize the Unicode implementation */ 9975 9976void _PyUnicode_Init(void) 9977{ 9978 
int i; 9979 9980 /* XXX - move this array to unicodectype.c ? */ 9981 Py_UNICODE linebreak[] = { 9982 0x000A, /* LINE FEED */ 9983 0x000D, /* CARRIAGE RETURN */ 9984 0x001C, /* FILE SEPARATOR */ 9985 0x001D, /* GROUP SEPARATOR */ 9986 0x001E, /* RECORD SEPARATOR */ 9987 0x0085, /* NEXT LINE */ 9988 0x2028, /* LINE SEPARATOR */ 9989 0x2029, /* PARAGRAPH SEPARATOR */ 9990 }; 9991 9992 /* Init the implementation */ 9993 free_list = NULL; 9994 numfree = 0; 9995 unicode_empty = _PyUnicode_New(0); 9996 if (!unicode_empty) 9997 return; 9998 9999 for (i = 0; i < 256; i++) 10000 unicode_latin1[i] = NULL; 10001 if (PyType_Ready(&PyUnicode_Type) < 0) 10002 Py_FatalError("Can't initialize 'unicode'"); 10003 10004 /* initialize the linebreak bloom filter */ 10005 bloom_linebreak = make_bloom_mask( 10006 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 10007 ); 10008 10009 PyType_Ready(&EncodingMapType); 10010} 10011 10012/* Finalize the Unicode implementation */ 10013 10014int 10015PyUnicode_ClearFreeList(void) 10016{ 10017 int freelist_size = numfree; 10018 PyUnicodeObject *u; 10019 10020 for (u = free_list; u != NULL;) { 10021 PyUnicodeObject *v = u; 10022 u = *(PyUnicodeObject **)u; 10023 if (v->str) 10024 PyObject_DEL(v->str); 10025 Py_XDECREF(v->defenc); 10026 PyObject_Del(v); 10027 numfree--; 10028 } 10029 free_list = NULL; 10030 assert(numfree == 0); 10031 return freelist_size; 10032} 10033 10034void 10035_PyUnicode_Fini(void) 10036{ 10037 int i; 10038 10039 Py_XDECREF(unicode_empty); 10040 unicode_empty = NULL; 10041 10042 for (i = 0; i < 256; i++) { 10043 if (unicode_latin1[i]) { 10044 Py_DECREF(unicode_latin1[i]); 10045 unicode_latin1[i] = NULL; 10046 } 10047 } 10048 (void)PyUnicode_ClearFreeList(); 10049} 10050 10051void 10052PyUnicode_InternInPlace(PyObject **p) 10053{ 10054 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 10055 PyObject *t; 10056 if (s == NULL || !PyUnicode_Check(s)) 10057 Py_FatalError( 10058 "PyUnicode_InternInPlace: unicode strings only 
please!"); 10059 /* If it's a subclass, we don't really know what putting 10060 it in the interned dict might do. */ 10061 if (!PyUnicode_CheckExact(s)) 10062 return; 10063 if (PyUnicode_CHECK_INTERNED(s)) 10064 return; 10065 if (interned == NULL) { 10066 interned = PyDict_New(); 10067 if (interned == NULL) { 10068 PyErr_Clear(); /* Don't leave an exception */ 10069 return; 10070 } 10071 } 10072 /* It might be that the GetItem call fails even 10073 though the key is present in the dictionary, 10074 namely when this happens during a stack overflow. */ 10075 Py_ALLOW_RECURSION 10076 t = PyDict_GetItem(interned, (PyObject *)s); 10077 Py_END_ALLOW_RECURSION 10078 10079 if (t) { 10080 Py_INCREF(t); 10081 Py_DECREF(*p); 10082 *p = t; 10083 return; 10084 } 10085 10086 PyThreadState_GET()->recursion_critical = 1; 10087 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 10088 PyErr_Clear(); 10089 PyThreadState_GET()->recursion_critical = 0; 10090 return; 10091 } 10092 PyThreadState_GET()->recursion_critical = 0; 10093 /* The two references in interned are not counted by refcnt. 
10094 The deallocator will take care of this */ 10095 Py_REFCNT(s) -= 2; 10096 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 10097} 10098 10099void 10100PyUnicode_InternImmortal(PyObject **p) 10101{ 10102 PyUnicode_InternInPlace(p); 10103 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 10104 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 10105 Py_INCREF(*p); 10106 } 10107} 10108 10109PyObject * 10110PyUnicode_InternFromString(const char *cp) 10111{ 10112 PyObject *s = PyUnicode_FromString(cp); 10113 if (s == NULL) 10114 return NULL; 10115 PyUnicode_InternInPlace(&s); 10116 return s; 10117} 10118 10119void _Py_ReleaseInternedUnicodeStrings(void) 10120{ 10121 PyObject *keys; 10122 PyUnicodeObject *s; 10123 Py_ssize_t i, n; 10124 Py_ssize_t immortal_size = 0, mortal_size = 0; 10125 10126 if (interned == NULL || !PyDict_Check(interned)) 10127 return; 10128 keys = PyDict_Keys(interned); 10129 if (keys == NULL || !PyList_Check(keys)) { 10130 PyErr_Clear(); 10131 return; 10132 } 10133 10134 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 10135 detector, interned unicode strings are not forcibly deallocated; 10136 rather, we give them their stolen references back, and then clear 10137 and DECREF the interned dict. 
*/ 10138 10139 n = PyList_GET_SIZE(keys); 10140 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 10141 n); 10142 for (i = 0; i < n; i++) { 10143 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 10144 switch (s->state) { 10145 case SSTATE_NOT_INTERNED: 10146 /* XXX Shouldn't happen */ 10147 break; 10148 case SSTATE_INTERNED_IMMORTAL: 10149 Py_REFCNT(s) += 1; 10150 immortal_size += s->length; 10151 break; 10152 case SSTATE_INTERNED_MORTAL: 10153 Py_REFCNT(s) += 2; 10154 mortal_size += s->length; 10155 break; 10156 default: 10157 Py_FatalError("Inconsistent interned string state."); 10158 } 10159 s->state = SSTATE_NOT_INTERNED; 10160 } 10161 fprintf(stderr, "total size of all interned strings: " 10162 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 10163 "mortal/immortal\n", mortal_size, immortal_size); 10164 Py_DECREF(keys); 10165 PyDict_Clear(interned); 10166 Py_DECREF(interned); 10167 interned = NULL; 10168} 10169 10170 10171/********************* Unicode Iterator **************************/ 10172 10173typedef struct { 10174 PyObject_HEAD 10175 Py_ssize_t it_index; 10176 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 10177} unicodeiterobject; 10178 10179static void 10180unicodeiter_dealloc(unicodeiterobject *it) 10181{ 10182 _PyObject_GC_UNTRACK(it); 10183 Py_XDECREF(it->it_seq); 10184 PyObject_GC_Del(it); 10185} 10186 10187static int 10188unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 10189{ 10190 Py_VISIT(it->it_seq); 10191 return 0; 10192} 10193 10194static PyObject * 10195unicodeiter_next(unicodeiterobject *it) 10196{ 10197 PyUnicodeObject *seq; 10198 PyObject *item; 10199 10200 assert(it != NULL); 10201 seq = it->it_seq; 10202 if (seq == NULL) 10203 return NULL; 10204 assert(PyUnicode_Check(seq)); 10205 10206 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 10207 item = PyUnicode_FromUnicode( 10208 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 10209 if (item != NULL) 10210 ++it->it_index; 10211 
return item; 10212 } 10213 10214 Py_DECREF(seq); 10215 it->it_seq = NULL; 10216 return NULL; 10217} 10218 10219static PyObject * 10220unicodeiter_len(unicodeiterobject *it) 10221{ 10222 Py_ssize_t len = 0; 10223 if (it->it_seq) 10224 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 10225 return PyLong_FromSsize_t(len); 10226} 10227 10228PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 10229 10230static PyMethodDef unicodeiter_methods[] = { 10231 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 10232 length_hint_doc}, 10233 {NULL, NULL} /* sentinel */ 10234}; 10235 10236PyTypeObject PyUnicodeIter_Type = { 10237 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10238 "str_iterator", /* tp_name */ 10239 sizeof(unicodeiterobject), /* tp_basicsize */ 10240 0, /* tp_itemsize */ 10241 /* methods */ 10242 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 10243 0, /* tp_print */ 10244 0, /* tp_getattr */ 10245 0, /* tp_setattr */ 10246 0, /* tp_reserved */ 10247 0, /* tp_repr */ 10248 0, /* tp_as_number */ 10249 0, /* tp_as_sequence */ 10250 0, /* tp_as_mapping */ 10251 0, /* tp_hash */ 10252 0, /* tp_call */ 10253 0, /* tp_str */ 10254 PyObject_GenericGetAttr, /* tp_getattro */ 10255 0, /* tp_setattro */ 10256 0, /* tp_as_buffer */ 10257 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 10258 0, /* tp_doc */ 10259 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 10260 0, /* tp_clear */ 10261 0, /* tp_richcompare */ 10262 0, /* tp_weaklistoffset */ 10263 PyObject_SelfIter, /* tp_iter */ 10264 (iternextfunc)unicodeiter_next, /* tp_iternext */ 10265 unicodeiter_methods, /* tp_methods */ 10266 0, 10267}; 10268 10269static PyObject * 10270unicode_iter(PyObject *seq) 10271{ 10272 unicodeiterobject *it; 10273 10274 if (!PyUnicode_Check(seq)) { 10275 PyErr_BadInternalCall(); 10276 return NULL; 10277 } 10278 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 10279 if (it == NULL) 10280 return NULL; 10281 
it->it_index = 0; 10282 Py_INCREF(seq); 10283 it->it_seq = (PyUnicodeObject *)seq; 10284 _PyObject_GC_TRACK(it); 10285 return (PyObject *)it; 10286} 10287 10288size_t 10289Py_UNICODE_strlen(const Py_UNICODE *u) 10290{ 10291 int res = 0; 10292 while(*u++) 10293 res++; 10294 return res; 10295} 10296 10297Py_UNICODE* 10298Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 10299{ 10300 Py_UNICODE *u = s1; 10301 while ((*u++ = *s2++)); 10302 return s1; 10303} 10304 10305Py_UNICODE* 10306Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10307{ 10308 Py_UNICODE *u = s1; 10309 while ((*u++ = *s2++)) 10310 if (n-- == 0) 10311 break; 10312 return s1; 10313} 10314 10315Py_UNICODE* 10316Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 10317{ 10318 Py_UNICODE *u1 = s1; 10319 u1 += Py_UNICODE_strlen(u1); 10320 Py_UNICODE_strcpy(u1, s2); 10321 return s1; 10322} 10323 10324int 10325Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 10326{ 10327 while (*s1 && *s2 && *s1 == *s2) 10328 s1++, s2++; 10329 if (*s1 && *s2) 10330 return (*s1 < *s2) ? -1 : +1; 10331 if (*s1) 10332 return 1; 10333 if (*s2) 10334 return -1; 10335 return 0; 10336} 10337 10338int 10339Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10340{ 10341 register Py_UNICODE u1, u2; 10342 for (; n != 0; n--) { 10343 u1 = *s1; 10344 u2 = *s2; 10345 if (u1 != u2) 10346 return (u1 < u2) ? 
-1 : +1; 10347 if (u1 == '\0') 10348 return 0; 10349 s1++; 10350 s2++; 10351 } 10352 return 0; 10353} 10354 10355Py_UNICODE* 10356Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 10357{ 10358 const Py_UNICODE *p; 10359 for (p = s; *p; p++) 10360 if (*p == c) 10361 return (Py_UNICODE*)p; 10362 return NULL; 10363} 10364 10365Py_UNICODE* 10366Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10367{ 10368 const Py_UNICODE *p; 10369 p = s + Py_UNICODE_strlen(s); 10370 while (p != s) { 10371 p--; 10372 if (*p == c) 10373 return (Py_UNICODE*)p; 10374 } 10375 return NULL; 10376} 10377 10378Py_UNICODE* 10379PyUnicode_AsUnicodeCopy(PyObject *object) 10380{ 10381 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10382 Py_UNICODE *copy; 10383 Py_ssize_t size; 10384 10385 /* Ensure we won't overflow the size. */ 10386 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10387 PyErr_NoMemory(); 10388 return NULL; 10389 } 10390 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10391 size *= sizeof(Py_UNICODE); 10392 copy = PyMem_Malloc(size); 10393 if (copy == NULL) { 10394 PyErr_NoMemory(); 10395 return NULL; 10396 } 10397 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10398 return copy; 10399} 10400 10401/* A _string module, to export formatter_parser and formatter_field_name_split 10402 to the string.Formatter class implemented in Python. 
*/ 10403 10404static PyMethodDef _string_methods[] = { 10405 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 10406 METH_O, PyDoc_STR("split the argument as a field name")}, 10407 {"formatter_parser", (PyCFunction) formatter_parser, 10408 METH_O, PyDoc_STR("parse the argument as a format string")}, 10409 {NULL, NULL} 10410}; 10411 10412static struct PyModuleDef _string_module = { 10413 PyModuleDef_HEAD_INIT, 10414 "_string", 10415 PyDoc_STR("string helper module"), 10416 0, 10417 _string_methods, 10418 NULL, 10419 NULL, 10420 NULL, 10421 NULL 10422}; 10423 10424PyMODINIT_FUNC 10425PyInit__string(void) 10426{ 10427 return PyModule_Create(&_string_module); 10428} 10429 10430 10431#ifdef __cplusplus 10432} 10433#endif 10434