unicodeobject.c revision 28a6cfaefc41a4e4bfa6dd0b54318c0465987652
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "ucnhash.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Limit for the Unicode object free list */ 51 52#define PyUnicode_MAXFREELIST 1024 53 54/* Limit for the Unicode object free list stay alive optimization. 55 56 The implementation will keep allocated Unicode memory intact for 57 all objects on the free list having a size less than this 58 limit. This reduces malloc() overhead for small Unicode objects. 59 60 At worst this will result in PyUnicode_MAXFREELIST * 61 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 62 malloc()-overhead) bytes of unused garbage. 63 64 Setting the limit to 0 effectively turns the feature off. 65 66 Note: This is an experimental feature ! If you get core dumps when 67 using Unicode objects, turn this feature off. 68 69*/ 70 71#define KEEPALIVE_SIZE_LIMIT 9 72 73/* Endianness switches; defaults to little endian */ 74 75#ifdef WORDS_BIGENDIAN 76# define BYTEORDER_IS_BIG_ENDIAN 77#else 78# define BYTEORDER_IS_LITTLE_ENDIAN 79#endif 80 81/* --- Globals ------------------------------------------------------------ 82 83 The globals are initialized by the _PyUnicode_Init() API and should 84 not be used before calling that API. 85 86*/ 87 88 89#ifdef __cplusplus 90extern "C" { 91#endif 92 93/* This dictionary holds all interned unicode strings. Note that references 94 to strings in this dictionary are *not* counted in the string's ob_refcnt. 95 When the interned string reaches a refcnt of 0 the string deallocation 96 function will delete the reference from this dictionary. 97 98 Another way to look at this is that to say that the actual reference 99 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 100*/ 101static PyObject *interned; 102 103/* Free list for Unicode objects */ 104static PyUnicodeObject *free_list; 105static int numfree; 106 107/* The empty Unicode object is shared to improve performance. */ 108static PyUnicodeObject *unicode_empty; 109 110/* Single character Unicode strings in the Latin-1 range are being 111 shared as well. */ 112static PyUnicodeObject *unicode_latin1[256]; 113 114/* Fast detection of the most frequent whitespace characters */ 115const unsigned char _Py_ascii_whitespace[] = { 116 0, 0, 0, 0, 0, 0, 0, 0, 117/* case 0x0009: * CHARACTER TABULATION */ 118/* case 0x000A: * LINE FEED */ 119/* case 0x000B: * LINE TABULATION */ 120/* case 0x000C: * FORM FEED */ 121/* case 0x000D: * CARRIAGE RETURN */ 122 0, 1, 1, 1, 1, 1, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 124/* case 0x001C: * FILE SEPARATOR */ 125/* case 0x001D: * GROUP SEPARATOR */ 126/* case 0x001E: * RECORD SEPARATOR */ 127/* case 0x001F: * UNIT SEPARATOR */ 128 0, 0, 0, 0, 1, 1, 1, 1, 129/* case 0x0020: * SPACE */ 130 1, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 0, 0, 0, 0, 0, 0, 0, 134 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0 143}; 144 145static PyObject *unicode_encode_call_errorhandler(const char *errors, 146 PyObject **errorHandler,const char *encoding, const char *reason, 147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 149 150static void raise_encode_exception(PyObject **exceptionObject, 151 const char *encoding, 152 const Py_UNICODE *unicode, Py_ssize_t size, 153 Py_ssize_t startpos, Py_ssize_t endpos, 154 const char *reason); 155 156/* Same for linebreaks */ 157static unsigned char ascii_linebreak[] = { 158 0, 0, 0, 0, 0, 0, 0, 0, 159/* 0x000A, * LINE FEED 
*/ 160/* 0x000B, * LINE TABULATION */ 161/* 0x000C, * FORM FEED */ 162/* 0x000D, * CARRIAGE RETURN */ 163 0, 0, 1, 1, 1, 1, 0, 0, 164 0, 0, 0, 0, 0, 0, 0, 0, 165/* 0x001C, * FILE SEPARATOR */ 166/* 0x001D, * GROUP SEPARATOR */ 167/* 0x001E, * RECORD SEPARATOR */ 168 0, 0, 0, 0, 1, 1, 1, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0 182}; 183 184 185Py_UNICODE 186PyUnicode_GetMax(void) 187{ 188#ifdef Py_UNICODE_WIDE 189 return 0x10FFFF; 190#else 191 /* This is actually an illegal character, so it should 192 not be passed to unichr. */ 193 return 0xFFFF; 194#endif 195} 196 197/* --- Bloom Filters ----------------------------------------------------- */ 198 199/* stuff to implement simple "bloom filters" for Unicode characters. 200 to keep things simple, we use a single bitmask, using the least 5 201 bits from each unicode characters as the bit index. */ 202 203/* the linebreak mask is set up by Unicode_Init below */ 204 205#if LONG_BIT >= 128 206#define BLOOM_WIDTH 128 207#elif LONG_BIT >= 64 208#define BLOOM_WIDTH 64 209#elif LONG_BIT >= 32 210#define BLOOM_WIDTH 32 211#else 212#error "LONG_BIT is smaller than 32" 213#endif 214 215#define BLOOM_MASK unsigned long 216 217static BLOOM_MASK bloom_linebreak; 218 219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 221 222#define BLOOM_LINEBREAK(ch) \ 223 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 225 226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 227{ 228 /* calculate simple bloom-style bitmask for a given unicode string */ 229 230 BLOOM_MASK mask; 231 Py_ssize_t i; 232 233 mask = 0; 234 for (i = 0; i < len; i++) 235 BLOOM_ADD(mask, ptr[i]); 236 237 return mask; 238} 239 240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 241{ 242 Py_ssize_t i; 243 244 for (i = 0; i < setlen; i++) 245 if (set[i] == chr) 246 return 1; 247 248 return 0; 249} 250 251#define BLOOM_MEMBER(mask, chr, set, setlen) \ 252 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 253 254/* --- Unicode Object ----------------------------------------------------- */ 255 256static 257int unicode_resize(register PyUnicodeObject *unicode, 258 Py_ssize_t length) 259{ 260 void *oldstr; 261 262 /* Shortcut if there's nothing much to do. */ 263 if (unicode->length == length) 264 goto reset; 265 266 /* Resizing shared object (unicode_empty or single character 267 objects) in-place is not allowed. Use PyUnicode_Resize() 268 instead ! */ 269 270 if (unicode == unicode_empty || 271 (unicode->length == 1 && 272 unicode->str[0] < 256U && 273 unicode_latin1[unicode->str[0]] == unicode)) { 274 PyErr_SetString(PyExc_SystemError, 275 "can't resize shared str objects"); 276 return -1; 277 } 278 279 /* We allocate one more byte to make sure the string is Ux0000 terminated. 280 The overallocation is also used by fastsearch, which assumes that it's 281 safe to look at str[length] (without making any assumptions about what 282 it contains). 
*/ 283 284 oldstr = unicode->str; 285 unicode->str = PyObject_REALLOC(unicode->str, 286 sizeof(Py_UNICODE) * (length + 1)); 287 if (!unicode->str) { 288 unicode->str = (Py_UNICODE *)oldstr; 289 PyErr_NoMemory(); 290 return -1; 291 } 292 unicode->str[length] = 0; 293 unicode->length = length; 294 295 reset: 296 /* Reset the object caches */ 297 if (unicode->defenc) { 298 Py_CLEAR(unicode->defenc); 299 } 300 unicode->hash = -1; 301 302 return 0; 303} 304 305/* We allocate one more byte to make sure the string is 306 Ux0000 terminated; some code (e.g. new_identifier) 307 relies on that. 308 309 XXX This allocator could further be enhanced by assuring that the 310 free list never reduces its size below 1. 311 312*/ 313 314static 315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 316{ 317 register PyUnicodeObject *unicode; 318 319 /* Optimization for empty strings */ 320 if (length == 0 && unicode_empty != NULL) { 321 Py_INCREF(unicode_empty); 322 return unicode_empty; 323 } 324 325 /* Ensure we won't overflow the size. */ 326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 327 return (PyUnicodeObject *)PyErr_NoMemory(); 328 } 329 330 /* Unicode freelist & memory allocation */ 331 if (free_list) { 332 unicode = free_list; 333 free_list = *(PyUnicodeObject **)unicode; 334 numfree--; 335 if (unicode->str) { 336 /* Keep-Alive optimization: we only upsize the buffer, 337 never downsize it. 
*/ 338 if ((unicode->length < length) && 339 unicode_resize(unicode, length) < 0) { 340 PyObject_DEL(unicode->str); 341 unicode->str = NULL; 342 } 343 } 344 else { 345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 347 } 348 PyObject_INIT(unicode, &PyUnicode_Type); 349 } 350 else { 351 size_t new_size; 352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 353 if (unicode == NULL) 354 return NULL; 355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 357 } 358 359 if (!unicode->str) { 360 PyErr_NoMemory(); 361 goto onError; 362 } 363 /* Initialize the first element to guard against cases where 364 * the caller fails before initializing str -- unicode_resize() 365 * reads str[0], and the Keep-Alive optimization can keep memory 366 * allocated for str alive across a call to unicode_dealloc(unicode). 367 * We don't want unicode_resize to read uninitialized memory in 368 * that case. 
369 */ 370 unicode->str[0] = 0; 371 unicode->str[length] = 0; 372 unicode->length = length; 373 unicode->hash = -1; 374 unicode->state = 0; 375 unicode->defenc = NULL; 376 return unicode; 377 378 onError: 379 /* XXX UNREF/NEWREF interface should be more symmetrical */ 380 _Py_DEC_REFTOTAL; 381 _Py_ForgetReference((PyObject *)unicode); 382 PyObject_Del(unicode); 383 return NULL; 384} 385 386static 387void unicode_dealloc(register PyUnicodeObject *unicode) 388{ 389 switch (PyUnicode_CHECK_INTERNED(unicode)) { 390 case SSTATE_NOT_INTERNED: 391 break; 392 393 case SSTATE_INTERNED_MORTAL: 394 /* revive dead object temporarily for DelItem */ 395 Py_REFCNT(unicode) = 3; 396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 397 Py_FatalError( 398 "deletion of interned string failed"); 399 break; 400 401 case SSTATE_INTERNED_IMMORTAL: 402 Py_FatalError("Immortal interned string died."); 403 404 default: 405 Py_FatalError("Inconsistent interned string state."); 406 } 407 408 if (PyUnicode_CheckExact(unicode) && 409 numfree < PyUnicode_MAXFREELIST) { 410 /* Keep-Alive optimization */ 411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 412 PyObject_DEL(unicode->str); 413 unicode->str = NULL; 414 unicode->length = 0; 415 } 416 if (unicode->defenc) { 417 Py_CLEAR(unicode->defenc); 418 } 419 /* Add to free list */ 420 *(PyUnicodeObject **)unicode = free_list; 421 free_list = unicode; 422 numfree++; 423 } 424 else { 425 PyObject_DEL(unicode->str); 426 Py_XDECREF(unicode->defenc); 427 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 428 } 429} 430 431static 432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 433{ 434 register PyUnicodeObject *v; 435 436 /* Argument checks */ 437 if (unicode == NULL) { 438 PyErr_BadInternalCall(); 439 return -1; 440 } 441 v = *unicode; 442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 443 PyErr_BadInternalCall(); 444 return -1; 445 } 446 447 /* Resizing unicode_empty and single character objects is 
not 448 possible since these are being shared. We simply return a fresh 449 copy with the same Unicode content. */ 450 if (v->length != length && 451 (v == unicode_empty || v->length == 1)) { 452 PyUnicodeObject *w = _PyUnicode_New(length); 453 if (w == NULL) 454 return -1; 455 Py_UNICODE_COPY(w->str, v->str, 456 length < v->length ? length : v->length); 457 Py_DECREF(*unicode); 458 *unicode = w; 459 return 0; 460 } 461 462 /* Note that we don't have to modify *unicode for unshared Unicode 463 objects, since we can modify them in-place. */ 464 return unicode_resize(v, length); 465} 466 467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 468{ 469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 470} 471 472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 473 Py_ssize_t size) 474{ 475 PyUnicodeObject *unicode; 476 477 /* If the Unicode data is known at construction time, we can apply 478 some optimizations which share commonly used objects. */ 479 if (u != NULL) { 480 481 /* Optimization for empty strings */ 482 if (size == 0 && unicode_empty != NULL) { 483 Py_INCREF(unicode_empty); 484 return (PyObject *)unicode_empty; 485 } 486 487 /* Single character Unicode objects in the Latin-1 range are 488 shared when using this constructor */ 489 if (size == 1 && *u < 256) { 490 unicode = unicode_latin1[*u]; 491 if (!unicode) { 492 unicode = _PyUnicode_New(1); 493 if (!unicode) 494 return NULL; 495 unicode->str[0] = *u; 496 unicode_latin1[*u] = unicode; 497 } 498 Py_INCREF(unicode); 499 return (PyObject *)unicode; 500 } 501 } 502 503 unicode = _PyUnicode_New(size); 504 if (!unicode) 505 return NULL; 506 507 /* Copy the Unicode data into the new object */ 508 if (u != NULL) 509 Py_UNICODE_COPY(unicode->str, u, size); 510 511 return (PyObject *)unicode; 512} 513 514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 515{ 516 PyUnicodeObject *unicode; 517 518 if (size < 0) { 519 PyErr_SetString(PyExc_SystemError, 520 "Negative size 
passed to PyUnicode_FromStringAndSize"); 521 return NULL; 522 } 523 524 /* If the Unicode data is known at construction time, we can apply 525 some optimizations which share commonly used objects. 526 Also, this means the input must be UTF-8, so fall back to the 527 UTF-8 decoder at the end. */ 528 if (u != NULL) { 529 530 /* Optimization for empty strings */ 531 if (size == 0 && unicode_empty != NULL) { 532 Py_INCREF(unicode_empty); 533 return (PyObject *)unicode_empty; 534 } 535 536 /* Single characters are shared when using this constructor. 537 Restrict to ASCII, since the input must be UTF-8. */ 538 if (size == 1 && Py_CHARMASK(*u) < 128) { 539 unicode = unicode_latin1[Py_CHARMASK(*u)]; 540 if (!unicode) { 541 unicode = _PyUnicode_New(1); 542 if (!unicode) 543 return NULL; 544 unicode->str[0] = Py_CHARMASK(*u); 545 unicode_latin1[Py_CHARMASK(*u)] = unicode; 546 } 547 Py_INCREF(unicode); 548 return (PyObject *)unicode; 549 } 550 551 return PyUnicode_DecodeUTF8(u, size, NULL); 552 } 553 554 unicode = _PyUnicode_New(size); 555 if (!unicode) 556 return NULL; 557 558 return (PyObject *)unicode; 559} 560 561PyObject *PyUnicode_FromString(const char *u) 562{ 563 size_t size = strlen(u); 564 if (size > PY_SSIZE_T_MAX) { 565 PyErr_SetString(PyExc_OverflowError, "input too long"); 566 return NULL; 567 } 568 569 return PyUnicode_FromStringAndSize(u, size); 570} 571 572#ifdef HAVE_WCHAR_H 573 574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 575# define CONVERT_WCHAR_TO_SURROGATES 576#endif 577 578#ifdef CONVERT_WCHAR_TO_SURROGATES 579 580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 581 to convert from UTF32 to UTF16. 
*/ 582 583PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 584 Py_ssize_t size) 585{ 586 PyUnicodeObject *unicode; 587 register Py_ssize_t i; 588 Py_ssize_t alloc; 589 const wchar_t *orig_w; 590 591 if (w == NULL) { 592 if (size == 0) 593 return PyUnicode_FromStringAndSize(NULL, 0); 594 PyErr_BadInternalCall(); 595 return NULL; 596 } 597 598 if (size == -1) { 599 size = wcslen(w); 600 } 601 602 alloc = size; 603 orig_w = w; 604 for (i = size; i > 0; i--) { 605 if (*w > 0xFFFF) 606 alloc++; 607 w++; 608 } 609 w = orig_w; 610 unicode = _PyUnicode_New(alloc); 611 if (!unicode) 612 return NULL; 613 614 /* Copy the wchar_t data into the new object */ 615 { 616 register Py_UNICODE *u; 617 u = PyUnicode_AS_UNICODE(unicode); 618 for (i = size; i > 0; i--) { 619 if (*w > 0xFFFF) { 620 wchar_t ordinal = *w++; 621 ordinal -= 0x10000; 622 *u++ = 0xD800 | (ordinal >> 10); 623 *u++ = 0xDC00 | (ordinal & 0x3FF); 624 } 625 else 626 *u++ = *w++; 627 } 628 } 629 return (PyObject *)unicode; 630} 631 632#else 633 634PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 635 Py_ssize_t size) 636{ 637 PyUnicodeObject *unicode; 638 639 if (w == NULL) { 640 if (size == 0) 641 return PyUnicode_FromStringAndSize(NULL, 0); 642 PyErr_BadInternalCall(); 643 return NULL; 644 } 645 646 if (size == -1) { 647 size = wcslen(w); 648 } 649 650 unicode = _PyUnicode_New(size); 651 if (!unicode) 652 return NULL; 653 654 /* Copy the wchar_t data into the new object */ 655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 656 memcpy(unicode->str, w, size * sizeof(wchar_t)); 657#else 658 { 659 register Py_UNICODE *u; 660 register Py_ssize_t i; 661 u = PyUnicode_AS_UNICODE(unicode); 662 for (i = size; i > 0; i--) 663 *u++ = *w++; 664 } 665#endif 666 667 return (PyObject *)unicode; 668} 669 670#endif /* CONVERT_WCHAR_TO_SURROGATES */ 671 672#undef CONVERT_WCHAR_TO_SURROGATES 673 674static void 675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 676 int zeropad, int width, int precision, 
char c) 677{ 678 *fmt++ = '%'; 679 if (width) { 680 if (zeropad) 681 *fmt++ = '0'; 682 fmt += sprintf(fmt, "%d", width); 683 } 684 if (precision) 685 fmt += sprintf(fmt, ".%d", precision); 686 if (longflag) 687 *fmt++ = 'l'; 688 else if (longlongflag) { 689 /* longlongflag should only ever be nonzero on machines with 690 HAVE_LONG_LONG defined */ 691#ifdef HAVE_LONG_LONG 692 char *f = PY_FORMAT_LONG_LONG; 693 while (*f) 694 *fmt++ = *f++; 695#else 696 /* we shouldn't ever get here */ 697 assert(0); 698 *fmt++ = 'l'; 699#endif 700 } 701 else if (size_tflag) { 702 char *f = PY_FORMAT_SIZE_T; 703 while (*f) 704 *fmt++ = *f++; 705 } 706 *fmt++ = c; 707 *fmt = '\0'; 708} 709 710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 711 712/* size of fixed-size buffer for formatting single arguments */ 713#define ITEM_BUFFER_LEN 21 714/* maximum number of characters required for output of %ld. 21 characters 715 allows for 64-bit integers (in decimal) and an optional sign. */ 716#define MAX_LONG_CHARS 21 717/* maximum number of characters required for output of %lld. 718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 721 722PyObject * 723PyUnicode_FromFormatV(const char *format, va_list vargs) 724{ 725 va_list count; 726 Py_ssize_t callcount = 0; 727 PyObject **callresults = NULL; 728 PyObject **callresult = NULL; 729 Py_ssize_t n = 0; 730 int width = 0; 731 int precision = 0; 732 int zeropad; 733 const char* f; 734 Py_UNICODE *s; 735 PyObject *string; 736 /* used by sprintf */ 737 char buffer[ITEM_BUFFER_LEN+1]; 738 /* use abuffer instead of buffer, if we need more space 739 * (which can happen if there's a format specifier with width). 
*/ 740 char *abuffer = NULL; 741 char *realbuffer; 742 Py_ssize_t abuffersize = 0; 743 char fmt[61]; /* should be enough for %0width.precisionlld */ 744 const char *copy; 745 746 Py_VA_COPY(count, vargs); 747 /* step 1: count the number of %S/%R/%A/%s format specifications 748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 750 * result in an array) */ 751 for (f = format; *f; f++) { 752 if (*f == '%') { 753 if (*(f+1)=='%') 754 continue; 755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V') 756 ++callcount; 757 while (Py_ISDIGIT((unsigned)*f)) 758 width = (width*10) + *f++ - '0'; 759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) 760 ; 761 if (*f == 's') 762 ++callcount; 763 } 764 else if (128 <= (unsigned char)*f) { 765 PyErr_Format(PyExc_ValueError, 766 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 767 "string, got a non-ASCII byte: 0x%02x", 768 (unsigned char)*f); 769 return NULL; 770 } 771 } 772 /* step 2: allocate memory for the results of 773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 774 if (callcount) { 775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 776 if (!callresults) { 777 PyErr_NoMemory(); 778 return NULL; 779 } 780 callresult = callresults; 781 } 782 /* step 3: figure out how large a buffer we need */ 783 for (f = format; *f; f++) { 784 if (*f == '%') { 785#ifdef HAVE_LONG_LONG 786 int longlongflag = 0; 787#endif 788 const char* p = f; 789 width = 0; 790 while (Py_ISDIGIT((unsigned)*f)) 791 width = (width*10) + *f++ - '0'; 792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) 793 ; 794 795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 796 * they don't affect the amount of space we reserve. 
797 */ 798 if (*f == 'l') { 799 if (f[1] == 'd' || f[1] == 'u') { 800 ++f; 801 } 802#ifdef HAVE_LONG_LONG 803 else if (f[1] == 'l' && 804 (f[2] == 'd' || f[2] == 'u')) { 805 longlongflag = 1; 806 f += 2; 807 } 808#endif 809 } 810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 811 ++f; 812 } 813 814 switch (*f) { 815 case 'c': 816 { 817#ifndef Py_UNICODE_WIDE 818 int ordinal = va_arg(count, int); 819 if (ordinal > 0xffff) 820 n += 2; 821 else 822 n++; 823#else 824 (void)va_arg(count, int); 825 n++; 826#endif 827 break; 828 } 829 case '%': 830 n++; 831 break; 832 case 'd': case 'u': case 'i': case 'x': 833 (void) va_arg(count, int); 834#ifdef HAVE_LONG_LONG 835 if (longlongflag) { 836 if (width < MAX_LONG_LONG_CHARS) 837 width = MAX_LONG_LONG_CHARS; 838 } 839 else 840#endif 841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 842 including sign. Decimal takes the most space. This 843 isn't enough for octal. If a width is specified we 844 need more (which we allocate later). */ 845 if (width < MAX_LONG_CHARS) 846 width = MAX_LONG_CHARS; 847 n += width; 848 /* XXX should allow for large precision here too. 
*/ 849 if (abuffersize < width) 850 abuffersize = width; 851 break; 852 case 's': 853 { 854 /* UTF-8 */ 855 const char *s = va_arg(count, const char*); 856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 857 if (!str) 858 goto fail; 859 n += PyUnicode_GET_SIZE(str); 860 /* Remember the str and switch to the next slot */ 861 *callresult++ = str; 862 break; 863 } 864 case 'U': 865 { 866 PyObject *obj = va_arg(count, PyObject *); 867 assert(obj && PyUnicode_Check(obj)); 868 n += PyUnicode_GET_SIZE(obj); 869 break; 870 } 871 case 'V': 872 { 873 PyObject *obj = va_arg(count, PyObject *); 874 const char *str = va_arg(count, const char *); 875 PyObject *str_obj; 876 assert(obj || str); 877 assert(!obj || PyUnicode_Check(obj)); 878 if (obj) { 879 n += PyUnicode_GET_SIZE(obj); 880 *callresult++ = NULL; 881 } 882 else { 883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 884 if (!str_obj) 885 goto fail; 886 n += PyUnicode_GET_SIZE(str_obj); 887 *callresult++ = str_obj; 888 } 889 break; 890 } 891 case 'S': 892 { 893 PyObject *obj = va_arg(count, PyObject *); 894 PyObject *str; 895 assert(obj); 896 str = PyObject_Str(obj); 897 if (!str) 898 goto fail; 899 n += PyUnicode_GET_SIZE(str); 900 /* Remember the str and switch to the next slot */ 901 *callresult++ = str; 902 break; 903 } 904 case 'R': 905 { 906 PyObject *obj = va_arg(count, PyObject *); 907 PyObject *repr; 908 assert(obj); 909 repr = PyObject_Repr(obj); 910 if (!repr) 911 goto fail; 912 n += PyUnicode_GET_SIZE(repr); 913 /* Remember the repr and switch to the next slot */ 914 *callresult++ = repr; 915 break; 916 } 917 case 'A': 918 { 919 PyObject *obj = va_arg(count, PyObject *); 920 PyObject *ascii; 921 assert(obj); 922 ascii = PyObject_ASCII(obj); 923 if (!ascii) 924 goto fail; 925 n += PyUnicode_GET_SIZE(ascii); 926 /* Remember the repr and switch to the next slot */ 927 *callresult++ = ascii; 928 break; 929 } 930 case 'p': 931 (void) va_arg(count, int); 932 /* maximum 64-bit pointer 
representation: 933 * 0xffffffffffffffff 934 * so 19 characters is enough. 935 * XXX I count 18 -- what's the extra for? 936 */ 937 n += 19; 938 break; 939 default: 940 /* if we stumble upon an unknown 941 formatting code, copy the rest of 942 the format string to the output 943 string. (we cannot just skip the 944 code, since there's no way to know 945 what's in the argument list) */ 946 n += strlen(p); 947 goto expand; 948 } 949 } else 950 n++; 951 } 952 expand: 953 if (abuffersize > ITEM_BUFFER_LEN) { 954 /* add 1 for sprintf's trailing null byte */ 955 abuffer = PyObject_Malloc(abuffersize + 1); 956 if (!abuffer) { 957 PyErr_NoMemory(); 958 goto fail; 959 } 960 realbuffer = abuffer; 961 } 962 else 963 realbuffer = buffer; 964 /* step 4: fill the buffer */ 965 /* Since we've analyzed how much space we need for the worst case, 966 we don't have to resize the string. 967 There can be no errors beyond this point. */ 968 string = PyUnicode_FromUnicode(NULL, n); 969 if (!string) 970 goto fail; 971 972 s = PyUnicode_AS_UNICODE(string); 973 callresult = callresults; 974 975 for (f = format; *f; f++) { 976 if (*f == '%') { 977 const char* p = f++; 978 int longflag = 0; 979 int longlongflag = 0; 980 int size_tflag = 0; 981 zeropad = (*f == '0'); 982 /* parse the width.precision part */ 983 width = 0; 984 while (Py_ISDIGIT((unsigned)*f)) 985 width = (width*10) + *f++ - '0'; 986 precision = 0; 987 if (*f == '.') { 988 f++; 989 while (Py_ISDIGIT((unsigned)*f)) 990 precision = (precision*10) + *f++ - '0'; 991 } 992 /* Handle %ld, %lu, %lld and %llu. */ 993 if (*f == 'l') { 994 if (f[1] == 'd' || f[1] == 'u') { 995 longflag = 1; 996 ++f; 997 } 998#ifdef HAVE_LONG_LONG 999 else if (f[1] == 'l' && 1000 (f[2] == 'd' || f[2] == 'u')) { 1001 longlongflag = 1; 1002 f += 2; 1003 } 1004#endif 1005 } 1006 /* handle the size_t flag. 
*/ 1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 1008 size_tflag = 1; 1009 ++f; 1010 } 1011 1012 switch (*f) { 1013 case 'c': 1014 { 1015 int ordinal = va_arg(vargs, int); 1016#ifndef Py_UNICODE_WIDE 1017 if (ordinal > 0xffff) { 1018 ordinal -= 0x10000; 1019 *s++ = 0xD800 | (ordinal >> 10); 1020 *s++ = 0xDC00 | (ordinal & 0x3FF); 1021 } else 1022#endif 1023 *s++ = ordinal; 1024 break; 1025 } 1026 case 'd': 1027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1028 width, precision, 'd'); 1029 if (longflag) 1030 sprintf(realbuffer, fmt, va_arg(vargs, long)); 1031#ifdef HAVE_LONG_LONG 1032 else if (longlongflag) 1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1034#endif 1035 else if (size_tflag) 1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1037 else 1038 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1039 appendstring(realbuffer); 1040 break; 1041 case 'u': 1042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1043 width, precision, 'u'); 1044 if (longflag) 1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1046#ifdef HAVE_LONG_LONG 1047 else if (longlongflag) 1048 sprintf(realbuffer, fmt, va_arg(vargs, 1049 unsigned PY_LONG_LONG)); 1050#endif 1051 else if (size_tflag) 1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1053 else 1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1055 appendstring(realbuffer); 1056 break; 1057 case 'i': 1058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i'); 1059 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1060 appendstring(realbuffer); 1061 break; 1062 case 'x': 1063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1064 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1065 appendstring(realbuffer); 1066 break; 1067 case 's': 1068 { 1069 /* unused, since we already have the result */ 1070 (void) va_arg(vargs, char *); 1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1072 PyUnicode_GET_SIZE(*callresult)); 1073 s += 
PyUnicode_GET_SIZE(*callresult); 1074 /* We're done with the unicode()/repr() => forget it */ 1075 Py_DECREF(*callresult); 1076 /* switch to next unicode()/repr() result */ 1077 ++callresult; 1078 break; 1079 } 1080 case 'U': 1081 { 1082 PyObject *obj = va_arg(vargs, PyObject *); 1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1085 s += size; 1086 break; 1087 } 1088 case 'V': 1089 { 1090 PyObject *obj = va_arg(vargs, PyObject *); 1091 va_arg(vargs, const char *); 1092 if (obj) { 1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1095 s += size; 1096 } else { 1097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1098 PyUnicode_GET_SIZE(*callresult)); 1099 s += PyUnicode_GET_SIZE(*callresult); 1100 Py_DECREF(*callresult); 1101 } 1102 ++callresult; 1103 break; 1104 } 1105 case 'S': 1106 case 'R': 1107 case 'A': 1108 { 1109 Py_UNICODE *ucopy; 1110 Py_ssize_t usize; 1111 Py_ssize_t upos; 1112 /* unused, since we already have the result */ 1113 (void) va_arg(vargs, PyObject *); 1114 ucopy = PyUnicode_AS_UNICODE(*callresult); 1115 usize = PyUnicode_GET_SIZE(*callresult); 1116 for (upos = 0; upos<usize;) 1117 *s++ = ucopy[upos++]; 1118 /* We're done with the unicode()/repr() => forget it */ 1119 Py_DECREF(*callresult); 1120 /* switch to next unicode()/repr() result */ 1121 ++callresult; 1122 break; 1123 } 1124 case 'p': 1125 sprintf(buffer, "%p", va_arg(vargs, void*)); 1126 /* %p is ill-defined: ensure leading 0x. 
*/ 1127 if (buffer[1] == 'X') 1128 buffer[1] = 'x'; 1129 else if (buffer[1] != 'x') { 1130 memmove(buffer+2, buffer, strlen(buffer)+1); 1131 buffer[0] = '0'; 1132 buffer[1] = 'x'; 1133 } 1134 appendstring(buffer); 1135 break; 1136 case '%': 1137 *s++ = '%'; 1138 break; 1139 default: 1140 appendstring(p); 1141 goto end; 1142 } 1143 } 1144 else 1145 *s++ = *f; 1146 } 1147 1148 end: 1149 if (callresults) 1150 PyObject_Free(callresults); 1151 if (abuffer) 1152 PyObject_Free(abuffer); 1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1154 return string; 1155 fail: 1156 if (callresults) { 1157 PyObject **callresult2 = callresults; 1158 while (callresult2 < callresult) { 1159 Py_XDECREF(*callresult2); 1160 ++callresult2; 1161 } 1162 PyObject_Free(callresults); 1163 } 1164 if (abuffer) 1165 PyObject_Free(abuffer); 1166 return NULL; 1167} 1168 1169#undef appendstring 1170 1171PyObject * 1172PyUnicode_FromFormat(const char *format, ...) 1173{ 1174 PyObject* ret; 1175 va_list vargs; 1176 1177#ifdef HAVE_STDARG_PROTOTYPES 1178 va_start(vargs, format); 1179#else 1180 va_start(vargs); 1181#endif 1182 ret = PyUnicode_FromFormatV(format, vargs); 1183 va_end(vargs); 1184 return ret; 1185} 1186 1187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1188 convert a Unicode object to a wide character string. 1189 1190 - If w is NULL: return the number of wide characters (including the null 1191 character) required to convert the unicode object. Ignore size argument. 1192 1193 - Otherwise: return the number of wide characters (excluding the null 1194 character) written into w. Write at most size wide characters (including 1195 the null character). 
*/ 1196static Py_ssize_t 1197unicode_aswidechar(PyUnicodeObject *unicode, 1198 wchar_t *w, 1199 Py_ssize_t size) 1200{ 1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1202 Py_ssize_t res; 1203 if (w != NULL) { 1204 res = PyUnicode_GET_SIZE(unicode); 1205 if (size > res) 1206 size = res + 1; 1207 else 1208 res = size; 1209 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1210 return res; 1211 } 1212 else 1213 return PyUnicode_GET_SIZE(unicode) + 1; 1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 1215 register const Py_UNICODE *u; 1216 const Py_UNICODE *uend; 1217 const wchar_t *worig, *wend; 1218 Py_ssize_t nchar; 1219 1220 u = PyUnicode_AS_UNICODE(unicode); 1221 uend = u + PyUnicode_GET_SIZE(unicode); 1222 if (w != NULL) { 1223 worig = w; 1224 wend = w + size; 1225 while (u != uend && w != wend) { 1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1228 { 1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; 1230 u += 2; 1231 } 1232 else { 1233 *w = *u; 1234 u++; 1235 } 1236 w++; 1237 } 1238 if (w != wend) 1239 *w = L'\0'; 1240 return w - worig; 1241 } 1242 else { 1243 nchar = 1; /* null character at the end */ 1244 while (u != uend) { 1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1247 u += 2; 1248 else 1249 u++; 1250 nchar++; 1251 } 1252 } 1253 return nchar; 1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 1255 register Py_UNICODE *u, *uend, ordinal; 1256 register Py_ssize_t i; 1257 wchar_t *worig, *wend; 1258 Py_ssize_t nchar; 1259 1260 u = PyUnicode_AS_UNICODE(unicode); 1261 uend = u + PyUnicode_GET_SIZE(u); 1262 if (w != NULL) { 1263 worig = w; 1264 wend = w + size; 1265 while (u != uend && w != wend) { 1266 ordinal = *u; 1267 if (ordinal > 0xffff) { 1268 ordinal -= 0x10000; 1269 *w++ = 0xD800 | (ordinal >> 10); 1270 *w++ = 0xDC00 | (ordinal & 0x3FF); 1271 } 1272 else 1273 *w++ = ordinal; 1274 u++; 1275 } 1276 if (w != wend) 1277 *w = 0; 1278 return w - worig; 1279 } 1280 else { 
1281 nchar = 1; /* null character */ 1282 while (u != uend) { 1283 if (*u > 0xffff) 1284 nchar += 2; 1285 else 1286 nchar++; 1287 u++; 1288 } 1289 return nchar; 1290 } 1291#else 1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" 1293#endif 1294} 1295 1296Py_ssize_t 1297PyUnicode_AsWideChar(PyObject *unicode, 1298 wchar_t *w, 1299 Py_ssize_t size) 1300{ 1301 if (unicode == NULL) { 1302 PyErr_BadInternalCall(); 1303 return -1; 1304 } 1305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 1306} 1307 1308wchar_t* 1309PyUnicode_AsWideCharString(PyObject *unicode, 1310 Py_ssize_t *size) 1311{ 1312 wchar_t* buffer; 1313 Py_ssize_t buflen; 1314 1315 if (unicode == NULL) { 1316 PyErr_BadInternalCall(); 1317 return NULL; 1318 } 1319 1320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 1321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 1322 PyErr_NoMemory(); 1323 return NULL; 1324 } 1325 1326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 1327 if (buffer == NULL) { 1328 PyErr_NoMemory(); 1329 return NULL; 1330 } 1331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 1332 if (size != NULL) 1333 *size = buflen; 1334 return buffer; 1335} 1336 1337#endif 1338 1339PyObject *PyUnicode_FromOrdinal(int ordinal) 1340{ 1341 Py_UNICODE s[2]; 1342 1343 if (ordinal < 0 || ordinal > 0x10ffff) { 1344 PyErr_SetString(PyExc_ValueError, 1345 "chr() arg not in range(0x110000)"); 1346 return NULL; 1347 } 1348 1349#ifndef Py_UNICODE_WIDE 1350 if (ordinal > 0xffff) { 1351 ordinal -= 0x10000; 1352 s[0] = 0xD800 | (ordinal >> 10); 1353 s[1] = 0xDC00 | (ordinal & 0x3FF); 1354 return PyUnicode_FromUnicode(s, 2); 1355 } 1356#endif 1357 1358 s[0] = (Py_UNICODE)ordinal; 1359 return PyUnicode_FromUnicode(s, 1); 1360} 1361 1362PyObject *PyUnicode_FromObject(register PyObject *obj) 1363{ 1364 /* XXX Perhaps we should make this API an alias of 1365 PyObject_Str() instead ?! 
*/ 1366 if (PyUnicode_CheckExact(obj)) { 1367 Py_INCREF(obj); 1368 return obj; 1369 } 1370 if (PyUnicode_Check(obj)) { 1371 /* For a Unicode subtype that's not a Unicode object, 1372 return a true Unicode object with the same data. */ 1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1374 PyUnicode_GET_SIZE(obj)); 1375 } 1376 PyErr_Format(PyExc_TypeError, 1377 "Can't convert '%.100s' object to str implicitly", 1378 Py_TYPE(obj)->tp_name); 1379 return NULL; 1380} 1381 1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1383 const char *encoding, 1384 const char *errors) 1385{ 1386 Py_buffer buffer; 1387 PyObject *v; 1388 1389 if (obj == NULL) { 1390 PyErr_BadInternalCall(); 1391 return NULL; 1392 } 1393 1394 /* Decoding bytes objects is the most common case and should be fast */ 1395 if (PyBytes_Check(obj)) { 1396 if (PyBytes_GET_SIZE(obj) == 0) { 1397 Py_INCREF(unicode_empty); 1398 v = (PyObject *) unicode_empty; 1399 } 1400 else { 1401 v = PyUnicode_Decode( 1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1403 encoding, errors); 1404 } 1405 return v; 1406 } 1407 1408 if (PyUnicode_Check(obj)) { 1409 PyErr_SetString(PyExc_TypeError, 1410 "decoding str is not supported"); 1411 return NULL; 1412 } 1413 1414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1416 PyErr_Format(PyExc_TypeError, 1417 "coercing to str: need bytes, bytearray " 1418 "or buffer-like object, %.80s found", 1419 Py_TYPE(obj)->tp_name); 1420 return NULL; 1421 } 1422 1423 if (buffer.len == 0) { 1424 Py_INCREF(unicode_empty); 1425 v = (PyObject *) unicode_empty; 1426 } 1427 else 1428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1429 1430 PyBuffer_Release(&buffer); 1431 return v; 1432} 1433 1434/* Convert encoding to lower case and replace '_' with '-' in order to 1435 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 1436 1 on success. 
/* Copy encoding into lower, folding upper-case letters to lower case and
   turning '_' into '-' (so that e.g. "UTF_8" matches "utf-8").

   lower has room for lower_len bytes including the terminating null.
   Returns 1 on success; returns 0 if encoding does not fit, in which case
   lower holds a partial, unterminated copy. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the output buffer, reserved for the terminator */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (dst == last)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
onError; 1508 if (!PyUnicode_Check(unicode)) { 1509 PyErr_Format(PyExc_TypeError, 1510 "decoder did not return a str object (type=%.400s)", 1511 Py_TYPE(unicode)->tp_name); 1512 Py_DECREF(unicode); 1513 goto onError; 1514 } 1515 Py_DECREF(buffer); 1516 return unicode; 1517 1518 onError: 1519 Py_XDECREF(buffer); 1520 return NULL; 1521} 1522 1523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1524 const char *encoding, 1525 const char *errors) 1526{ 1527 PyObject *v; 1528 1529 if (!PyUnicode_Check(unicode)) { 1530 PyErr_BadArgument(); 1531 goto onError; 1532 } 1533 1534 if (encoding == NULL) 1535 encoding = PyUnicode_GetDefaultEncoding(); 1536 1537 /* Decode via the codec registry */ 1538 v = PyCodec_Decode(unicode, encoding, errors); 1539 if (v == NULL) 1540 goto onError; 1541 return v; 1542 1543 onError: 1544 return NULL; 1545} 1546 1547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1548 const char *encoding, 1549 const char *errors) 1550{ 1551 PyObject *v; 1552 1553 if (!PyUnicode_Check(unicode)) { 1554 PyErr_BadArgument(); 1555 goto onError; 1556 } 1557 1558 if (encoding == NULL) 1559 encoding = PyUnicode_GetDefaultEncoding(); 1560 1561 /* Decode via the codec registry */ 1562 v = PyCodec_Decode(unicode, encoding, errors); 1563 if (v == NULL) 1564 goto onError; 1565 if (!PyUnicode_Check(v)) { 1566 PyErr_Format(PyExc_TypeError, 1567 "decoder did not return a str object (type=%.400s)", 1568 Py_TYPE(v)->tp_name); 1569 Py_DECREF(v); 1570 goto onError; 1571 } 1572 return v; 1573 1574 onError: 1575 return NULL; 1576} 1577 1578PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1579 Py_ssize_t size, 1580 const char *encoding, 1581 const char *errors) 1582{ 1583 PyObject *v, *unicode; 1584 1585 unicode = PyUnicode_FromUnicode(s, size); 1586 if (unicode == NULL) 1587 return NULL; 1588 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1589 Py_DECREF(unicode); 1590 return v; 1591} 1592 1593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1594 const 
char *encoding, 1595 const char *errors) 1596{ 1597 PyObject *v; 1598 1599 if (!PyUnicode_Check(unicode)) { 1600 PyErr_BadArgument(); 1601 goto onError; 1602 } 1603 1604 if (encoding == NULL) 1605 encoding = PyUnicode_GetDefaultEncoding(); 1606 1607 /* Encode via the codec registry */ 1608 v = PyCodec_Encode(unicode, encoding, errors); 1609 if (v == NULL) 1610 goto onError; 1611 return v; 1612 1613 onError: 1614 return NULL; 1615} 1616 1617PyObject * 1618PyUnicode_EncodeFSDefault(PyObject *unicode) 1619{ 1620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1622 PyUnicode_GET_SIZE(unicode), 1623 NULL); 1624#elif defined(__APPLE__) 1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1626 PyUnicode_GET_SIZE(unicode), 1627 "surrogateescape"); 1628#else 1629 PyInterpreterState *interp = PyThreadState_GET()->interp; 1630 /* Bootstrap check: if the filesystem codec is implemented in Python, we 1631 cannot use it to encode and decode filenames before it is loaded. Load 1632 the Python codec requires to encode at least its own filename. Use the C 1633 version of the locale codec until the codec registry is initialized and 1634 the Python codec is loaded. 1635 1636 Py_FileSystemDefaultEncoding is shared between all interpreters, we 1637 cannot only rely on it: check also interp->fscodec_initialized for 1638 subinterpreters. 
*/ 1639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 1640 return PyUnicode_AsEncodedString(unicode, 1641 Py_FileSystemDefaultEncoding, 1642 "surrogateescape"); 1643 } 1644 else { 1645 /* locale encoding with surrogateescape */ 1646 wchar_t *wchar; 1647 char *bytes; 1648 PyObject *bytes_obj; 1649 size_t error_pos; 1650 1651 wchar = PyUnicode_AsWideCharString(unicode, NULL); 1652 if (wchar == NULL) 1653 return NULL; 1654 bytes = _Py_wchar2char(wchar, &error_pos); 1655 if (bytes == NULL) { 1656 if (error_pos != (size_t)-1) { 1657 char *errmsg = strerror(errno); 1658 PyObject *exc = NULL; 1659 if (errmsg == NULL) 1660 errmsg = "Py_wchar2char() failed"; 1661 raise_encode_exception(&exc, 1662 "filesystemencoding", 1663 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 1664 error_pos, error_pos+1, 1665 errmsg); 1666 Py_XDECREF(exc); 1667 } 1668 else 1669 PyErr_NoMemory(); 1670 PyMem_Free(wchar); 1671 return NULL; 1672 } 1673 PyMem_Free(wchar); 1674 1675 bytes_obj = PyBytes_FromString(bytes); 1676 PyMem_Free(bytes); 1677 return bytes_obj; 1678 } 1679#endif 1680} 1681 1682PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1683 const char *encoding, 1684 const char *errors) 1685{ 1686 PyObject *v; 1687 char lower[11]; /* Enough for any encoding shortcut */ 1688 1689 if (!PyUnicode_Check(unicode)) { 1690 PyErr_BadArgument(); 1691 return NULL; 1692 } 1693 1694 if (encoding == NULL) 1695 encoding = PyUnicode_GetDefaultEncoding(); 1696 1697 /* Shortcuts for common default encodings */ 1698 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1699 if (strcmp(lower, "utf-8") == 0) 1700 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1701 PyUnicode_GET_SIZE(unicode), 1702 errors); 1703 else if ((strcmp(lower, "latin-1") == 0) || 1704 (strcmp(lower, "iso-8859-1") == 0)) 1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1706 PyUnicode_GET_SIZE(unicode), 1707 errors); 1708#if defined(MS_WINDOWS) && 
defined(HAVE_USABLE_WCHAR_T) 1709 else if (strcmp(lower, "mbcs") == 0) 1710 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1711 PyUnicode_GET_SIZE(unicode), 1712 errors); 1713#endif 1714 else if (strcmp(lower, "ascii") == 0) 1715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1716 PyUnicode_GET_SIZE(unicode), 1717 errors); 1718 } 1719 /* During bootstrap, we may need to find the encodings 1720 package, to load the file system encoding, and require the 1721 file system encoding in order to load the encodings 1722 package. 1723 1724 Break out of this dependency by assuming that the path to 1725 the encodings module is ASCII-only. XXX could try wcstombs 1726 instead, if the file system encoding is the locale's 1727 encoding. */ 1728 if (Py_FileSystemDefaultEncoding && 1729 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && 1730 !PyThreadState_GET()->interp->codecs_initialized) 1731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1732 PyUnicode_GET_SIZE(unicode), 1733 errors); 1734 1735 /* Encode via the codec registry */ 1736 v = PyCodec_Encode(unicode, encoding, errors); 1737 if (v == NULL) 1738 return NULL; 1739 1740 /* The normal path */ 1741 if (PyBytes_Check(v)) 1742 return v; 1743 1744 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1745 if (PyByteArray_Check(v)) { 1746 int error; 1747 PyObject *b; 1748 1749 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1750 "encoder %s returned bytearray instead of bytes", 1751 encoding); 1752 if (error) { 1753 Py_DECREF(v); 1754 return NULL; 1755 } 1756 1757 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1758 Py_DECREF(v); 1759 return b; 1760 } 1761 1762 PyErr_Format(PyExc_TypeError, 1763 "encoder did not return a bytes object (type=%.400s)", 1764 Py_TYPE(v)->tp_name); 1765 Py_DECREF(v); 1766 return NULL; 1767} 1768 1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1770 const char *encoding, 1771 const char *errors) 1772{ 
1773 PyObject *v; 1774 1775 if (!PyUnicode_Check(unicode)) { 1776 PyErr_BadArgument(); 1777 goto onError; 1778 } 1779 1780 if (encoding == NULL) 1781 encoding = PyUnicode_GetDefaultEncoding(); 1782 1783 /* Encode via the codec registry */ 1784 v = PyCodec_Encode(unicode, encoding, errors); 1785 if (v == NULL) 1786 goto onError; 1787 if (!PyUnicode_Check(v)) { 1788 PyErr_Format(PyExc_TypeError, 1789 "encoder did not return an str object (type=%.400s)", 1790 Py_TYPE(v)->tp_name); 1791 Py_DECREF(v); 1792 goto onError; 1793 } 1794 return v; 1795 1796 onError: 1797 return NULL; 1798} 1799 1800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1801 const char *errors) 1802{ 1803 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1804 if (v) 1805 return v; 1806 if (errors != NULL) 1807 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1808 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1809 PyUnicode_GET_SIZE(unicode), 1810 NULL); 1811 if (!v) 1812 return NULL; 1813 ((PyUnicodeObject *)unicode)->defenc = v; 1814 return v; 1815} 1816 1817PyObject* 1818PyUnicode_DecodeFSDefault(const char *s) { 1819 Py_ssize_t size = (Py_ssize_t)strlen(s); 1820 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1821} 1822 1823PyObject* 1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1825{ 1826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1827 return PyUnicode_DecodeMBCS(s, size, NULL); 1828#elif defined(__APPLE__) 1829 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1830#else 1831 PyInterpreterState *interp = PyThreadState_GET()->interp; 1832 /* Bootstrap check: if the filesystem codec is implemented in Python, we 1833 cannot use it to encode and decode filenames before it is loaded. Load 1834 the Python codec requires to encode at least its own filename. Use the C 1835 version of the locale codec until the codec registry is initialized and 1836 the Python codec is loaded. 
1837 1838 Py_FileSystemDefaultEncoding is shared between all interpreters, we 1839 cannot only rely on it: check also interp->fscodec_initialized for 1840 subinterpreters. */ 1841 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 1842 return PyUnicode_Decode(s, size, 1843 Py_FileSystemDefaultEncoding, 1844 "surrogateescape"); 1845 } 1846 else { 1847 /* locale encoding with surrogateescape */ 1848 wchar_t *wchar; 1849 PyObject *unicode; 1850 size_t len; 1851 1852 if (s[size] != '\0' || size != strlen(s)) { 1853 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1854 return NULL; 1855 } 1856 1857 wchar = _Py_char2wchar(s, &len); 1858 if (wchar == NULL) 1859 return PyErr_NoMemory(); 1860 1861 unicode = PyUnicode_FromWideChar(wchar, len); 1862 PyMem_Free(wchar); 1863 return unicode; 1864 } 1865#endif 1866} 1867 1868 1869int 1870_PyUnicode_HasNULChars(PyObject* s) 1871{ 1872 static PyObject *nul = NULL; 1873 1874 if (nul == NULL) 1875 nul = PyUnicode_FromStringAndSize("\0", 1); 1876 if (nul == NULL) 1877 return -1; 1878 return PyUnicode_Contains(s, nul); 1879} 1880 1881 1882int 1883PyUnicode_FSConverter(PyObject* arg, void* addr) 1884{ 1885 PyObject *output = NULL; 1886 Py_ssize_t size; 1887 void *data; 1888 if (arg == NULL) { 1889 Py_DECREF(*(PyObject**)addr); 1890 return 1; 1891 } 1892 if (PyBytes_Check(arg)) { 1893 output = arg; 1894 Py_INCREF(output); 1895 } 1896 else { 1897 arg = PyUnicode_FromObject(arg); 1898 if (!arg) 1899 return 0; 1900 output = PyUnicode_EncodeFSDefault(arg); 1901 Py_DECREF(arg); 1902 if (!output) 1903 return 0; 1904 if (!PyBytes_Check(output)) { 1905 Py_DECREF(output); 1906 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1907 return 0; 1908 } 1909 } 1910 size = PyBytes_GET_SIZE(output); 1911 data = PyBytes_AS_STRING(output); 1912 if (size != strlen(data)) { 1913 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1914 Py_DECREF(output); 1915 return 0; 1916 } 1917 *(PyObject**)addr = 
output; 1918 return Py_CLEANUP_SUPPORTED; 1919} 1920 1921 1922int 1923PyUnicode_FSDecoder(PyObject* arg, void* addr) 1924{ 1925 PyObject *output = NULL; 1926 Py_ssize_t size; 1927 void *data; 1928 if (arg == NULL) { 1929 Py_DECREF(*(PyObject**)addr); 1930 return 1; 1931 } 1932 if (PyUnicode_Check(arg)) { 1933 output = arg; 1934 Py_INCREF(output); 1935 } 1936 else { 1937 arg = PyBytes_FromObject(arg); 1938 if (!arg) 1939 return 0; 1940 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1941 PyBytes_GET_SIZE(arg)); 1942 Py_DECREF(arg); 1943 if (!output) 1944 return 0; 1945 if (!PyUnicode_Check(output)) { 1946 Py_DECREF(output); 1947 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1948 return 0; 1949 } 1950 } 1951 size = PyUnicode_GET_SIZE(output); 1952 data = PyUnicode_AS_UNICODE(output); 1953 if (size != Py_UNICODE_strlen(data)) { 1954 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1955 Py_DECREF(output); 1956 return 0; 1957 } 1958 *(PyObject**)addr = output; 1959 return Py_CLEANUP_SUPPORTED; 1960} 1961 1962 1963char* 1964_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1965{ 1966 PyObject *bytes; 1967 if (!PyUnicode_Check(unicode)) { 1968 PyErr_BadArgument(); 1969 return NULL; 1970 } 1971 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1972 if (bytes == NULL) 1973 return NULL; 1974 if (psize != NULL) 1975 *psize = PyBytes_GET_SIZE(bytes); 1976 return PyBytes_AS_STRING(bytes); 1977} 1978 1979char* 1980_PyUnicode_AsString(PyObject *unicode) 1981{ 1982 return _PyUnicode_AsStringAndSize(unicode, NULL); 1983} 1984 1985Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1986{ 1987 if (!PyUnicode_Check(unicode)) { 1988 PyErr_BadArgument(); 1989 goto onError; 1990 } 1991 return PyUnicode_AS_UNICODE(unicode); 1992 1993 onError: 1994 return NULL; 1995} 1996 1997Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1998{ 1999 if (!PyUnicode_Check(unicode)) { 2000 PyErr_BadArgument(); 2001 goto onError; 2002 
} 2003 return PyUnicode_GET_SIZE(unicode); 2004 2005 onError: 2006 return -1; 2007} 2008 2009const char *PyUnicode_GetDefaultEncoding(void) 2010{ 2011 return "utf-8"; 2012} 2013 2014/* create or adjust a UnicodeDecodeError */ 2015static void 2016make_decode_exception(PyObject **exceptionObject, 2017 const char *encoding, 2018 const char *input, Py_ssize_t length, 2019 Py_ssize_t startpos, Py_ssize_t endpos, 2020 const char *reason) 2021{ 2022 if (*exceptionObject == NULL) { 2023 *exceptionObject = PyUnicodeDecodeError_Create( 2024 encoding, input, length, startpos, endpos, reason); 2025 } 2026 else { 2027 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 2028 goto onError; 2029 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 2030 goto onError; 2031 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 2032 goto onError; 2033 } 2034 return; 2035 2036onError: 2037 Py_DECREF(*exceptionObject); 2038 *exceptionObject = NULL; 2039} 2040 2041/* error handling callback helper: 2042 build arguments, call the callback and check the arguments, 2043 if no exception occurred, copy the replacement to the output 2044 and adjust various state variables. 
2045 return 0 on success, -1 on error 2046*/ 2047 2048static 2049int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 2050 const char *encoding, const char *reason, 2051 const char **input, const char **inend, Py_ssize_t *startinpos, 2052 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 2053 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 2054{ 2055 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 2056 2057 PyObject *restuple = NULL; 2058 PyObject *repunicode = NULL; 2059 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 2060 Py_ssize_t insize; 2061 Py_ssize_t requiredsize; 2062 Py_ssize_t newpos; 2063 Py_UNICODE *repptr; 2064 PyObject *inputobj = NULL; 2065 Py_ssize_t repsize; 2066 int res = -1; 2067 2068 if (*errorHandler == NULL) { 2069 *errorHandler = PyCodec_LookupError(errors); 2070 if (*errorHandler == NULL) 2071 goto onError; 2072 } 2073 2074 make_decode_exception(exceptionObject, 2075 encoding, 2076 *input, *inend - *input, 2077 *startinpos, *endinpos, 2078 reason); 2079 if (*exceptionObject == NULL) 2080 goto onError; 2081 2082 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 2083 if (restuple == NULL) 2084 goto onError; 2085 if (!PyTuple_Check(restuple)) { 2086 PyErr_SetString(PyExc_TypeError, &argparse[4]); 2087 goto onError; 2088 } 2089 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 2090 goto onError; 2091 2092 /* Copy back the bytes variables, which might have been modified by the 2093 callback */ 2094 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 2095 if (!inputobj) 2096 goto onError; 2097 if (!PyBytes_Check(inputobj)) { 2098 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 2099 } 2100 *input = PyBytes_AS_STRING(inputobj); 2101 insize = PyBytes_GET_SIZE(inputobj); 2102 *inend = *input + insize; 2103 /* we can DECREF safely, as the exception has 
another reference, 2104 so the object won't go away. */ 2105 Py_DECREF(inputobj); 2106 2107 if (newpos<0) 2108 newpos = insize+newpos; 2109 if (newpos<0 || newpos>insize) { 2110 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 2111 goto onError; 2112 } 2113 2114 /* need more space? (at least enough for what we 2115 have+the replacement+the rest of the string (starting 2116 at the new input position), so we won't have to check space 2117 when there are no errors in the rest of the string) */ 2118 repptr = PyUnicode_AS_UNICODE(repunicode); 2119 repsize = PyUnicode_GET_SIZE(repunicode); 2120 requiredsize = *outpos + repsize + insize-newpos; 2121 if (requiredsize > outsize) { 2122 if (requiredsize<2*outsize) 2123 requiredsize = 2*outsize; 2124 if (_PyUnicode_Resize(output, requiredsize) < 0) 2125 goto onError; 2126 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 2127 } 2128 *endinpos = newpos; 2129 *inptr = *input + newpos; 2130 Py_UNICODE_COPY(*outptr, repptr, repsize); 2131 *outptr += repsize; 2132 *outpos += repsize; 2133 2134 /* we made it! */ 2135 res = 0; 2136 2137 onError: 2138 Py_XDECREF(restuple); 2139 return res; 2140} 2141 2142/* --- UTF-7 Codec -------------------------------------------------------- */ 2143 2144/* See RFC2152 for details. We encode conservatively and decode liberally. */ 2145 2146/* Three simple macros defining base-64. */ 2147 2148/* Is c a base-64 character? */ 2149 2150#define IS_BASE64(c) \ 2151 (((c) >= 'A' && (c) <= 'Z') || \ 2152 ((c) >= 'a' && (c) <= 'z') || \ 2153 ((c) >= '0' && (c) <= '9') || \ 2154 (c) == '+' || (c) == '/') 2155 2156/* given that c is a base-64 character, what is its base-64 value? */ 2157 2158#define FROM_BASE64(c) \ 2159 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 2160 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 2161 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 2162 (c) == '+' ? 62 : 63) 2163 2164/* What is the base-64 character of the bottom 6 bits of n? 
/* What is the base-64 character of the bottom 6 bits of n?
   Only the low six bits of n are used (n is masked with 0x3f) to index the
   64-character base-64 alphabet. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in the range 1..127 to qualify; it is then looked up in
 * utf7_category: Set D (0) is always direct, whitespace (2) only when
 * directWS is true and Set O (1) only when directO is true.  Note that
 * c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Decode size bytes of UTF-7 data starting at s.  Thin wrapper around
   PyUnicode_DecodeUTF7Stateful() that passes NULL for the consumed
   argument. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 input bytes.
 * errors   : name of the error handler, passed on to
 *            unicode_decode_call_errorhandler().
 * consumed : if non-NULL, receives the number of input bytes accounted
 *            for; input ending inside a shift sequence is then not an
 *            error, and output and position are rewound to the '+' that
 *            opened it.  If NULL, an inconsistent state at the end of the
 *            input is reported as "unterminated shift sequence".
 *
 * Returns a new unicode object, or NULL with an exception set. */

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;             /* start of the input, for offsets */
    Py_ssize_t startinpos;              /* offset where the current sequence/error began */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                      /* end of the input */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                      /* output write pointer */
    const char *errmsg = "";
    int inShift = 0;                    /* non-zero while inside a base-64 section */
    Py_UNICODE *shiftOutStart;          /* output position when the current shift began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending base-64 bits */
    Py_UNICODE surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output is allocated with one slot per input byte and trimmed at
       the end. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
      restart:
        /* also entered via goto after the end-of-input error handler asked
           to resume inside the input */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* wide build: combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            /* narrow build: store the pair as-is */
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* lone high surrogate: emit it unchanged and
                               process outCh normally below */
                            *p++ = surrogate;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a pending high surrogate */
                    *p++ = surrogate;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Hand the error to the handler; it may replace the input, append
           replacement text and move s, e, p and unicode. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
            /* the handler may have moved s back into the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the result to the number of characters actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2479 *out++ = (char) ch; 2480 } 2481 else { 2482 *out++ = '+'; 2483 inShift = 1; 2484 goto encode_char; 2485 } 2486 } 2487 continue; 2488encode_char: 2489#ifdef Py_UNICODE_WIDE 2490 if (ch >= 0x10000) { 2491 /* code first surrogate */ 2492 base64bits += 16; 2493 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2494 while (base64bits >= 6) { 2495 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2496 base64bits -= 6; 2497 } 2498 /* prepare second surrogate */ 2499 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2500 } 2501#endif 2502 base64bits += 16; 2503 base64buffer = (base64buffer << 16) | ch; 2504 while (base64bits >= 6) { 2505 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2506 base64bits -= 6; 2507 } 2508 } 2509 if (base64bits) 2510 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2511 if (inShift) 2512 *out++ = '-'; 2513 if (_PyBytes_Resize(&v, out - start) < 0) 2514 return NULL; 2515 return v; 2516} 2517 2518#undef IS_BASE64 2519#undef FROM_BASE64 2520#undef TO_BASE64 2521#undef DECODE_DIRECT 2522#undef ENCODE_DIRECT 2523 2524/* --- UTF-8 Codec -------------------------------------------------------- */ 2525 2526static 2527char utf8_code_length[256] = { 2528 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2529 illegal prefix. 
See RFC 3629 for details */ 2530 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2531 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2532 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2533 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2534 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2535 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2538 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2540 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2541 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2542 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2543 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2544 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2545 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2546}; 2547 2548PyObject *PyUnicode_DecodeUTF8(const char *s, 2549 Py_ssize_t size, 2550 const char *errors) 2551{ 2552 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2553} 2554 2555/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2556#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2557 2558/* Mask to quickly check whether a C 'long' contains a 2559 non-ASCII, UTF8-encoded char. */ 2560#if (SIZEOF_LONG == 8) 2561# define ASCII_CHAR_MASK 0x8080808080808080L 2562#elif (SIZEOF_LONG == 4) 2563# define ASCII_CHAR_MASK 0x80808080L 2564#else 2565# error C 'long' size should be either 4 or 8! 
2566#endif 2567 2568PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2569 Py_ssize_t size, 2570 const char *errors, 2571 Py_ssize_t *consumed) 2572{ 2573 const char *starts = s; 2574 int n; 2575 int k; 2576 Py_ssize_t startinpos; 2577 Py_ssize_t endinpos; 2578 Py_ssize_t outpos; 2579 const char *e, *aligned_end; 2580 PyUnicodeObject *unicode; 2581 Py_UNICODE *p; 2582 const char *errmsg = ""; 2583 PyObject *errorHandler = NULL; 2584 PyObject *exc = NULL; 2585 2586 /* Note: size will always be longer than the resulting Unicode 2587 character count */ 2588 unicode = _PyUnicode_New(size); 2589 if (!unicode) 2590 return NULL; 2591 if (size == 0) { 2592 if (consumed) 2593 *consumed = 0; 2594 return (PyObject *)unicode; 2595 } 2596 2597 /* Unpack UTF-8 encoded data */ 2598 p = unicode->str; 2599 e = s + size; 2600 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2601 2602 while (s < e) { 2603 Py_UCS4 ch = (unsigned char)*s; 2604 2605 if (ch < 0x80) { 2606 /* Fast path for runs of ASCII characters. Given that common UTF-8 2607 input will consist of an overwhelming majority of ASCII 2608 characters, we try to optimize for this case by checking 2609 as many characters as a C 'long' can contain. 2610 First, check if we can do an aligned read, as most CPUs have 2611 a penalty for unaligned reads. 2612 */ 2613 if (!((size_t) s & LONG_PTR_MASK)) { 2614 /* Help register allocation */ 2615 register const char *_s = s; 2616 register Py_UNICODE *_p = p; 2617 while (_s < aligned_end) { 2618 /* Read a whole long at a time (either 4 or 8 bytes), 2619 and do a fast unrolled copy if it only contains ASCII 2620 characters. 
*/ 2621 unsigned long data = *(unsigned long *) _s; 2622 if (data & ASCII_CHAR_MASK) 2623 break; 2624 _p[0] = (unsigned char) _s[0]; 2625 _p[1] = (unsigned char) _s[1]; 2626 _p[2] = (unsigned char) _s[2]; 2627 _p[3] = (unsigned char) _s[3]; 2628#if (SIZEOF_LONG == 8) 2629 _p[4] = (unsigned char) _s[4]; 2630 _p[5] = (unsigned char) _s[5]; 2631 _p[6] = (unsigned char) _s[6]; 2632 _p[7] = (unsigned char) _s[7]; 2633#endif 2634 _s += SIZEOF_LONG; 2635 _p += SIZEOF_LONG; 2636 } 2637 s = _s; 2638 p = _p; 2639 if (s == e) 2640 break; 2641 ch = (unsigned char)*s; 2642 } 2643 } 2644 2645 if (ch < 0x80) { 2646 *p++ = (Py_UNICODE)ch; 2647 s++; 2648 continue; 2649 } 2650 2651 n = utf8_code_length[ch]; 2652 2653 if (s + n > e) { 2654 if (consumed) 2655 break; 2656 else { 2657 errmsg = "unexpected end of data"; 2658 startinpos = s-starts; 2659 endinpos = startinpos+1; 2660 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2661 endinpos++; 2662 goto utf8Error; 2663 } 2664 } 2665 2666 switch (n) { 2667 2668 case 0: 2669 errmsg = "invalid start byte"; 2670 startinpos = s-starts; 2671 endinpos = startinpos+1; 2672 goto utf8Error; 2673 2674 case 1: 2675 errmsg = "internal error"; 2676 startinpos = s-starts; 2677 endinpos = startinpos+1; 2678 goto utf8Error; 2679 2680 case 2: 2681 if ((s[1] & 0xc0) != 0x80) { 2682 errmsg = "invalid continuation byte"; 2683 startinpos = s-starts; 2684 endinpos = startinpos + 1; 2685 goto utf8Error; 2686 } 2687 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2688 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2689 *p++ = (Py_UNICODE)ch; 2690 break; 2691 2692 case 3: 2693 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2694 will result in surrogates in range d800-dfff. Surrogates are 2695 not valid UTF-8 so they are rejected. 
2696 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2697 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2698 if ((s[1] & 0xc0) != 0x80 || 2699 (s[2] & 0xc0) != 0x80 || 2700 ((unsigned char)s[0] == 0xE0 && 2701 (unsigned char)s[1] < 0xA0) || 2702 ((unsigned char)s[0] == 0xED && 2703 (unsigned char)s[1] > 0x9F)) { 2704 errmsg = "invalid continuation byte"; 2705 startinpos = s-starts; 2706 endinpos = startinpos + 1; 2707 2708 /* if s[1] first two bits are 1 and 0, then the invalid 2709 continuation byte is s[2], so increment endinpos by 1, 2710 if not, s[1] is invalid and endinpos doesn't need to 2711 be incremented. */ 2712 if ((s[1] & 0xC0) == 0x80) 2713 endinpos++; 2714 goto utf8Error; 2715 } 2716 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2717 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2718 *p++ = (Py_UNICODE)ch; 2719 break; 2720 2721 case 4: 2722 if ((s[1] & 0xc0) != 0x80 || 2723 (s[2] & 0xc0) != 0x80 || 2724 (s[3] & 0xc0) != 0x80 || 2725 ((unsigned char)s[0] == 0xF0 && 2726 (unsigned char)s[1] < 0x90) || 2727 ((unsigned char)s[0] == 0xF4 && 2728 (unsigned char)s[1] > 0x8F)) { 2729 errmsg = "invalid continuation byte"; 2730 startinpos = s-starts; 2731 endinpos = startinpos + 1; 2732 if ((s[1] & 0xC0) == 0x80) { 2733 endinpos++; 2734 if ((s[2] & 0xC0) == 0x80) 2735 endinpos++; 2736 } 2737 goto utf8Error; 2738 } 2739 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2740 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2741 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2742 2743#ifdef Py_UNICODE_WIDE 2744 *p++ = (Py_UNICODE)ch; 2745#else 2746 /* compute and append the two surrogates: */ 2747 2748 /* translate from 10000..10FFFF to 0..FFFF */ 2749 ch -= 0x10000; 2750 2751 /* high surrogate = top 10 bits added to D800 */ 2752 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2753 2754 /* low surrogate = bottom 10 bits added to DC00 */ 2755 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2756#endif 2757 break; 2758 } 2759 s += n; 2760 
continue; 2761 2762 utf8Error: 2763 outpos = p-PyUnicode_AS_UNICODE(unicode); 2764 if (unicode_decode_call_errorhandler( 2765 errors, &errorHandler, 2766 "utf-8", errmsg, 2767 &starts, &e, &startinpos, &endinpos, &exc, &s, 2768 &unicode, &outpos, &p)) 2769 goto onError; 2770 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2771 } 2772 if (consumed) 2773 *consumed = s-starts; 2774 2775 /* Adjust length */ 2776 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2777 goto onError; 2778 2779 Py_XDECREF(errorHandler); 2780 Py_XDECREF(exc); 2781 return (PyObject *)unicode; 2782 2783 onError: 2784 Py_XDECREF(errorHandler); 2785 Py_XDECREF(exc); 2786 Py_DECREF(unicode); 2787 return NULL; 2788} 2789 2790#undef ASCII_CHAR_MASK 2791 2792#ifdef __APPLE__ 2793 2794/* Simplified UTF-8 decoder using surrogateescape error handler, 2795 used to decode the command line arguments on Mac OS X. */ 2796 2797wchar_t* 2798_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 2799{ 2800 int n; 2801 const char *e; 2802 wchar_t *unicode, *p; 2803 2804 /* Note: size will always be longer than the resulting Unicode 2805 character count */ 2806 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 2807 PyErr_NoMemory(); 2808 return NULL; 2809 } 2810 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 2811 if (!unicode) 2812 return NULL; 2813 2814 /* Unpack UTF-8 encoded data */ 2815 p = unicode; 2816 e = s + size; 2817 while (s < e) { 2818 Py_UCS4 ch = (unsigned char)*s; 2819 2820 if (ch < 0x80) { 2821 *p++ = (wchar_t)ch; 2822 s++; 2823 continue; 2824 } 2825 2826 n = utf8_code_length[ch]; 2827 if (s + n > e) { 2828 goto surrogateescape; 2829 } 2830 2831 switch (n) { 2832 case 0: 2833 case 1: 2834 goto surrogateescape; 2835 2836 case 2: 2837 if ((s[1] & 0xc0) != 0x80) 2838 goto surrogateescape; 2839 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2840 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2841 *p++ = (wchar_t)ch; 2842 break; 2843 2844 case 3: 2845 /* Decoding UTF-8 
sequences in range \xed\xa0\x80-\xed\xbf\xbf 2846 will result in surrogates in range d800-dfff. Surrogates are 2847 not valid UTF-8 so they are rejected. 2848 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2849 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2850 if ((s[1] & 0xc0) != 0x80 || 2851 (s[2] & 0xc0) != 0x80 || 2852 ((unsigned char)s[0] == 0xE0 && 2853 (unsigned char)s[1] < 0xA0) || 2854 ((unsigned char)s[0] == 0xED && 2855 (unsigned char)s[1] > 0x9F)) { 2856 2857 goto surrogateescape; 2858 } 2859 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2860 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2861 *p++ = (Py_UNICODE)ch; 2862 break; 2863 2864 case 4: 2865 if ((s[1] & 0xc0) != 0x80 || 2866 (s[2] & 0xc0) != 0x80 || 2867 (s[3] & 0xc0) != 0x80 || 2868 ((unsigned char)s[0] == 0xF0 && 2869 (unsigned char)s[1] < 0x90) || 2870 ((unsigned char)s[0] == 0xF4 && 2871 (unsigned char)s[1] > 0x8F)) { 2872 goto surrogateescape; 2873 } 2874 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2875 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2876 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2877 2878#if SIZEOF_WCHAR_T == 4 2879 *p++ = (wchar_t)ch; 2880#else 2881 /* compute and append the two surrogates: */ 2882 2883 /* translate from 10000..10FFFF to 0..FFFF */ 2884 ch -= 0x10000; 2885 2886 /* high surrogate = top 10 bits added to D800 */ 2887 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 2888 2889 /* low surrogate = bottom 10 bits added to DC00 */ 2890 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 2891#endif 2892 break; 2893 } 2894 s += n; 2895 continue; 2896 2897 surrogateescape: 2898 *p++ = 0xDC00 + ch; 2899 s++; 2900 } 2901 *p = L'\0'; 2902 return unicode; 2903} 2904 2905#endif /* __APPLE__ */ 2906 2907/* Allocation strategy: if the string is short, convert into a stack buffer 2908 and allocate exactly as much space needed at the end. 
Else allocate the 2909 maximum possible needed (4 result bytes per Unicode character), and return 2910 the excess memory at the end. 2911*/ 2912PyObject * 2913PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2914 Py_ssize_t size, 2915 const char *errors) 2916{ 2917#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2918 2919 Py_ssize_t i; /* index into s of next input byte */ 2920 PyObject *result; /* result string object */ 2921 char *p; /* next free byte in output buffer */ 2922 Py_ssize_t nallocated; /* number of result bytes allocated */ 2923 Py_ssize_t nneeded; /* number of result bytes needed */ 2924 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2925 PyObject *errorHandler = NULL; 2926 PyObject *exc = NULL; 2927 2928 assert(s != NULL); 2929 assert(size >= 0); 2930 2931 if (size <= MAX_SHORT_UNICHARS) { 2932 /* Write into the stack buffer; nallocated can't overflow. 2933 * At the end, we'll allocate exactly as much heap space as it 2934 * turns out we need. 2935 */ 2936 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2937 result = NULL; /* will allocate after we're done */ 2938 p = stackbuf; 2939 } 2940 else { 2941 /* Overallocate on the heap, and give the excess back at the end. */ 2942 nallocated = size * 4; 2943 if (nallocated / 4 != size) /* overflow! 
*/ 2944 return PyErr_NoMemory(); 2945 result = PyBytes_FromStringAndSize(NULL, nallocated); 2946 if (result == NULL) 2947 return NULL; 2948 p = PyBytes_AS_STRING(result); 2949 } 2950 2951 for (i = 0; i < size;) { 2952 Py_UCS4 ch = s[i++]; 2953 2954 if (ch < 0x80) 2955 /* Encode ASCII */ 2956 *p++ = (char) ch; 2957 2958 else if (ch < 0x0800) { 2959 /* Encode Latin-1 */ 2960 *p++ = (char)(0xc0 | (ch >> 6)); 2961 *p++ = (char)(0x80 | (ch & 0x3f)); 2962 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2963#ifndef Py_UNICODE_WIDE 2964 /* Special case: check for high and low surrogate */ 2965 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2966 Py_UCS4 ch2 = s[i]; 2967 /* Combine the two surrogates to form a UCS4 value */ 2968 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2969 i++; 2970 2971 /* Encode UCS4 Unicode ordinals */ 2972 *p++ = (char)(0xf0 | (ch >> 18)); 2973 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2974 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2975 *p++ = (char)(0x80 | (ch & 0x3f)); 2976 } else { 2977#endif 2978 Py_ssize_t newpos; 2979 PyObject *rep; 2980 Py_ssize_t repsize, k; 2981 rep = unicode_encode_call_errorhandler 2982 (errors, &errorHandler, "utf-8", "surrogates not allowed", 2983 s, size, &exc, i-1, i, &newpos); 2984 if (!rep) 2985 goto error; 2986 2987 if (PyBytes_Check(rep)) 2988 repsize = PyBytes_GET_SIZE(rep); 2989 else 2990 repsize = PyUnicode_GET_SIZE(rep); 2991 2992 if (repsize > 4) { 2993 Py_ssize_t offset; 2994 2995 if (result == NULL) 2996 offset = p - stackbuf; 2997 else 2998 offset = p - PyBytes_AS_STRING(result); 2999 3000 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 3001 /* integer overflow */ 3002 PyErr_NoMemory(); 3003 goto error; 3004 } 3005 nallocated += repsize - 4; 3006 if (result != NULL) { 3007 if (_PyBytes_Resize(&result, nallocated) < 0) 3008 goto error; 3009 } else { 3010 result = PyBytes_FromStringAndSize(NULL, nallocated); 3011 if (result == NULL) 3012 goto error; 3013 
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 3014 } 3015 p = PyBytes_AS_STRING(result) + offset; 3016 } 3017 3018 if (PyBytes_Check(rep)) { 3019 char *prep = PyBytes_AS_STRING(rep); 3020 for(k = repsize; k > 0; k--) 3021 *p++ = *prep++; 3022 } else /* rep is unicode */ { 3023 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 3024 Py_UNICODE c; 3025 3026 for(k=0; k<repsize; k++) { 3027 c = prep[k]; 3028 if (0x80 <= c) { 3029 raise_encode_exception(&exc, "utf-8", s, size, 3030 i-1, i, "surrogates not allowed"); 3031 goto error; 3032 } 3033 *p++ = (char)prep[k]; 3034 } 3035 } 3036 Py_DECREF(rep); 3037#ifndef Py_UNICODE_WIDE 3038 } 3039#endif 3040 } else if (ch < 0x10000) { 3041 *p++ = (char)(0xe0 | (ch >> 12)); 3042 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3043 *p++ = (char)(0x80 | (ch & 0x3f)); 3044 } else /* ch >= 0x10000 */ { 3045 /* Encode UCS4 Unicode ordinals */ 3046 *p++ = (char)(0xf0 | (ch >> 18)); 3047 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 3048 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 3049 *p++ = (char)(0x80 | (ch & 0x3f)); 3050 } 3051 } 3052 3053 if (result == NULL) { 3054 /* This was stack allocated. */ 3055 nneeded = p - stackbuf; 3056 assert(nneeded <= nallocated); 3057 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 3058 } 3059 else { 3060 /* Cut back to size actually needed. 
*/ 3061 nneeded = p - PyBytes_AS_STRING(result); 3062 assert(nneeded <= nallocated); 3063 _PyBytes_Resize(&result, nneeded); 3064 } 3065 Py_XDECREF(errorHandler); 3066 Py_XDECREF(exc); 3067 return result; 3068 error: 3069 Py_XDECREF(errorHandler); 3070 Py_XDECREF(exc); 3071 Py_XDECREF(result); 3072 return NULL; 3073 3074#undef MAX_SHORT_UNICHARS 3075} 3076 3077PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 3078{ 3079 if (!PyUnicode_Check(unicode)) { 3080 PyErr_BadArgument(); 3081 return NULL; 3082 } 3083 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 3084 PyUnicode_GET_SIZE(unicode), 3085 NULL); 3086} 3087 3088/* --- UTF-32 Codec ------------------------------------------------------- */ 3089 3090PyObject * 3091PyUnicode_DecodeUTF32(const char *s, 3092 Py_ssize_t size, 3093 const char *errors, 3094 int *byteorder) 3095{ 3096 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 3097} 3098 3099PyObject * 3100PyUnicode_DecodeUTF32Stateful(const char *s, 3101 Py_ssize_t size, 3102 const char *errors, 3103 int *byteorder, 3104 Py_ssize_t *consumed) 3105{ 3106 const char *starts = s; 3107 Py_ssize_t startinpos; 3108 Py_ssize_t endinpos; 3109 Py_ssize_t outpos; 3110 PyUnicodeObject *unicode; 3111 Py_UNICODE *p; 3112#ifndef Py_UNICODE_WIDE 3113 int pairs = 0; 3114 const unsigned char *qq; 3115#else 3116 const int pairs = 0; 3117#endif 3118 const unsigned char *q, *e; 3119 int bo = 0; /* assume native ordering by default */ 3120 const char *errmsg = ""; 3121 /* Offsets from q for retrieving bytes in the right order. */ 3122#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3123 int iorder[] = {0, 1, 2, 3}; 3124#else 3125 int iorder[] = {3, 2, 1, 0}; 3126#endif 3127 PyObject *errorHandler = NULL; 3128 PyObject *exc = NULL; 3129 3130 q = (unsigned char *)s; 3131 e = q + size; 3132 3133 if (byteorder) 3134 bo = *byteorder; 3135 3136 /* Check for BOM marks (U+FEFF) in the input and adjust current 3137 byte order setting accordingly. 
In native mode, the leading BOM 3138 mark is skipped, in all other modes, it is copied to the output 3139 stream as-is (giving a ZWNBSP character). */ 3140 if (bo == 0) { 3141 if (size >= 4) { 3142 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3143 (q[iorder[1]] << 8) | q[iorder[0]]; 3144#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3145 if (bom == 0x0000FEFF) { 3146 q += 4; 3147 bo = -1; 3148 } 3149 else if (bom == 0xFFFE0000) { 3150 q += 4; 3151 bo = 1; 3152 } 3153#else 3154 if (bom == 0x0000FEFF) { 3155 q += 4; 3156 bo = 1; 3157 } 3158 else if (bom == 0xFFFE0000) { 3159 q += 4; 3160 bo = -1; 3161 } 3162#endif 3163 } 3164 } 3165 3166 if (bo == -1) { 3167 /* force LE */ 3168 iorder[0] = 0; 3169 iorder[1] = 1; 3170 iorder[2] = 2; 3171 iorder[3] = 3; 3172 } 3173 else if (bo == 1) { 3174 /* force BE */ 3175 iorder[0] = 3; 3176 iorder[1] = 2; 3177 iorder[2] = 1; 3178 iorder[3] = 0; 3179 } 3180 3181 /* On narrow builds we split characters outside the BMP into two 3182 codepoints => count how much extra space we need. */ 3183#ifndef Py_UNICODE_WIDE 3184 for (qq = q; qq < e; qq += 4) 3185 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 3186 pairs++; 3187#endif 3188 3189 /* This might be one to much, because of a BOM */ 3190 unicode = _PyUnicode_New((size+3)/4+pairs); 3191 if (!unicode) 3192 return NULL; 3193 if (size == 0) 3194 return (PyObject *)unicode; 3195 3196 /* Unpack UTF-32 encoded data */ 3197 p = unicode->str; 3198 3199 while (q < e) { 3200 Py_UCS4 ch; 3201 /* remaining bytes at the end? 
(size should be divisible by 4) */ 3202 if (e-q<4) { 3203 if (consumed) 3204 break; 3205 errmsg = "truncated data"; 3206 startinpos = ((const char *)q)-starts; 3207 endinpos = ((const char *)e)-starts; 3208 goto utf32Error; 3209 /* The remaining input chars are ignored if the callback 3210 chooses to skip the input */ 3211 } 3212 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3213 (q[iorder[1]] << 8) | q[iorder[0]]; 3214 3215 if (ch >= 0x110000) 3216 { 3217 errmsg = "codepoint not in range(0x110000)"; 3218 startinpos = ((const char *)q)-starts; 3219 endinpos = startinpos+4; 3220 goto utf32Error; 3221 } 3222#ifndef Py_UNICODE_WIDE 3223 if (ch >= 0x10000) 3224 { 3225 *p++ = 0xD800 | ((ch-0x10000) >> 10); 3226 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 3227 } 3228 else 3229#endif 3230 *p++ = ch; 3231 q += 4; 3232 continue; 3233 utf32Error: 3234 outpos = p-PyUnicode_AS_UNICODE(unicode); 3235 if (unicode_decode_call_errorhandler( 3236 errors, &errorHandler, 3237 "utf32", errmsg, 3238 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 3239 &unicode, &outpos, &p)) 3240 goto onError; 3241 } 3242 3243 if (byteorder) 3244 *byteorder = bo; 3245 3246 if (consumed) 3247 *consumed = (const char *)q-starts; 3248 3249 /* Adjust length */ 3250 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3251 goto onError; 3252 3253 Py_XDECREF(errorHandler); 3254 Py_XDECREF(exc); 3255 return (PyObject *)unicode; 3256 3257 onError: 3258 Py_DECREF(unicode); 3259 Py_XDECREF(errorHandler); 3260 Py_XDECREF(exc); 3261 return NULL; 3262} 3263 3264PyObject * 3265PyUnicode_EncodeUTF32(const Py_UNICODE *s, 3266 Py_ssize_t size, 3267 const char *errors, 3268 int byteorder) 3269{ 3270 PyObject *v; 3271 unsigned char *p; 3272 Py_ssize_t nsize, bytesize; 3273#ifndef Py_UNICODE_WIDE 3274 Py_ssize_t i, pairs; 3275#else 3276 const int pairs = 0; 3277#endif 3278 /* Offsets from p for storing byte pairs in the right order. 
*/ 3279#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3280 int iorder[] = {0, 1, 2, 3}; 3281#else 3282 int iorder[] = {3, 2, 1, 0}; 3283#endif 3284 3285#define STORECHAR(CH) \ 3286 do { \ 3287 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 3288 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 3289 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 3290 p[iorder[0]] = (CH) & 0xff; \ 3291 p += 4; \ 3292 } while(0) 3293 3294 /* In narrow builds we can output surrogate pairs as one codepoint, 3295 so we need less space. */ 3296#ifndef Py_UNICODE_WIDE 3297 for (i = pairs = 0; i < size-1; i++) 3298 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 3299 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 3300 pairs++; 3301#endif 3302 nsize = (size - pairs + (byteorder == 0)); 3303 bytesize = nsize * 4; 3304 if (bytesize / 4 != nsize) 3305 return PyErr_NoMemory(); 3306 v = PyBytes_FromStringAndSize(NULL, bytesize); 3307 if (v == NULL) 3308 return NULL; 3309 3310 p = (unsigned char *)PyBytes_AS_STRING(v); 3311 if (byteorder == 0) 3312 STORECHAR(0xFEFF); 3313 if (size == 0) 3314 goto done; 3315 3316 if (byteorder == -1) { 3317 /* force LE */ 3318 iorder[0] = 0; 3319 iorder[1] = 1; 3320 iorder[2] = 2; 3321 iorder[3] = 3; 3322 } 3323 else if (byteorder == 1) { 3324 /* force BE */ 3325 iorder[0] = 3; 3326 iorder[1] = 2; 3327 iorder[2] = 1; 3328 iorder[3] = 0; 3329 } 3330 3331 while (size-- > 0) { 3332 Py_UCS4 ch = *s++; 3333#ifndef Py_UNICODE_WIDE 3334 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 3335 Py_UCS4 ch2 = *s; 3336 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3337 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3338 s++; 3339 size--; 3340 } 3341 } 3342#endif 3343 STORECHAR(ch); 3344 } 3345 3346 done: 3347 return v; 3348#undef STORECHAR 3349} 3350 3351PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 3352{ 3353 if (!PyUnicode_Check(unicode)) { 3354 PyErr_BadArgument(); 3355 return NULL; 3356 } 3357 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3358 PyUnicode_GET_SIZE(unicode), 3359 NULL, 3360 0); 3361} 3362 3363/* --- 
UTF-16 Codec ------------------------------------------------------- */ 3364 3365PyObject * 3366PyUnicode_DecodeUTF16(const char *s, 3367 Py_ssize_t size, 3368 const char *errors, 3369 int *byteorder) 3370{ 3371 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3372} 3373 3374/* Two masks for fast checking of whether a C 'long' may contain 3375 UTF16-encoded surrogate characters. This is an efficient heuristic, 3376 assuming that non-surrogate characters with a code point >= 0x8000 are 3377 rare in most input. 3378 FAST_CHAR_MASK is used when the input is in native byte ordering, 3379 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3380*/ 3381#if (SIZEOF_LONG == 8) 3382# define FAST_CHAR_MASK 0x8000800080008000L 3383# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3384#elif (SIZEOF_LONG == 4) 3385# define FAST_CHAR_MASK 0x80008000L 3386# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3387#else 3388# error C 'long' size should be either 4 or 8! 3389#endif 3390 3391PyObject * 3392PyUnicode_DecodeUTF16Stateful(const char *s, 3393 Py_ssize_t size, 3394 const char *errors, 3395 int *byteorder, 3396 Py_ssize_t *consumed) 3397{ 3398 const char *starts = s; 3399 Py_ssize_t startinpos; 3400 Py_ssize_t endinpos; 3401 Py_ssize_t outpos; 3402 PyUnicodeObject *unicode; 3403 Py_UNICODE *p; 3404 const unsigned char *q, *e, *aligned_end; 3405 int bo = 0; /* assume native ordering by default */ 3406 int native_ordering = 0; 3407 const char *errmsg = ""; 3408 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 3409#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3410 int ihi = 1, ilo = 0; 3411#else 3412 int ihi = 0, ilo = 1; 3413#endif 3414 PyObject *errorHandler = NULL; 3415 PyObject *exc = NULL; 3416 3417 /* Note: size will always be longer than the resulting Unicode 3418 character count */ 3419 unicode = _PyUnicode_New(size); 3420 if (!unicode) 3421 return NULL; 3422 if (size == 0) 3423 return (PyObject *)unicode; 3424 3425 /* Unpack UTF-16 encoded data */ 3426 p = unicode->str; 3427 q = (unsigned char *)s; 3428 e = q + size; 3429 3430 if (byteorder) 3431 bo = *byteorder; 3432 3433 /* Check for BOM marks (U+FEFF) in the input and adjust current 3434 byte order setting accordingly. In native mode, the leading BOM 3435 mark is skipped, in all other modes, it is copied to the output 3436 stream as-is (giving a ZWNBSP character). */ 3437 if (bo == 0) { 3438 if (size >= 2) { 3439 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3440#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3441 if (bom == 0xFEFF) { 3442 q += 2; 3443 bo = -1; 3444 } 3445 else if (bom == 0xFFFE) { 3446 q += 2; 3447 bo = 1; 3448 } 3449#else 3450 if (bom == 0xFEFF) { 3451 q += 2; 3452 bo = 1; 3453 } 3454 else if (bom == 0xFFFE) { 3455 q += 2; 3456 bo = -1; 3457 } 3458#endif 3459 } 3460 } 3461 3462 if (bo == -1) { 3463 /* force LE */ 3464 ihi = 1; 3465 ilo = 0; 3466 } 3467 else if (bo == 1) { 3468 /* force BE */ 3469 ihi = 0; 3470 ilo = 1; 3471 } 3472#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3473 native_ordering = ilo < ihi; 3474#else 3475 native_ordering = ilo > ihi; 3476#endif 3477 3478 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3479 while (1) { 3480 Py_UNICODE ch; 3481 if (e - q < 2) { 3482 /* remaining byte at the end? 
(size should be even) */ 3483 if (q == e || consumed) 3484 break; 3485 errmsg = "truncated data"; 3486 startinpos = ((const char *)q) - starts; 3487 endinpos = ((const char *)e) - starts; 3488 outpos = p - PyUnicode_AS_UNICODE(unicode); 3489 goto utf16Error; 3490 /* The remaining input chars are ignored if the callback 3491 chooses to skip the input */ 3492 } 3493 /* First check for possible aligned read of a C 'long'. Unaligned 3494 reads are more expensive, better to defer to another iteration. */ 3495 if (!((size_t) q & LONG_PTR_MASK)) { 3496 /* Fast path for runs of non-surrogate chars. */ 3497 register const unsigned char *_q = q; 3498 Py_UNICODE *_p = p; 3499 if (native_ordering) { 3500 /* Native ordering is simple: as long as the input cannot 3501 possibly contain a surrogate char, do an unrolled copy 3502 of several 16-bit code points to the target object. 3503 The non-surrogate check is done on several input bytes 3504 at a time (as many as a C 'long' can contain). */ 3505 while (_q < aligned_end) { 3506 unsigned long data = * (unsigned long *) _q; 3507 if (data & FAST_CHAR_MASK) 3508 break; 3509 _p[0] = ((unsigned short *) _q)[0]; 3510 _p[1] = ((unsigned short *) _q)[1]; 3511#if (SIZEOF_LONG == 8) 3512 _p[2] = ((unsigned short *) _q)[2]; 3513 _p[3] = ((unsigned short *) _q)[3]; 3514#endif 3515 _q += SIZEOF_LONG; 3516 _p += SIZEOF_LONG / 2; 3517 } 3518 } 3519 else { 3520 /* Byteswapped ordering is similar, but we must decompose 3521 the copy bytewise, and take care of zero'ing out the 3522 upper bytes if the target object is in 32-bit units 3523 (that is, in UCS-4 builds). 
*/ 3524 while (_q < aligned_end) { 3525 unsigned long data = * (unsigned long *) _q; 3526 if (data & SWAPPED_FAST_CHAR_MASK) 3527 break; 3528 /* Zero upper bytes in UCS-4 builds */ 3529#if (Py_UNICODE_SIZE > 2) 3530 _p[0] = 0; 3531 _p[1] = 0; 3532#if (SIZEOF_LONG == 8) 3533 _p[2] = 0; 3534 _p[3] = 0; 3535#endif 3536#endif 3537 /* Issue #4916; UCS-4 builds on big endian machines must 3538 fill the two last bytes of each 4-byte unit. */ 3539#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3540# define OFF 2 3541#else 3542# define OFF 0 3543#endif 3544 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3545 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3546 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3547 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3548#if (SIZEOF_LONG == 8) 3549 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3550 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3551 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3552 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3553#endif 3554#undef OFF 3555 _q += SIZEOF_LONG; 3556 _p += SIZEOF_LONG / 2; 3557 } 3558 } 3559 p = _p; 3560 q = _q; 3561 if (e - q < 2) 3562 continue; 3563 } 3564 ch = (q[ihi] << 8) | q[ilo]; 3565 3566 q += 2; 3567 3568 if (ch < 0xD800 || ch > 0xDFFF) { 3569 *p++ = ch; 3570 continue; 3571 } 3572 3573 /* UTF-16 code pair: */ 3574 if (e - q < 2) { 3575 errmsg = "unexpected end of data"; 3576 startinpos = (((const char *)q) - 2) - starts; 3577 endinpos = ((const char *)e) - starts; 3578 goto utf16Error; 3579 } 3580 if (0xD800 <= ch && ch <= 0xDBFF) { 3581 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3582 q += 2; 3583 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3584#ifndef Py_UNICODE_WIDE 3585 *p++ = ch; 3586 *p++ = ch2; 3587#else 3588 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3589#endif 3590 continue; 3591 } 3592 else { 3593 errmsg = "illegal UTF-16 surrogate"; 3594 startinpos = (((const char 
*)q)-4)-starts; 3595 endinpos = startinpos+2; 3596 goto utf16Error; 3597 } 3598 3599 } 3600 errmsg = "illegal encoding"; 3601 startinpos = (((const char *)q)-2)-starts; 3602 endinpos = startinpos+2; 3603 /* Fall through to report the error */ 3604 3605 utf16Error: 3606 outpos = p - PyUnicode_AS_UNICODE(unicode); 3607 if (unicode_decode_call_errorhandler( 3608 errors, 3609 &errorHandler, 3610 "utf16", errmsg, 3611 &starts, 3612 (const char **)&e, 3613 &startinpos, 3614 &endinpos, 3615 &exc, 3616 (const char **)&q, 3617 &unicode, 3618 &outpos, 3619 &p)) 3620 goto onError; 3621 /* Update data because unicode_decode_call_errorhandler might have 3622 changed the input object. */ 3623 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3624 } 3625 3626 if (byteorder) 3627 *byteorder = bo; 3628 3629 if (consumed) 3630 *consumed = (const char *)q-starts; 3631 3632 /* Adjust length */ 3633 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3634 goto onError; 3635 3636 Py_XDECREF(errorHandler); 3637 Py_XDECREF(exc); 3638 return (PyObject *)unicode; 3639 3640 onError: 3641 Py_DECREF(unicode); 3642 Py_XDECREF(errorHandler); 3643 Py_XDECREF(exc); 3644 return NULL; 3645} 3646 3647#undef FAST_CHAR_MASK 3648#undef SWAPPED_FAST_CHAR_MASK 3649 3650PyObject * 3651PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3652 Py_ssize_t size, 3653 const char *errors, 3654 int byteorder) 3655{ 3656 PyObject *v; 3657 unsigned char *p; 3658 Py_ssize_t nsize, bytesize; 3659#ifdef Py_UNICODE_WIDE 3660 Py_ssize_t i, pairs; 3661#else 3662 const int pairs = 0; 3663#endif 3664 /* Offsets from p for storing byte pairs in the right order. 
*/ 3665#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3666 int ihi = 1, ilo = 0; 3667#else 3668 int ihi = 0, ilo = 1; 3669#endif 3670 3671#define STORECHAR(CH) \ 3672 do { \ 3673 p[ihi] = ((CH) >> 8) & 0xff; \ 3674 p[ilo] = (CH) & 0xff; \ 3675 p += 2; \ 3676 } while(0) 3677 3678#ifdef Py_UNICODE_WIDE 3679 for (i = pairs = 0; i < size; i++) 3680 if (s[i] >= 0x10000) 3681 pairs++; 3682#endif 3683 /* 2 * (size + pairs + (byteorder == 0)) */ 3684 if (size > PY_SSIZE_T_MAX || 3685 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3686 return PyErr_NoMemory(); 3687 nsize = size + pairs + (byteorder == 0); 3688 bytesize = nsize * 2; 3689 if (bytesize / 2 != nsize) 3690 return PyErr_NoMemory(); 3691 v = PyBytes_FromStringAndSize(NULL, bytesize); 3692 if (v == NULL) 3693 return NULL; 3694 3695 p = (unsigned char *)PyBytes_AS_STRING(v); 3696 if (byteorder == 0) 3697 STORECHAR(0xFEFF); 3698 if (size == 0) 3699 goto done; 3700 3701 if (byteorder == -1) { 3702 /* force LE */ 3703 ihi = 1; 3704 ilo = 0; 3705 } 3706 else if (byteorder == 1) { 3707 /* force BE */ 3708 ihi = 0; 3709 ilo = 1; 3710 } 3711 3712 while (size-- > 0) { 3713 Py_UNICODE ch = *s++; 3714 Py_UNICODE ch2 = 0; 3715#ifdef Py_UNICODE_WIDE 3716 if (ch >= 0x10000) { 3717 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3718 ch = 0xD800 | ((ch-0x10000) >> 10); 3719 } 3720#endif 3721 STORECHAR(ch); 3722 if (ch2) 3723 STORECHAR(ch2); 3724 } 3725 3726 done: 3727 return v; 3728#undef STORECHAR 3729} 3730 3731PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3732{ 3733 if (!PyUnicode_Check(unicode)) { 3734 PyErr_BadArgument(); 3735 return NULL; 3736 } 3737 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3738 PyUnicode_GET_SIZE(unicode), 3739 NULL, 3740 0); 3741} 3742 3743/* --- Unicode Escape Codec ----------------------------------------------- */ 3744 3745static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3746 3747PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 3748 Py_ssize_t size, 3749 const char *errors) 3750{ 3751 
const char *starts = s; 3752 Py_ssize_t startinpos; 3753 Py_ssize_t endinpos; 3754 Py_ssize_t outpos; 3755 int i; 3756 PyUnicodeObject *v; 3757 Py_UNICODE *p; 3758 const char *end; 3759 char* message; 3760 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3761 PyObject *errorHandler = NULL; 3762 PyObject *exc = NULL; 3763 3764 /* Escaped strings will always be longer than the resulting 3765 Unicode string, so we start with size here and then reduce the 3766 length after conversion to the true value. 3767 (but if the error callback returns a long replacement string 3768 we'll have to allocate more space) */ 3769 v = _PyUnicode_New(size); 3770 if (v == NULL) 3771 goto onError; 3772 if (size == 0) 3773 return (PyObject *)v; 3774 3775 p = PyUnicode_AS_UNICODE(v); 3776 end = s + size; 3777 3778 while (s < end) { 3779 unsigned char c; 3780 Py_UNICODE x; 3781 int digits; 3782 3783 /* Non-escape characters are interpreted as Unicode ordinals */ 3784 if (*s != '\\') { 3785 *p++ = (unsigned char) *s++; 3786 continue; 3787 } 3788 3789 startinpos = s-starts; 3790 /* \ - Escapes */ 3791 s++; 3792 c = *s++; 3793 if (s > end) 3794 c = '\0'; /* Invalid after \ */ 3795 switch (c) { 3796 3797 /* \x escapes */ 3798 case '\n': break; 3799 case '\\': *p++ = '\\'; break; 3800 case '\'': *p++ = '\''; break; 3801 case '\"': *p++ = '\"'; break; 3802 case 'b': *p++ = '\b'; break; 3803 case 'f': *p++ = '\014'; break; /* FF */ 3804 case 't': *p++ = '\t'; break; 3805 case 'n': *p++ = '\n'; break; 3806 case 'r': *p++ = '\r'; break; 3807 case 'v': *p++ = '\013'; break; /* VT */ 3808 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3809 3810 /* \OOO (octal) escapes */ 3811 case '0': case '1': case '2': case '3': 3812 case '4': case '5': case '6': case '7': 3813 x = s[-1] - '0'; 3814 if (s < end && '0' <= *s && *s <= '7') { 3815 x = (x<<3) + *s++ - '0'; 3816 if (s < end && '0' <= *s && *s <= '7') 3817 x = (x<<3) + *s++ - '0'; 3818 } 3819 *p++ = x; 3820 break; 3821 3822 /* hex 
escapes */ 3823 /* \xXX */ 3824 case 'x': 3825 digits = 2; 3826 message = "truncated \\xXX escape"; 3827 goto hexescape; 3828 3829 /* \uXXXX */ 3830 case 'u': 3831 digits = 4; 3832 message = "truncated \\uXXXX escape"; 3833 goto hexescape; 3834 3835 /* \UXXXXXXXX */ 3836 case 'U': 3837 digits = 8; 3838 message = "truncated \\UXXXXXXXX escape"; 3839 hexescape: 3840 chr = 0; 3841 outpos = p-PyUnicode_AS_UNICODE(v); 3842 if (s+digits>end) { 3843 endinpos = size; 3844 if (unicode_decode_call_errorhandler( 3845 errors, &errorHandler, 3846 "unicodeescape", "end of string in escape sequence", 3847 &starts, &end, &startinpos, &endinpos, &exc, &s, 3848 &v, &outpos, &p)) 3849 goto onError; 3850 goto nextByte; 3851 } 3852 for (i = 0; i < digits; ++i) { 3853 c = (unsigned char) s[i]; 3854 if (!Py_ISXDIGIT(c)) { 3855 endinpos = (s+i+1)-starts; 3856 if (unicode_decode_call_errorhandler( 3857 errors, &errorHandler, 3858 "unicodeescape", message, 3859 &starts, &end, &startinpos, &endinpos, &exc, &s, 3860 &v, &outpos, &p)) 3861 goto onError; 3862 goto nextByte; 3863 } 3864 chr = (chr<<4) & ~0xF; 3865 if (c >= '0' && c <= '9') 3866 chr += c - '0'; 3867 else if (c >= 'a' && c <= 'f') 3868 chr += 10 + c - 'a'; 3869 else 3870 chr += 10 + c - 'A'; 3871 } 3872 s += i; 3873 if (chr == 0xffffffff && PyErr_Occurred()) 3874 /* _decoding_error will have already written into the 3875 target buffer. */ 3876 break; 3877 store: 3878 /* when we get here, chr is a 32-bit unicode character */ 3879 if (chr <= 0xffff) 3880 /* UCS-2 character */ 3881 *p++ = (Py_UNICODE) chr; 3882 else if (chr <= 0x10ffff) { 3883 /* UCS-4 character. Either store directly, or as 3884 surrogate pair. 
*/ 3885#ifdef Py_UNICODE_WIDE 3886 *p++ = chr; 3887#else 3888 chr -= 0x10000L; 3889 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3890 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3891#endif 3892 } else { 3893 endinpos = s-starts; 3894 outpos = p-PyUnicode_AS_UNICODE(v); 3895 if (unicode_decode_call_errorhandler( 3896 errors, &errorHandler, 3897 "unicodeescape", "illegal Unicode character", 3898 &starts, &end, &startinpos, &endinpos, &exc, &s, 3899 &v, &outpos, &p)) 3900 goto onError; 3901 } 3902 break; 3903 3904 /* \N{name} */ 3905 case 'N': 3906 message = "malformed \\N character escape"; 3907 if (ucnhash_CAPI == NULL) { 3908 /* load the unicode data module */ 3909 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3910 if (ucnhash_CAPI == NULL) 3911 goto ucnhashError; 3912 } 3913 if (*s == '{') { 3914 const char *start = s+1; 3915 /* look for the closing brace */ 3916 while (*s != '}' && s < end) 3917 s++; 3918 if (s > start && s < end && *s == '}') { 3919 /* found a name. 
look it up in the unicode database */ 3920 message = "unknown Unicode character name"; 3921 s++; 3922 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3923 goto store; 3924 } 3925 } 3926 endinpos = s-starts; 3927 outpos = p-PyUnicode_AS_UNICODE(v); 3928 if (unicode_decode_call_errorhandler( 3929 errors, &errorHandler, 3930 "unicodeescape", message, 3931 &starts, &end, &startinpos, &endinpos, &exc, &s, 3932 &v, &outpos, &p)) 3933 goto onError; 3934 break; 3935 3936 default: 3937 if (s > end) { 3938 message = "\\ at end of string"; 3939 s--; 3940 endinpos = s-starts; 3941 outpos = p-PyUnicode_AS_UNICODE(v); 3942 if (unicode_decode_call_errorhandler( 3943 errors, &errorHandler, 3944 "unicodeescape", message, 3945 &starts, &end, &startinpos, &endinpos, &exc, &s, 3946 &v, &outpos, &p)) 3947 goto onError; 3948 } 3949 else { 3950 *p++ = '\\'; 3951 *p++ = (unsigned char)s[-1]; 3952 } 3953 break; 3954 } 3955 nextByte: 3956 ; 3957 } 3958 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3959 goto onError; 3960 Py_XDECREF(errorHandler); 3961 Py_XDECREF(exc); 3962 return (PyObject *)v; 3963 3964 ucnhashError: 3965 PyErr_SetString( 3966 PyExc_UnicodeError, 3967 "\\N escapes not supported (can't load unicodedata module)" 3968 ); 3969 Py_XDECREF(v); 3970 Py_XDECREF(errorHandler); 3971 Py_XDECREF(exc); 3972 return NULL; 3973 3974 onError: 3975 Py_XDECREF(v); 3976 Py_XDECREF(errorHandler); 3977 Py_XDECREF(exc); 3978 return NULL; 3979} 3980 3981/* Return a Unicode-Escape string version of the Unicode object. 3982 3983 If quotes is true, the string is enclosed in u"" or u'' quotes as 3984 appropriate. 
3985 3986*/ 3987 3988Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3989 Py_ssize_t size, 3990 Py_UNICODE ch) 3991{ 3992 /* like wcschr, but doesn't stop at NULL characters */ 3993 3994 while (size-- > 0) { 3995 if (*s == ch) 3996 return s; 3997 s++; 3998 } 3999 4000 return NULL; 4001} 4002 4003static const char *hexdigits = "0123456789abcdef"; 4004 4005PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 4006 Py_ssize_t size) 4007{ 4008 PyObject *repr; 4009 char *p; 4010 4011#ifdef Py_UNICODE_WIDE 4012 const Py_ssize_t expandsize = 10; 4013#else 4014 const Py_ssize_t expandsize = 6; 4015#endif 4016 4017 /* XXX(nnorwitz): rather than over-allocating, it would be 4018 better to choose a different scheme. Perhaps scan the 4019 first N-chars of the string and allocate based on that size. 4020 */ 4021 /* Initial allocation is based on the longest-possible unichr 4022 escape. 4023 4024 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 4025 unichr, so in this case it's the longest unichr escape. In 4026 narrow (UTF-16) builds this is five chars per source unichr 4027 since there are two unichrs in the surrogate pair, so in narrow 4028 (UTF-16) builds it's not the longest unichr escape. 4029 4030 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 4031 so in the narrow (UTF-16) build case it's the longest unichr 4032 escape. 
4033 */ 4034 4035 if (size == 0) 4036 return PyBytes_FromStringAndSize(NULL, 0); 4037 4038 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 4039 return PyErr_NoMemory(); 4040 4041 repr = PyBytes_FromStringAndSize(NULL, 4042 2 4043 + expandsize*size 4044 + 1); 4045 if (repr == NULL) 4046 return NULL; 4047 4048 p = PyBytes_AS_STRING(repr); 4049 4050 while (size-- > 0) { 4051 Py_UNICODE ch = *s++; 4052 4053 /* Escape backslashes */ 4054 if (ch == '\\') { 4055 *p++ = '\\'; 4056 *p++ = (char) ch; 4057 continue; 4058 } 4059 4060#ifdef Py_UNICODE_WIDE 4061 /* Map 21-bit characters to '\U00xxxxxx' */ 4062 else if (ch >= 0x10000) { 4063 *p++ = '\\'; 4064 *p++ = 'U'; 4065 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 4066 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 4067 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 4068 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 4069 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 4070 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 4071 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 4072 *p++ = hexdigits[ch & 0x0000000F]; 4073 continue; 4074 } 4075#else 4076 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4077 else if (ch >= 0xD800 && ch < 0xDC00) { 4078 Py_UNICODE ch2; 4079 Py_UCS4 ucs; 4080 4081 ch2 = *s++; 4082 size--; 4083 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4084 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4085 *p++ = '\\'; 4086 *p++ = 'U'; 4087 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 4088 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 4089 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 4090 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 4091 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 4092 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 4093 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 4094 *p++ = hexdigits[ucs & 0x0000000F]; 4095 continue; 4096 } 4097 /* Fall through: isolated surrogates are copied as-is */ 4098 s--; 4099 size++; 4100 } 4101#endif 4102 4103 /* Map 16-bit characters to '\uxxxx' */ 4104 if (ch >= 256) { 4105 *p++ = '\\'; 4106 *p++ = 
'u'; 4107 *p++ = hexdigits[(ch >> 12) & 0x000F]; 4108 *p++ = hexdigits[(ch >> 8) & 0x000F]; 4109 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4110 *p++ = hexdigits[ch & 0x000F]; 4111 } 4112 4113 /* Map special whitespace to '\t', \n', '\r' */ 4114 else if (ch == '\t') { 4115 *p++ = '\\'; 4116 *p++ = 't'; 4117 } 4118 else if (ch == '\n') { 4119 *p++ = '\\'; 4120 *p++ = 'n'; 4121 } 4122 else if (ch == '\r') { 4123 *p++ = '\\'; 4124 *p++ = 'r'; 4125 } 4126 4127 /* Map non-printable US ASCII to '\xhh' */ 4128 else if (ch < ' ' || ch >= 0x7F) { 4129 *p++ = '\\'; 4130 *p++ = 'x'; 4131 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4132 *p++ = hexdigits[ch & 0x000F]; 4133 } 4134 4135 /* Copy everything else as-is */ 4136 else 4137 *p++ = (char) ch; 4138 } 4139 4140 assert(p - PyBytes_AS_STRING(repr) > 0); 4141 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 4142 return NULL; 4143 return repr; 4144} 4145 4146PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 4147{ 4148 PyObject *s; 4149 if (!PyUnicode_Check(unicode)) { 4150 PyErr_BadArgument(); 4151 return NULL; 4152 } 4153 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4154 PyUnicode_GET_SIZE(unicode)); 4155 return s; 4156} 4157 4158/* --- Raw Unicode Escape Codec ------------------------------------------- */ 4159 4160PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 4161 Py_ssize_t size, 4162 const char *errors) 4163{ 4164 const char *starts = s; 4165 Py_ssize_t startinpos; 4166 Py_ssize_t endinpos; 4167 Py_ssize_t outpos; 4168 PyUnicodeObject *v; 4169 Py_UNICODE *p; 4170 const char *end; 4171 const char *bs; 4172 PyObject *errorHandler = NULL; 4173 PyObject *exc = NULL; 4174 4175 /* Escaped strings will always be longer than the resulting 4176 Unicode string, so we start with size here and then reduce the 4177 length after conversion to the true value. 
(But decoding error 4178 handler might have to resize the string) */ 4179 v = _PyUnicode_New(size); 4180 if (v == NULL) 4181 goto onError; 4182 if (size == 0) 4183 return (PyObject *)v; 4184 p = PyUnicode_AS_UNICODE(v); 4185 end = s + size; 4186 while (s < end) { 4187 unsigned char c; 4188 Py_UCS4 x; 4189 int i; 4190 int count; 4191 4192 /* Non-escape characters are interpreted as Unicode ordinals */ 4193 if (*s != '\\') { 4194 *p++ = (unsigned char)*s++; 4195 continue; 4196 } 4197 startinpos = s-starts; 4198 4199 /* \u-escapes are only interpreted iff the number of leading 4200 backslashes if odd */ 4201 bs = s; 4202 for (;s < end;) { 4203 if (*s != '\\') 4204 break; 4205 *p++ = (unsigned char)*s++; 4206 } 4207 if (((s - bs) & 1) == 0 || 4208 s >= end || 4209 (*s != 'u' && *s != 'U')) { 4210 continue; 4211 } 4212 p--; 4213 count = *s=='u' ? 4 : 8; 4214 s++; 4215 4216 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 4217 outpos = p-PyUnicode_AS_UNICODE(v); 4218 for (x = 0, i = 0; i < count; ++i, ++s) { 4219 c = (unsigned char)*s; 4220 if (!Py_ISXDIGIT(c)) { 4221 endinpos = s-starts; 4222 if (unicode_decode_call_errorhandler( 4223 errors, &errorHandler, 4224 "rawunicodeescape", "truncated \\uXXXX", 4225 &starts, &end, &startinpos, &endinpos, &exc, &s, 4226 &v, &outpos, &p)) 4227 goto onError; 4228 goto nextByte; 4229 } 4230 x = (x<<4) & ~0xF; 4231 if (c >= '0' && c <= '9') 4232 x += c - '0'; 4233 else if (c >= 'a' && c <= 'f') 4234 x += 10 + c - 'a'; 4235 else 4236 x += 10 + c - 'A'; 4237 } 4238 if (x <= 0xffff) 4239 /* UCS-2 character */ 4240 *p++ = (Py_UNICODE) x; 4241 else if (x <= 0x10ffff) { 4242 /* UCS-4 character. Either store directly, or as 4243 surrogate pair. 
*/ 4244#ifdef Py_UNICODE_WIDE 4245 *p++ = (Py_UNICODE) x; 4246#else 4247 x -= 0x10000L; 4248 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 4249 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 4250#endif 4251 } else { 4252 endinpos = s-starts; 4253 outpos = p-PyUnicode_AS_UNICODE(v); 4254 if (unicode_decode_call_errorhandler( 4255 errors, &errorHandler, 4256 "rawunicodeescape", "\\Uxxxxxxxx out of range", 4257 &starts, &end, &startinpos, &endinpos, &exc, &s, 4258 &v, &outpos, &p)) 4259 goto onError; 4260 } 4261 nextByte: 4262 ; 4263 } 4264 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4265 goto onError; 4266 Py_XDECREF(errorHandler); 4267 Py_XDECREF(exc); 4268 return (PyObject *)v; 4269 4270 onError: 4271 Py_XDECREF(v); 4272 Py_XDECREF(errorHandler); 4273 Py_XDECREF(exc); 4274 return NULL; 4275} 4276 4277PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 4278 Py_ssize_t size) 4279{ 4280 PyObject *repr; 4281 char *p; 4282 char *q; 4283 4284#ifdef Py_UNICODE_WIDE 4285 const Py_ssize_t expandsize = 10; 4286#else 4287 const Py_ssize_t expandsize = 6; 4288#endif 4289 4290 if (size > PY_SSIZE_T_MAX / expandsize) 4291 return PyErr_NoMemory(); 4292 4293 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 4294 if (repr == NULL) 4295 return NULL; 4296 if (size == 0) 4297 return repr; 4298 4299 p = q = PyBytes_AS_STRING(repr); 4300 while (size-- > 0) { 4301 Py_UNICODE ch = *s++; 4302#ifdef Py_UNICODE_WIDE 4303 /* Map 32-bit characters to '\Uxxxxxxxx' */ 4304 if (ch >= 0x10000) { 4305 *p++ = '\\'; 4306 *p++ = 'U'; 4307 *p++ = hexdigits[(ch >> 28) & 0xf]; 4308 *p++ = hexdigits[(ch >> 24) & 0xf]; 4309 *p++ = hexdigits[(ch >> 20) & 0xf]; 4310 *p++ = hexdigits[(ch >> 16) & 0xf]; 4311 *p++ = hexdigits[(ch >> 12) & 0xf]; 4312 *p++ = hexdigits[(ch >> 8) & 0xf]; 4313 *p++ = hexdigits[(ch >> 4) & 0xf]; 4314 *p++ = hexdigits[ch & 15]; 4315 } 4316 else 4317#else 4318 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4319 if (ch >= 0xD800 && ch < 0xDC00) { 4320 
Py_UNICODE ch2; 4321 Py_UCS4 ucs; 4322 4323 ch2 = *s++; 4324 size--; 4325 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4326 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4327 *p++ = '\\'; 4328 *p++ = 'U'; 4329 *p++ = hexdigits[(ucs >> 28) & 0xf]; 4330 *p++ = hexdigits[(ucs >> 24) & 0xf]; 4331 *p++ = hexdigits[(ucs >> 20) & 0xf]; 4332 *p++ = hexdigits[(ucs >> 16) & 0xf]; 4333 *p++ = hexdigits[(ucs >> 12) & 0xf]; 4334 *p++ = hexdigits[(ucs >> 8) & 0xf]; 4335 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4336 *p++ = hexdigits[ucs & 0xf]; 4337 continue; 4338 } 4339 /* Fall through: isolated surrogates are copied as-is */ 4340 s--; 4341 size++; 4342 } 4343#endif 4344 /* Map 16-bit characters to '\uxxxx' */ 4345 if (ch >= 256) { 4346 *p++ = '\\'; 4347 *p++ = 'u'; 4348 *p++ = hexdigits[(ch >> 12) & 0xf]; 4349 *p++ = hexdigits[(ch >> 8) & 0xf]; 4350 *p++ = hexdigits[(ch >> 4) & 0xf]; 4351 *p++ = hexdigits[ch & 15]; 4352 } 4353 /* Copy everything else as-is */ 4354 else 4355 *p++ = (char) ch; 4356 } 4357 size = p - q; 4358 4359 assert(size > 0); 4360 if (_PyBytes_Resize(&repr, size) < 0) 4361 return NULL; 4362 return repr; 4363} 4364 4365PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4366{ 4367 PyObject *s; 4368 if (!PyUnicode_Check(unicode)) { 4369 PyErr_BadArgument(); 4370 return NULL; 4371 } 4372 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4373 PyUnicode_GET_SIZE(unicode)); 4374 4375 return s; 4376} 4377 4378/* --- Unicode Internal Codec ------------------------------------------- */ 4379 4380PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 4381 Py_ssize_t size, 4382 const char *errors) 4383{ 4384 const char *starts = s; 4385 Py_ssize_t startinpos; 4386 Py_ssize_t endinpos; 4387 Py_ssize_t outpos; 4388 PyUnicodeObject *v; 4389 Py_UNICODE *p; 4390 const char *end; 4391 const char *reason; 4392 PyObject *errorHandler = NULL; 4393 PyObject *exc = NULL; 4394 4395#ifdef Py_UNICODE_WIDE 4396 Py_UNICODE unimax = 
PyUnicode_GetMax(); 4397#endif 4398 4399 /* XXX overflow detection missing */ 4400 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4401 if (v == NULL) 4402 goto onError; 4403 if (PyUnicode_GetSize((PyObject *)v) == 0) 4404 return (PyObject *)v; 4405 p = PyUnicode_AS_UNICODE(v); 4406 end = s + size; 4407 4408 while (s < end) { 4409 memcpy(p, s, sizeof(Py_UNICODE)); 4410 /* We have to sanity check the raw data, otherwise doom looms for 4411 some malformed UCS-4 data. */ 4412 if ( 4413#ifdef Py_UNICODE_WIDE 4414 *p > unimax || *p < 0 || 4415#endif 4416 end-s < Py_UNICODE_SIZE 4417 ) 4418 { 4419 startinpos = s - starts; 4420 if (end-s < Py_UNICODE_SIZE) { 4421 endinpos = end-starts; 4422 reason = "truncated input"; 4423 } 4424 else { 4425 endinpos = s - starts + Py_UNICODE_SIZE; 4426 reason = "illegal code point (> 0x10FFFF)"; 4427 } 4428 outpos = p - PyUnicode_AS_UNICODE(v); 4429 if (unicode_decode_call_errorhandler( 4430 errors, &errorHandler, 4431 "unicode_internal", reason, 4432 &starts, &end, &startinpos, &endinpos, &exc, &s, 4433 &v, &outpos, &p)) { 4434 goto onError; 4435 } 4436 } 4437 else { 4438 p++; 4439 s += Py_UNICODE_SIZE; 4440 } 4441 } 4442 4443 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4444 goto onError; 4445 Py_XDECREF(errorHandler); 4446 Py_XDECREF(exc); 4447 return (PyObject *)v; 4448 4449 onError: 4450 Py_XDECREF(v); 4451 Py_XDECREF(errorHandler); 4452 Py_XDECREF(exc); 4453 return NULL; 4454} 4455 4456/* --- Latin-1 Codec ------------------------------------------------------ */ 4457 4458PyObject *PyUnicode_DecodeLatin1(const char *s, 4459 Py_ssize_t size, 4460 const char *errors) 4461{ 4462 PyUnicodeObject *v; 4463 Py_UNICODE *p; 4464 const char *e, *unrolled_end; 4465 4466 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 4467 if (size == 1) { 4468 Py_UNICODE r = *(unsigned char*)s; 4469 return PyUnicode_FromUnicode(&r, 1); 4470 } 4471 4472 v = _PyUnicode_New(size); 4473 if (v == NULL) 4474 goto onError; 4475 if (size == 0) 4476 return (PyObject *)v; 4477 p = PyUnicode_AS_UNICODE(v); 4478 e = s + size; 4479 /* Unrolling the copy makes it much faster by reducing the looping 4480 overhead. This is similar to what many memcpy() implementations do. */ 4481 unrolled_end = e - 4; 4482 while (s < unrolled_end) { 4483 p[0] = (unsigned char) s[0]; 4484 p[1] = (unsigned char) s[1]; 4485 p[2] = (unsigned char) s[2]; 4486 p[3] = (unsigned char) s[3]; 4487 s += 4; 4488 p += 4; 4489 } 4490 while (s < e) 4491 *p++ = (unsigned char) *s++; 4492 return (PyObject *)v; 4493 4494 onError: 4495 Py_XDECREF(v); 4496 return NULL; 4497} 4498 4499/* create or adjust a UnicodeEncodeError */ 4500static void make_encode_exception(PyObject **exceptionObject, 4501 const char *encoding, 4502 const Py_UNICODE *unicode, Py_ssize_t size, 4503 Py_ssize_t startpos, Py_ssize_t endpos, 4504 const char *reason) 4505{ 4506 if (*exceptionObject == NULL) { 4507 *exceptionObject = PyUnicodeEncodeError_Create( 4508 encoding, unicode, size, startpos, endpos, reason); 4509 } 4510 else { 4511 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4512 goto onError; 4513 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4514 goto onError; 4515 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4516 goto onError; 4517 return; 4518 onError: 4519 Py_DECREF(*exceptionObject); 4520 *exceptionObject = NULL; 4521 } 4522} 4523 4524/* raises a UnicodeEncodeError */ 4525static void raise_encode_exception(PyObject **exceptionObject, 4526 const char *encoding, 4527 const Py_UNICODE *unicode, Py_ssize_t size, 4528 Py_ssize_t startpos, Py_ssize_t endpos, 4529 const char *reason) 4530{ 4531 make_encode_exception(exceptionObject, 4532 encoding, unicode, size, startpos, endpos, reason); 4533 if (*exceptionObject != 
NULL) 4534 PyCodec_StrictErrors(*exceptionObject); 4535} 4536 4537/* error handling callback helper: 4538 build arguments, call the callback and check the arguments, 4539 put the result into newpos and return the replacement string, which 4540 has to be freed by the caller */ 4541static PyObject *unicode_encode_call_errorhandler(const char *errors, 4542 PyObject **errorHandler, 4543 const char *encoding, const char *reason, 4544 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4545 Py_ssize_t startpos, Py_ssize_t endpos, 4546 Py_ssize_t *newpos) 4547{ 4548 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4549 4550 PyObject *restuple; 4551 PyObject *resunicode; 4552 4553 if (*errorHandler == NULL) { 4554 *errorHandler = PyCodec_LookupError(errors); 4555 if (*errorHandler == NULL) 4556 return NULL; 4557 } 4558 4559 make_encode_exception(exceptionObject, 4560 encoding, unicode, size, startpos, endpos, reason); 4561 if (*exceptionObject == NULL) 4562 return NULL; 4563 4564 restuple = PyObject_CallFunctionObjArgs( 4565 *errorHandler, *exceptionObject, NULL); 4566 if (restuple == NULL) 4567 return NULL; 4568 if (!PyTuple_Check(restuple)) { 4569 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4570 Py_DECREF(restuple); 4571 return NULL; 4572 } 4573 if (!PyArg_ParseTuple(restuple, argparse, 4574 &resunicode, newpos)) { 4575 Py_DECREF(restuple); 4576 return NULL; 4577 } 4578 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4579 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4580 Py_DECREF(restuple); 4581 return NULL; 4582 } 4583 if (*newpos<0) 4584 *newpos = size+*newpos; 4585 if (*newpos<0 || *newpos>size) { 4586 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4587 Py_DECREF(restuple); 4588 return NULL; 4589 } 4590 Py_INCREF(resunicode); 4591 Py_DECREF(restuple); 4592 return resunicode; 4593} 4594 4595static PyObject *unicode_encode_ucs1(const Py_UNICODE 
*p, 4596 Py_ssize_t size, 4597 const char *errors, 4598 int limit) 4599{ 4600 /* output object */ 4601 PyObject *res; 4602 /* pointers to the beginning and end+1 of input */ 4603 const Py_UNICODE *startp = p; 4604 const Py_UNICODE *endp = p + size; 4605 /* pointer to the beginning of the unencodable characters */ 4606 /* const Py_UNICODE *badp = NULL; */ 4607 /* pointer into the output */ 4608 char *str; 4609 /* current output position */ 4610 Py_ssize_t ressize; 4611 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4612 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4613 PyObject *errorHandler = NULL; 4614 PyObject *exc = NULL; 4615 /* the following variable is used for caching string comparisons 4616 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4617 int known_errorHandler = -1; 4618 4619 /* allocate enough for a simple encoding without 4620 replacements, if we need more, we'll resize */ 4621 if (size == 0) 4622 return PyBytes_FromStringAndSize(NULL, 0); 4623 res = PyBytes_FromStringAndSize(NULL, size); 4624 if (res == NULL) 4625 return NULL; 4626 str = PyBytes_AS_STRING(res); 4627 ressize = size; 4628 4629 while (p<endp) { 4630 Py_UNICODE c = *p; 4631 4632 /* can we encode this? */ 4633 if (c<limit) { 4634 /* no overflow check, because we know that the space is enough */ 4635 *str++ = (char)c; 4636 ++p; 4637 } 4638 else { 4639 Py_ssize_t unicodepos = p-startp; 4640 Py_ssize_t requiredsize; 4641 PyObject *repunicode; 4642 Py_ssize_t repsize; 4643 Py_ssize_t newpos; 4644 Py_ssize_t respos; 4645 Py_UNICODE *uni2; 4646 /* startpos for collecting unencodable chars */ 4647 const Py_UNICODE *collstart = p; 4648 const Py_UNICODE *collend = p; 4649 /* find all unecodable characters */ 4650 while ((collend < endp) && ((*collend)>=limit)) 4651 ++collend; 4652 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 4653 if (known_errorHandler==-1) { 4654 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4655 known_errorHandler = 1; 4656 else if (!strcmp(errors, "replace")) 4657 known_errorHandler = 2; 4658 else if (!strcmp(errors, "ignore")) 4659 known_errorHandler = 3; 4660 else if (!strcmp(errors, "xmlcharrefreplace")) 4661 known_errorHandler = 4; 4662 else 4663 known_errorHandler = 0; 4664 } 4665 switch (known_errorHandler) { 4666 case 1: /* strict */ 4667 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4668 goto onError; 4669 case 2: /* replace */ 4670 while (collstart++<collend) 4671 *str++ = '?'; /* fall through */ 4672 case 3: /* ignore */ 4673 p = collend; 4674 break; 4675 case 4: /* xmlcharrefreplace */ 4676 respos = str - PyBytes_AS_STRING(res); 4677 /* determine replacement size (temporarily (mis)uses p) */ 4678 for (p = collstart, repsize = 0; p < collend; ++p) { 4679 if (*p<10) 4680 repsize += 2+1+1; 4681 else if (*p<100) 4682 repsize += 2+2+1; 4683 else if (*p<1000) 4684 repsize += 2+3+1; 4685 else if (*p<10000) 4686 repsize += 2+4+1; 4687#ifndef Py_UNICODE_WIDE 4688 else 4689 repsize += 2+5+1; 4690#else 4691 else if (*p<100000) 4692 repsize += 2+5+1; 4693 else if (*p<1000000) 4694 repsize += 2+6+1; 4695 else 4696 repsize += 2+7+1; 4697#endif 4698 } 4699 requiredsize = respos+repsize+(endp-collend); 4700 if (requiredsize > ressize) { 4701 if (requiredsize<2*ressize) 4702 requiredsize = 2*ressize; 4703 if (_PyBytes_Resize(&res, requiredsize)) 4704 goto onError; 4705 str = PyBytes_AS_STRING(res) + respos; 4706 ressize = requiredsize; 4707 } 4708 /* generate replacement (temporarily (mis)uses p) */ 4709 for (p = collstart; p < collend; ++p) { 4710 str += sprintf(str, "&#%d;", (int)*p); 4711 } 4712 p = collend; 4713 break; 4714 default: 4715 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4716 encoding, reason, startp, size, &exc, 4717 collstart-startp, collend-startp, 
&newpos); 4718 if (repunicode == NULL) 4719 goto onError; 4720 if (PyBytes_Check(repunicode)) { 4721 /* Directly copy bytes result to output. */ 4722 repsize = PyBytes_Size(repunicode); 4723 if (repsize > 1) { 4724 /* Make room for all additional bytes. */ 4725 respos = str - PyBytes_AS_STRING(res); 4726 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 4727 Py_DECREF(repunicode); 4728 goto onError; 4729 } 4730 str = PyBytes_AS_STRING(res) + respos; 4731 ressize += repsize-1; 4732 } 4733 memcpy(str, PyBytes_AsString(repunicode), repsize); 4734 str += repsize; 4735 p = startp + newpos; 4736 Py_DECREF(repunicode); 4737 break; 4738 } 4739 /* need more space? (at least enough for what we 4740 have+the replacement+the rest of the string, so 4741 we won't have to check space for encodable characters) */ 4742 respos = str - PyBytes_AS_STRING(res); 4743 repsize = PyUnicode_GET_SIZE(repunicode); 4744 requiredsize = respos+repsize+(endp-collend); 4745 if (requiredsize > ressize) { 4746 if (requiredsize<2*ressize) 4747 requiredsize = 2*ressize; 4748 if (_PyBytes_Resize(&res, requiredsize)) { 4749 Py_DECREF(repunicode); 4750 goto onError; 4751 } 4752 str = PyBytes_AS_STRING(res) + respos; 4753 ressize = requiredsize; 4754 } 4755 /* check if there is anything unencodable in the replacement 4756 and copy it to the output */ 4757 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4758 c = *uni2; 4759 if (c >= limit) { 4760 raise_encode_exception(&exc, encoding, startp, size, 4761 unicodepos, unicodepos+1, reason); 4762 Py_DECREF(repunicode); 4763 goto onError; 4764 } 4765 *str = (char)c; 4766 } 4767 p = startp + newpos; 4768 Py_DECREF(repunicode); 4769 } 4770 } 4771 } 4772 /* Resize if we allocated to much */ 4773 size = str - PyBytes_AS_STRING(res); 4774 if (size < ressize) { /* If this falls res will be NULL */ 4775 assert(size >= 0); 4776 if (_PyBytes_Resize(&res, size) < 0) 4777 goto onError; 4778 } 4779 4780 Py_XDECREF(errorHandler); 4781 
Py_XDECREF(exc); 4782 return res; 4783 4784 onError: 4785 Py_XDECREF(res); 4786 Py_XDECREF(errorHandler); 4787 Py_XDECREF(exc); 4788 return NULL; 4789} 4790 4791PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4792 Py_ssize_t size, 4793 const char *errors) 4794{ 4795 return unicode_encode_ucs1(p, size, errors, 256); 4796} 4797 4798PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 4799{ 4800 if (!PyUnicode_Check(unicode)) { 4801 PyErr_BadArgument(); 4802 return NULL; 4803 } 4804 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4805 PyUnicode_GET_SIZE(unicode), 4806 NULL); 4807} 4808 4809/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4810 4811PyObject *PyUnicode_DecodeASCII(const char *s, 4812 Py_ssize_t size, 4813 const char *errors) 4814{ 4815 const char *starts = s; 4816 PyUnicodeObject *v; 4817 Py_UNICODE *p; 4818 Py_ssize_t startinpos; 4819 Py_ssize_t endinpos; 4820 Py_ssize_t outpos; 4821 const char *e; 4822 PyObject *errorHandler = NULL; 4823 PyObject *exc = NULL; 4824 4825 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4826 if (size == 1 && *(unsigned char*)s < 128) { 4827 Py_UNICODE r = *(unsigned char*)s; 4828 return PyUnicode_FromUnicode(&r, 1); 4829 } 4830 4831 v = _PyUnicode_New(size); 4832 if (v == NULL) 4833 goto onError; 4834 if (size == 0) 4835 return (PyObject *)v; 4836 p = PyUnicode_AS_UNICODE(v); 4837 e = s + size; 4838 while (s < e) { 4839 register unsigned char c = (unsigned char)*s; 4840 if (c < 128) { 4841 *p++ = c; 4842 ++s; 4843 } 4844 else { 4845 startinpos = s-starts; 4846 endinpos = startinpos + 1; 4847 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4848 if (unicode_decode_call_errorhandler( 4849 errors, &errorHandler, 4850 "ascii", "ordinal not in range(128)", 4851 &starts, &e, &startinpos, &endinpos, &exc, &s, 4852 &v, &outpos, &p)) 4853 goto onError; 4854 } 4855 } 4856 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4857 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4858 goto onError; 4859 Py_XDECREF(errorHandler); 4860 Py_XDECREF(exc); 4861 return (PyObject *)v; 4862 4863 onError: 4864 Py_XDECREF(v); 4865 Py_XDECREF(errorHandler); 4866 Py_XDECREF(exc); 4867 return NULL; 4868} 4869 4870PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 4871 Py_ssize_t size, 4872 const char *errors) 4873{ 4874 return unicode_encode_ucs1(p, size, errors, 128); 4875} 4876 4877PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 4878{ 4879 if (!PyUnicode_Check(unicode)) { 4880 PyErr_BadArgument(); 4881 return NULL; 4882 } 4883 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4884 PyUnicode_GET_SIZE(unicode), 4885 NULL); 4886} 4887 4888#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4889 4890/* --- MBCS codecs for Windows -------------------------------------------- */ 4891 4892#if SIZEOF_INT < SIZEOF_SIZE_T 4893#define NEED_RETRY 4894#endif 4895 4896/* XXX This code is limited to "true" double-byte encodings, as 4897 a) it assumes an incomplete character consists of a single byte, and 4898 b) IsDBCSLeadByte (probably) does 
not work for non-DBCS multi-byte 4899 encodings, see IsDBCSLeadByteEx documentation. */ 4900 4901static int is_dbcs_lead_byte(const char *s, int offset) 4902{ 4903 const char *curr = s + offset; 4904 4905 if (IsDBCSLeadByte(*curr)) { 4906 const char *prev = CharPrev(s, curr); 4907 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4908 } 4909 return 0; 4910} 4911 4912/* 4913 * Decode MBCS string into unicode object. If 'final' is set, converts 4914 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4915 */ 4916static int decode_mbcs(PyUnicodeObject **v, 4917 const char *s, /* MBCS string */ 4918 int size, /* sizeof MBCS string */ 4919 int final, 4920 const char *errors) 4921{ 4922 Py_UNICODE *p; 4923 Py_ssize_t n; 4924 DWORD usize; 4925 DWORD flags; 4926 4927 assert(size >= 0); 4928 4929 /* check and handle 'errors' arg */ 4930 if (errors==NULL || strcmp(errors, "strict")==0) 4931 flags = MB_ERR_INVALID_CHARS; 4932 else if (strcmp(errors, "ignore")==0) 4933 flags = 0; 4934 else { 4935 PyErr_Format(PyExc_ValueError, 4936 "mbcs encoding does not support errors='%s'", 4937 errors); 4938 return -1; 4939 } 4940 4941 /* Skip trailing lead-byte unless 'final' is set */ 4942 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4943 --size; 4944 4945 /* First get the size of the result */ 4946 if (size > 0) { 4947 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 4948 if (usize==0) 4949 goto mbcs_decode_error; 4950 } else 4951 usize = 0; 4952 4953 if (*v == NULL) { 4954 /* Create unicode object */ 4955 *v = _PyUnicode_New(usize); 4956 if (*v == NULL) 4957 return -1; 4958 n = 0; 4959 } 4960 else { 4961 /* Extend unicode object */ 4962 n = PyUnicode_GET_SIZE(*v); 4963 if (_PyUnicode_Resize(v, n + usize) < 0) 4964 return -1; 4965 } 4966 4967 /* Do the conversion */ 4968 if (usize > 0) { 4969 p = PyUnicode_AS_UNICODE(*v) + n; 4970 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 4971 goto 
mbcs_decode_error; 4972 } 4973 } 4974 return size; 4975 4976mbcs_decode_error: 4977 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 4978 we raise a UnicodeDecodeError - else it is a 'generic' 4979 windows error 4980 */ 4981 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 4982 /* Ideally, we should get reason from FormatMessage - this 4983 is the Windows 2000 English version of the message 4984 */ 4985 PyObject *exc = NULL; 4986 const char *reason = "No mapping for the Unicode character exists " 4987 "in the target multi-byte code page."; 4988 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 4989 if (exc != NULL) { 4990 PyCodec_StrictErrors(exc); 4991 Py_DECREF(exc); 4992 } 4993 } else { 4994 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4995 } 4996 return -1; 4997} 4998 4999PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 5000 Py_ssize_t size, 5001 const char *errors, 5002 Py_ssize_t *consumed) 5003{ 5004 PyUnicodeObject *v = NULL; 5005 int done; 5006 5007 if (consumed) 5008 *consumed = 0; 5009 5010#ifdef NEED_RETRY 5011 retry: 5012 if (size > INT_MAX) 5013 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 5014 else 5015#endif 5016 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 5017 5018 if (done < 0) { 5019 Py_XDECREF(v); 5020 return NULL; 5021 } 5022 5023 if (consumed) 5024 *consumed += done; 5025 5026#ifdef NEED_RETRY 5027 if (size > INT_MAX) { 5028 s += done; 5029 size -= done; 5030 goto retry; 5031 } 5032#endif 5033 5034 return (PyObject *)v; 5035} 5036 5037PyObject *PyUnicode_DecodeMBCS(const char *s, 5038 Py_ssize_t size, 5039 const char *errors) 5040{ 5041 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 5042} 5043 5044/* 5045 * Convert unicode into string object (MBCS). 5046 * Returns 0 if succeed, -1 otherwise. 
5047 */ 5048static int encode_mbcs(PyObject **repr, 5049 const Py_UNICODE *p, /* unicode */ 5050 int size, /* size of unicode */ 5051 const char* errors) 5052{ 5053 BOOL usedDefaultChar = FALSE; 5054 BOOL *pusedDefaultChar; 5055 int mbcssize; 5056 Py_ssize_t n; 5057 PyObject *exc = NULL; 5058 DWORD flags; 5059 5060 assert(size >= 0); 5061 5062 /* check and handle 'errors' arg */ 5063 if (errors==NULL || strcmp(errors, "strict")==0) { 5064 flags = WC_NO_BEST_FIT_CHARS; 5065 pusedDefaultChar = &usedDefaultChar; 5066 } else if (strcmp(errors, "replace")==0) { 5067 flags = 0; 5068 pusedDefaultChar = NULL; 5069 } else { 5070 PyErr_Format(PyExc_ValueError, 5071 "mbcs encoding does not support errors='%s'", 5072 errors); 5073 return -1; 5074 } 5075 5076 /* First get the size of the result */ 5077 if (size > 0) { 5078 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 5079 NULL, pusedDefaultChar); 5080 if (mbcssize == 0) { 5081 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5082 return -1; 5083 } 5084 /* If we used a default char, then we failed! 
*/ 5085 if (pusedDefaultChar && *pusedDefaultChar) 5086 goto mbcs_encode_error; 5087 } else { 5088 mbcssize = 0; 5089 } 5090 5091 if (*repr == NULL) { 5092 /* Create string object */ 5093 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 5094 if (*repr == NULL) 5095 return -1; 5096 n = 0; 5097 } 5098 else { 5099 /* Extend string object */ 5100 n = PyBytes_Size(*repr); 5101 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 5102 return -1; 5103 } 5104 5105 /* Do the conversion */ 5106 if (size > 0) { 5107 char *s = PyBytes_AS_STRING(*repr) + n; 5108 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 5109 NULL, pusedDefaultChar)) { 5110 PyErr_SetFromWindowsErrWithFilename(0, NULL); 5111 return -1; 5112 } 5113 if (pusedDefaultChar && *pusedDefaultChar) 5114 goto mbcs_encode_error; 5115 } 5116 return 0; 5117 5118mbcs_encode_error: 5119 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 5120 Py_XDECREF(exc); 5121 return -1; 5122} 5123 5124PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 5125 Py_ssize_t size, 5126 const char *errors) 5127{ 5128 PyObject *repr = NULL; 5129 int ret; 5130 5131#ifdef NEED_RETRY 5132 retry: 5133 if (size > INT_MAX) 5134 ret = encode_mbcs(&repr, p, INT_MAX, errors); 5135 else 5136#endif 5137 ret = encode_mbcs(&repr, p, (int)size, errors); 5138 5139 if (ret < 0) { 5140 Py_XDECREF(repr); 5141 return NULL; 5142 } 5143 5144#ifdef NEED_RETRY 5145 if (size > INT_MAX) { 5146 p += INT_MAX; 5147 size -= INT_MAX; 5148 goto retry; 5149 } 5150#endif 5151 5152 return repr; 5153} 5154 5155PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 5156{ 5157 if (!PyUnicode_Check(unicode)) { 5158 PyErr_BadArgument(); 5159 return NULL; 5160 } 5161 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 5162 PyUnicode_GET_SIZE(unicode), 5163 NULL); 5164} 5165 5166#undef NEED_RETRY 5167 5168#endif /* MS_WINDOWS */ 5169 5170/* --- Character Mapping Codec -------------------------------------------- */ 5171 5172PyObject 
*PyUnicode_DecodeCharmap(const char *s, 5173 Py_ssize_t size, 5174 PyObject *mapping, 5175 const char *errors) 5176{ 5177 const char *starts = s; 5178 Py_ssize_t startinpos; 5179 Py_ssize_t endinpos; 5180 Py_ssize_t outpos; 5181 const char *e; 5182 PyUnicodeObject *v; 5183 Py_UNICODE *p; 5184 Py_ssize_t extrachars = 0; 5185 PyObject *errorHandler = NULL; 5186 PyObject *exc = NULL; 5187 Py_UNICODE *mapstring = NULL; 5188 Py_ssize_t maplen = 0; 5189 5190 /* Default to Latin-1 */ 5191 if (mapping == NULL) 5192 return PyUnicode_DecodeLatin1(s, size, errors); 5193 5194 v = _PyUnicode_New(size); 5195 if (v == NULL) 5196 goto onError; 5197 if (size == 0) 5198 return (PyObject *)v; 5199 p = PyUnicode_AS_UNICODE(v); 5200 e = s + size; 5201 if (PyUnicode_CheckExact(mapping)) { 5202 mapstring = PyUnicode_AS_UNICODE(mapping); 5203 maplen = PyUnicode_GET_SIZE(mapping); 5204 while (s < e) { 5205 unsigned char ch = *s; 5206 Py_UNICODE x = 0xfffe; /* illegal value */ 5207 5208 if (ch < maplen) 5209 x = mapstring[ch]; 5210 5211 if (x == 0xfffe) { 5212 /* undefined mapping */ 5213 outpos = p-PyUnicode_AS_UNICODE(v); 5214 startinpos = s-starts; 5215 endinpos = startinpos+1; 5216 if (unicode_decode_call_errorhandler( 5217 errors, &errorHandler, 5218 "charmap", "character maps to <undefined>", 5219 &starts, &e, &startinpos, &endinpos, &exc, &s, 5220 &v, &outpos, &p)) { 5221 goto onError; 5222 } 5223 continue; 5224 } 5225 *p++ = x; 5226 ++s; 5227 } 5228 } 5229 else { 5230 while (s < e) { 5231 unsigned char ch = *s; 5232 PyObject *w, *x; 5233 5234 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 5235 w = PyLong_FromLong((long)ch); 5236 if (w == NULL) 5237 goto onError; 5238 x = PyObject_GetItem(mapping, w); 5239 Py_DECREF(w); 5240 if (x == NULL) { 5241 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5242 /* No mapping found means: mapping is undefined. 
*/ 5243 PyErr_Clear(); 5244 x = Py_None; 5245 Py_INCREF(x); 5246 } else 5247 goto onError; 5248 } 5249 5250 /* Apply mapping */ 5251 if (PyLong_Check(x)) { 5252 long value = PyLong_AS_LONG(x); 5253 if (value < 0 || value > 65535) { 5254 PyErr_SetString(PyExc_TypeError, 5255 "character mapping must be in range(65536)"); 5256 Py_DECREF(x); 5257 goto onError; 5258 } 5259 *p++ = (Py_UNICODE)value; 5260 } 5261 else if (x == Py_None) { 5262 /* undefined mapping */ 5263 outpos = p-PyUnicode_AS_UNICODE(v); 5264 startinpos = s-starts; 5265 endinpos = startinpos+1; 5266 if (unicode_decode_call_errorhandler( 5267 errors, &errorHandler, 5268 "charmap", "character maps to <undefined>", 5269 &starts, &e, &startinpos, &endinpos, &exc, &s, 5270 &v, &outpos, &p)) { 5271 Py_DECREF(x); 5272 goto onError; 5273 } 5274 Py_DECREF(x); 5275 continue; 5276 } 5277 else if (PyUnicode_Check(x)) { 5278 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 5279 5280 if (targetsize == 1) 5281 /* 1-1 mapping */ 5282 *p++ = *PyUnicode_AS_UNICODE(x); 5283 5284 else if (targetsize > 1) { 5285 /* 1-n mapping */ 5286 if (targetsize > extrachars) { 5287 /* resize first */ 5288 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 5289 Py_ssize_t needed = (targetsize - extrachars) + \ 5290 (targetsize << 2); 5291 extrachars += needed; 5292 /* XXX overflow detection missing */ 5293 if (_PyUnicode_Resize(&v, 5294 PyUnicode_GET_SIZE(v) + needed) < 0) { 5295 Py_DECREF(x); 5296 goto onError; 5297 } 5298 p = PyUnicode_AS_UNICODE(v) + oldpos; 5299 } 5300 Py_UNICODE_COPY(p, 5301 PyUnicode_AS_UNICODE(x), 5302 targetsize); 5303 p += targetsize; 5304 extrachars -= targetsize; 5305 } 5306 /* 1-0 mapping: skip the character */ 5307 } 5308 else { 5309 /* wrong return value */ 5310 PyErr_SetString(PyExc_TypeError, 5311 "character mapping must return integer, None or str"); 5312 Py_DECREF(x); 5313 goto onError; 5314 } 5315 Py_DECREF(x); 5316 ++s; 5317 } 5318 } 5319 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 5320 if 
(_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5321 goto onError; 5322 Py_XDECREF(errorHandler); 5323 Py_XDECREF(exc); 5324 return (PyObject *)v; 5325 5326 onError: 5327 Py_XDECREF(errorHandler); 5328 Py_XDECREF(exc); 5329 Py_XDECREF(v); 5330 return NULL; 5331} 5332 5333/* Charmap encoding: the lookup table */ 5334 5335struct encoding_map{ 5336 PyObject_HEAD 5337 unsigned char level1[32]; 5338 int count2, count3; 5339 unsigned char level23[1]; 5340}; 5341 5342static PyObject* 5343encoding_map_size(PyObject *obj, PyObject* args) 5344{ 5345 struct encoding_map *map = (struct encoding_map*)obj; 5346 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5347 128*map->count3); 5348} 5349 5350static PyMethodDef encoding_map_methods[] = { 5351 {"size", encoding_map_size, METH_NOARGS, 5352 PyDoc_STR("Return the size (in bytes) of this object") }, 5353 { 0 } 5354}; 5355 5356static void 5357encoding_map_dealloc(PyObject* o) 5358{ 5359 PyObject_FREE(o); 5360} 5361 5362static PyTypeObject EncodingMapType = { 5363 PyVarObject_HEAD_INIT(NULL, 0) 5364 "EncodingMap", /*tp_name*/ 5365 sizeof(struct encoding_map), /*tp_basicsize*/ 5366 0, /*tp_itemsize*/ 5367 /* methods */ 5368 encoding_map_dealloc, /*tp_dealloc*/ 5369 0, /*tp_print*/ 5370 0, /*tp_getattr*/ 5371 0, /*tp_setattr*/ 5372 0, /*tp_reserved*/ 5373 0, /*tp_repr*/ 5374 0, /*tp_as_number*/ 5375 0, /*tp_as_sequence*/ 5376 0, /*tp_as_mapping*/ 5377 0, /*tp_hash*/ 5378 0, /*tp_call*/ 5379 0, /*tp_str*/ 5380 0, /*tp_getattro*/ 5381 0, /*tp_setattro*/ 5382 0, /*tp_as_buffer*/ 5383 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5384 0, /*tp_doc*/ 5385 0, /*tp_traverse*/ 5386 0, /*tp_clear*/ 5387 0, /*tp_richcompare*/ 5388 0, /*tp_weaklistoffset*/ 5389 0, /*tp_iter*/ 5390 0, /*tp_iternext*/ 5391 encoding_map_methods, /*tp_methods*/ 5392 0, /*tp_members*/ 5393 0, /*tp_getset*/ 5394 0, /*tp_base*/ 5395 0, /*tp_dict*/ 5396 0, /*tp_descr_get*/ 5397 0, /*tp_descr_set*/ 5398 0, /*tp_dictoffset*/ 5399 0, /*tp_init*/ 5400 0, /*tp_alloc*/ 
5401 0, /*tp_new*/ 5402 0, /*tp_free*/ 5403 0, /*tp_is_gc*/ 5404}; 5405 5406PyObject* 5407PyUnicode_BuildEncodingMap(PyObject* string) 5408{ 5409 Py_UNICODE *decode; 5410 PyObject *result; 5411 struct encoding_map *mresult; 5412 int i; 5413 int need_dict = 0; 5414 unsigned char level1[32]; 5415 unsigned char level2[512]; 5416 unsigned char *mlevel1, *mlevel2, *mlevel3; 5417 int count2 = 0, count3 = 0; 5418 5419 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5420 PyErr_BadArgument(); 5421 return NULL; 5422 } 5423 decode = PyUnicode_AS_UNICODE(string); 5424 memset(level1, 0xFF, sizeof level1); 5425 memset(level2, 0xFF, sizeof level2); 5426 5427 /* If there isn't a one-to-one mapping of NULL to \0, 5428 or if there are non-BMP characters, we need to use 5429 a mapping dictionary. */ 5430 if (decode[0] != 0) 5431 need_dict = 1; 5432 for (i = 1; i < 256; i++) { 5433 int l1, l2; 5434 if (decode[i] == 0 5435#ifdef Py_UNICODE_WIDE 5436 || decode[i] > 0xFFFF 5437#endif 5438 ) { 5439 need_dict = 1; 5440 break; 5441 } 5442 if (decode[i] == 0xFFFE) 5443 /* unmapped character */ 5444 continue; 5445 l1 = decode[i] >> 11; 5446 l2 = decode[i] >> 7; 5447 if (level1[l1] == 0xFF) 5448 level1[l1] = count2++; 5449 if (level2[l2] == 0xFF) 5450 level2[l2] = count3++; 5451 } 5452 5453 if (count2 >= 0xFF || count3 >= 0xFF) 5454 need_dict = 1; 5455 5456 if (need_dict) { 5457 PyObject *result = PyDict_New(); 5458 PyObject *key, *value; 5459 if (!result) 5460 return NULL; 5461 for (i = 0; i < 256; i++) { 5462 key = value = NULL; 5463 key = PyLong_FromLong(decode[i]); 5464 value = PyLong_FromLong(i); 5465 if (!key || !value) 5466 goto failed1; 5467 if (PyDict_SetItem(result, key, value) == -1) 5468 goto failed1; 5469 Py_DECREF(key); 5470 Py_DECREF(value); 5471 } 5472 return result; 5473 failed1: 5474 Py_XDECREF(key); 5475 Py_XDECREF(value); 5476 Py_DECREF(result); 5477 return NULL; 5478 } 5479 5480 /* Create a three-level trie */ 5481 result = PyObject_MALLOC(sizeof(struct 
encoding_map) + 5482 16*count2 + 128*count3 - 1); 5483 if (!result) 5484 return PyErr_NoMemory(); 5485 PyObject_Init(result, &EncodingMapType); 5486 mresult = (struct encoding_map*)result; 5487 mresult->count2 = count2; 5488 mresult->count3 = count3; 5489 mlevel1 = mresult->level1; 5490 mlevel2 = mresult->level23; 5491 mlevel3 = mresult->level23 + 16*count2; 5492 memcpy(mlevel1, level1, 32); 5493 memset(mlevel2, 0xFF, 16*count2); 5494 memset(mlevel3, 0, 128*count3); 5495 count3 = 0; 5496 for (i = 1; i < 256; i++) { 5497 int o1, o2, o3, i2, i3; 5498 if (decode[i] == 0xFFFE) 5499 /* unmapped character */ 5500 continue; 5501 o1 = decode[i]>>11; 5502 o2 = (decode[i]>>7) & 0xF; 5503 i2 = 16*mlevel1[o1] + o2; 5504 if (mlevel2[i2] == 0xFF) 5505 mlevel2[i2] = count3++; 5506 o3 = decode[i] & 0x7F; 5507 i3 = 128*mlevel2[i2] + o3; 5508 mlevel3[i3] = i; 5509 } 5510 return result; 5511} 5512 5513static int 5514encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5515{ 5516 struct encoding_map *map = (struct encoding_map*)mapping; 5517 int l1 = c>>11; 5518 int l2 = (c>>7) & 0xF; 5519 int l3 = c & 0x7F; 5520 int i; 5521 5522#ifdef Py_UNICODE_WIDE 5523 if (c > 0xFFFF) { 5524 return -1; 5525 } 5526#endif 5527 if (c == 0) 5528 return 0; 5529 /* level 1*/ 5530 i = map->level1[l1]; 5531 if (i == 0xFF) { 5532 return -1; 5533 } 5534 /* level 2*/ 5535 i = map->level23[16*i+l2]; 5536 if (i == 0xFF) { 5537 return -1; 5538 } 5539 /* level 3 */ 5540 i = map->level23[16*map->count2 + 128*i + l3]; 5541 if (i == 0) { 5542 return -1; 5543 } 5544 return i; 5545} 5546 5547/* Lookup the character ch in the mapping. If the character 5548 can't be found, Py_None is returned (or NULL, if another 5549 error occurred). 
*/ 5550static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5551{ 5552 PyObject *w = PyLong_FromLong((long)c); 5553 PyObject *x; 5554 5555 if (w == NULL) 5556 return NULL; 5557 x = PyObject_GetItem(mapping, w); 5558 Py_DECREF(w); 5559 if (x == NULL) { 5560 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5561 /* No mapping found means: mapping is undefined. */ 5562 PyErr_Clear(); 5563 x = Py_None; 5564 Py_INCREF(x); 5565 return x; 5566 } else 5567 return NULL; 5568 } 5569 else if (x == Py_None) 5570 return x; 5571 else if (PyLong_Check(x)) { 5572 long value = PyLong_AS_LONG(x); 5573 if (value < 0 || value > 255) { 5574 PyErr_SetString(PyExc_TypeError, 5575 "character mapping must be in range(256)"); 5576 Py_DECREF(x); 5577 return NULL; 5578 } 5579 return x; 5580 } 5581 else if (PyBytes_Check(x)) 5582 return x; 5583 else { 5584 /* wrong return value */ 5585 PyErr_Format(PyExc_TypeError, 5586 "character mapping must return integer, bytes or None, not %.400s", 5587 x->ob_type->tp_name); 5588 Py_DECREF(x); 5589 return NULL; 5590 } 5591} 5592 5593static int 5594charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5595{ 5596 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5597 /* exponentially overallocate to minimize reallocations */ 5598 if (requiredsize < 2*outsize) 5599 requiredsize = 2*outsize; 5600 if (_PyBytes_Resize(outobj, requiredsize)) 5601 return -1; 5602 return 0; 5603} 5604 5605typedef enum charmapencode_result { 5606 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5607}charmapencode_result; 5608/* lookup the character, put the result in the output string and adjust 5609 various state variables. Resize the output bytes object if not enough 5610 space is available. Return a new reference to the object that 5611 was put in the output buffer, or Py_None, if the mapping was undefined 5612 (in which case no character was written) or NULL, if a 5613 reallocation error occurred. 
The caller must decref the result */ 5614static 5615charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 5616 PyObject **outobj, Py_ssize_t *outpos) 5617{ 5618 PyObject *rep; 5619 char *outstart; 5620 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5621 5622 if (Py_TYPE(mapping) == &EncodingMapType) { 5623 int res = encoding_map_lookup(c, mapping); 5624 Py_ssize_t requiredsize = *outpos+1; 5625 if (res == -1) 5626 return enc_FAILED; 5627 if (outsize<requiredsize) 5628 if (charmapencode_resize(outobj, outpos, requiredsize)) 5629 return enc_EXCEPTION; 5630 outstart = PyBytes_AS_STRING(*outobj); 5631 outstart[(*outpos)++] = (char)res; 5632 return enc_SUCCESS; 5633 } 5634 5635 rep = charmapencode_lookup(c, mapping); 5636 if (rep==NULL) 5637 return enc_EXCEPTION; 5638 else if (rep==Py_None) { 5639 Py_DECREF(rep); 5640 return enc_FAILED; 5641 } else { 5642 if (PyLong_Check(rep)) { 5643 Py_ssize_t requiredsize = *outpos+1; 5644 if (outsize<requiredsize) 5645 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5646 Py_DECREF(rep); 5647 return enc_EXCEPTION; 5648 } 5649 outstart = PyBytes_AS_STRING(*outobj); 5650 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5651 } 5652 else { 5653 const char *repchars = PyBytes_AS_STRING(rep); 5654 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5655 Py_ssize_t requiredsize = *outpos+repsize; 5656 if (outsize<requiredsize) 5657 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5658 Py_DECREF(rep); 5659 return enc_EXCEPTION; 5660 } 5661 outstart = PyBytes_AS_STRING(*outobj); 5662 memcpy(outstart + *outpos, repchars, repsize); 5663 *outpos += repsize; 5664 } 5665 } 5666 Py_DECREF(rep); 5667 return enc_SUCCESS; 5668} 5669 5670/* handle an error in PyUnicode_EncodeCharmap 5671 Return 0 on success, -1 on error */ 5672static 5673int charmap_encoding_error( 5674 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5675 PyObject **exceptionObject, 5676 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 5677 PyObject **res, Py_ssize_t *respos) 5678{ 5679 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5680 Py_ssize_t repsize; 5681 Py_ssize_t newpos; 5682 Py_UNICODE *uni2; 5683 /* startpos for collecting unencodable chars */ 5684 Py_ssize_t collstartpos = *inpos; 5685 Py_ssize_t collendpos = *inpos+1; 5686 Py_ssize_t collpos; 5687 char *encoding = "charmap"; 5688 char *reason = "character maps to <undefined>"; 5689 charmapencode_result x; 5690 5691 /* find all unencodable characters */ 5692 while (collendpos < size) { 5693 PyObject *rep; 5694 if (Py_TYPE(mapping) == &EncodingMapType) { 5695 int res = encoding_map_lookup(p[collendpos], mapping); 5696 if (res != -1) 5697 break; 5698 ++collendpos; 5699 continue; 5700 } 5701 5702 rep = charmapencode_lookup(p[collendpos], mapping); 5703 if (rep==NULL) 5704 return -1; 5705 else if (rep!=Py_None) { 5706 Py_DECREF(rep); 5707 break; 5708 } 5709 Py_DECREF(rep); 5710 ++collendpos; 5711 } 5712 /* cache callback name lookup 5713 * (if not done yet, i.e. 
it's the first error) */ 5714 if (*known_errorHandler==-1) { 5715 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5716 *known_errorHandler = 1; 5717 else if (!strcmp(errors, "replace")) 5718 *known_errorHandler = 2; 5719 else if (!strcmp(errors, "ignore")) 5720 *known_errorHandler = 3; 5721 else if (!strcmp(errors, "xmlcharrefreplace")) 5722 *known_errorHandler = 4; 5723 else 5724 *known_errorHandler = 0; 5725 } 5726 switch (*known_errorHandler) { 5727 case 1: /* strict */ 5728 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5729 return -1; 5730 case 2: /* replace */ 5731 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5732 x = charmapencode_output('?', mapping, res, respos); 5733 if (x==enc_EXCEPTION) { 5734 return -1; 5735 } 5736 else if (x==enc_FAILED) { 5737 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5738 return -1; 5739 } 5740 } 5741 /* fall through */ 5742 case 3: /* ignore */ 5743 *inpos = collendpos; 5744 break; 5745 case 4: /* xmlcharrefreplace */ 5746 /* generate replacement (temporarily (mis)uses p) */ 5747 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5748 char buffer[2+29+1+1]; 5749 char *cp; 5750 sprintf(buffer, "&#%d;", (int)p[collpos]); 5751 for (cp = buffer; *cp; ++cp) { 5752 x = charmapencode_output(*cp, mapping, res, respos); 5753 if (x==enc_EXCEPTION) 5754 return -1; 5755 else if (x==enc_FAILED) { 5756 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5757 return -1; 5758 } 5759 } 5760 } 5761 *inpos = collendpos; 5762 break; 5763 default: 5764 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5765 encoding, reason, p, size, exceptionObject, 5766 collstartpos, collendpos, &newpos); 5767 if (repunicode == NULL) 5768 return -1; 5769 if (PyBytes_Check(repunicode)) { 5770 /* Directly copy bytes result to output. 
*/ 5771 Py_ssize_t outsize = PyBytes_Size(*res); 5772 Py_ssize_t requiredsize; 5773 repsize = PyBytes_Size(repunicode); 5774 requiredsize = *respos + repsize; 5775 if (requiredsize > outsize) 5776 /* Make room for all additional bytes. */ 5777 if (charmapencode_resize(res, respos, requiredsize)) { 5778 Py_DECREF(repunicode); 5779 return -1; 5780 } 5781 memcpy(PyBytes_AsString(*res) + *respos, 5782 PyBytes_AsString(repunicode), repsize); 5783 *respos += repsize; 5784 *inpos = newpos; 5785 Py_DECREF(repunicode); 5786 break; 5787 } 5788 /* generate replacement */ 5789 repsize = PyUnicode_GET_SIZE(repunicode); 5790 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5791 x = charmapencode_output(*uni2, mapping, res, respos); 5792 if (x==enc_EXCEPTION) { 5793 return -1; 5794 } 5795 else if (x==enc_FAILED) { 5796 Py_DECREF(repunicode); 5797 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5798 return -1; 5799 } 5800 } 5801 *inpos = newpos; 5802 Py_DECREF(repunicode); 5803 } 5804 return 0; 5805} 5806 5807PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5808 Py_ssize_t size, 5809 PyObject *mapping, 5810 const char *errors) 5811{ 5812 /* output object */ 5813 PyObject *res = NULL; 5814 /* current input position */ 5815 Py_ssize_t inpos = 0; 5816 /* current output position */ 5817 Py_ssize_t respos = 0; 5818 PyObject *errorHandler = NULL; 5819 PyObject *exc = NULL; 5820 /* the following variable is used for caching string comparisons 5821 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5822 * 3=ignore, 4=xmlcharrefreplace */ 5823 int known_errorHandler = -1; 5824 5825 /* Default to Latin-1 */ 5826 if (mapping == NULL) 5827 return PyUnicode_EncodeLatin1(p, size, errors); 5828 5829 /* allocate enough for a simple encoding without 5830 replacements, if we need more, we'll resize */ 5831 res = PyBytes_FromStringAndSize(NULL, size); 5832 if (res == NULL) 5833 goto onError; 5834 if (size == 0) 5835 return 
res; 5836 5837 while (inpos<size) { 5838 /* try to encode it */ 5839 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5840 if (x==enc_EXCEPTION) /* error */ 5841 goto onError; 5842 if (x==enc_FAILED) { /* unencodable character */ 5843 if (charmap_encoding_error(p, size, &inpos, mapping, 5844 &exc, 5845 &known_errorHandler, &errorHandler, errors, 5846 &res, &respos)) { 5847 goto onError; 5848 } 5849 } 5850 else 5851 /* done with this character => adjust input position */ 5852 ++inpos; 5853 } 5854 5855 /* Resize if we allocated to much */ 5856 if (respos<PyBytes_GET_SIZE(res)) 5857 if (_PyBytes_Resize(&res, respos) < 0) 5858 goto onError; 5859 5860 Py_XDECREF(exc); 5861 Py_XDECREF(errorHandler); 5862 return res; 5863 5864 onError: 5865 Py_XDECREF(res); 5866 Py_XDECREF(exc); 5867 Py_XDECREF(errorHandler); 5868 return NULL; 5869} 5870 5871PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 5872 PyObject *mapping) 5873{ 5874 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5875 PyErr_BadArgument(); 5876 return NULL; 5877 } 5878 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5879 PyUnicode_GET_SIZE(unicode), 5880 mapping, 5881 NULL); 5882} 5883 5884/* create or adjust a UnicodeTranslateError */ 5885static void make_translate_exception(PyObject **exceptionObject, 5886 const Py_UNICODE *unicode, Py_ssize_t size, 5887 Py_ssize_t startpos, Py_ssize_t endpos, 5888 const char *reason) 5889{ 5890 if (*exceptionObject == NULL) { 5891 *exceptionObject = PyUnicodeTranslateError_Create( 5892 unicode, size, startpos, endpos, reason); 5893 } 5894 else { 5895 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5896 goto onError; 5897 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5898 goto onError; 5899 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5900 goto onError; 5901 return; 5902 onError: 5903 Py_DECREF(*exceptionObject); 5904 *exceptionObject = NULL; 5905 } 5906} 5907 5908/* 
raises a UnicodeTranslateError */ 5909static void raise_translate_exception(PyObject **exceptionObject, 5910 const Py_UNICODE *unicode, Py_ssize_t size, 5911 Py_ssize_t startpos, Py_ssize_t endpos, 5912 const char *reason) 5913{ 5914 make_translate_exception(exceptionObject, 5915 unicode, size, startpos, endpos, reason); 5916 if (*exceptionObject != NULL) 5917 PyCodec_StrictErrors(*exceptionObject); 5918} 5919 5920/* error handling callback helper: 5921 build arguments, call the callback and check the arguments, 5922 put the result into newpos and return the replacement string, which 5923 has to be freed by the caller */ 5924static PyObject *unicode_translate_call_errorhandler(const char *errors, 5925 PyObject **errorHandler, 5926 const char *reason, 5927 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5928 Py_ssize_t startpos, Py_ssize_t endpos, 5929 Py_ssize_t *newpos) 5930{ 5931 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5932 5933 Py_ssize_t i_newpos; 5934 PyObject *restuple; 5935 PyObject *resunicode; 5936 5937 if (*errorHandler == NULL) { 5938 *errorHandler = PyCodec_LookupError(errors); 5939 if (*errorHandler == NULL) 5940 return NULL; 5941 } 5942 5943 make_translate_exception(exceptionObject, 5944 unicode, size, startpos, endpos, reason); 5945 if (*exceptionObject == NULL) 5946 return NULL; 5947 5948 restuple = PyObject_CallFunctionObjArgs( 5949 *errorHandler, *exceptionObject, NULL); 5950 if (restuple == NULL) 5951 return NULL; 5952 if (!PyTuple_Check(restuple)) { 5953 PyErr_SetString(PyExc_TypeError, &argparse[4]); 5954 Py_DECREF(restuple); 5955 return NULL; 5956 } 5957 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 5958 &resunicode, &i_newpos)) { 5959 Py_DECREF(restuple); 5960 return NULL; 5961 } 5962 if (i_newpos<0) 5963 *newpos = size+i_newpos; 5964 else 5965 *newpos = i_newpos; 5966 if (*newpos<0 || *newpos>size) { 5967 PyErr_Format(PyExc_IndexError, "position %zd from error 
handler out of bounds", *newpos); 5968 Py_DECREF(restuple); 5969 return NULL; 5970 } 5971 Py_INCREF(resunicode); 5972 Py_DECREF(restuple); 5973 return resunicode; 5974} 5975 5976/* Lookup the character ch in the mapping and put the result in result, 5977 which must be decrefed by the caller. 5978 Return 0 on success, -1 on error */ 5979static 5980int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5981{ 5982 PyObject *w = PyLong_FromLong((long)c); 5983 PyObject *x; 5984 5985 if (w == NULL) 5986 return -1; 5987 x = PyObject_GetItem(mapping, w); 5988 Py_DECREF(w); 5989 if (x == NULL) { 5990 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5991 /* No mapping found means: use 1:1 mapping. */ 5992 PyErr_Clear(); 5993 *result = NULL; 5994 return 0; 5995 } else 5996 return -1; 5997 } 5998 else if (x == Py_None) { 5999 *result = x; 6000 return 0; 6001 } 6002 else if (PyLong_Check(x)) { 6003 long value = PyLong_AS_LONG(x); 6004 long max = PyUnicode_GetMax(); 6005 if (value < 0 || value > max) { 6006 PyErr_Format(PyExc_TypeError, 6007 "character mapping must be in range(0x%x)", max+1); 6008 Py_DECREF(x); 6009 return -1; 6010 } 6011 *result = x; 6012 return 0; 6013 } 6014 else if (PyUnicode_Check(x)) { 6015 *result = x; 6016 return 0; 6017 } 6018 else { 6019 /* wrong return value */ 6020 PyErr_SetString(PyExc_TypeError, 6021 "character mapping must return integer, None or str"); 6022 Py_DECREF(x); 6023 return -1; 6024 } 6025} 6026/* ensure that *outobj is at least requiredsize characters long, 6027 if not reallocate and adjust various state variables. 
6028 Return 0 on success, -1 on error */ 6029static 6030int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 6031 Py_ssize_t requiredsize) 6032{ 6033 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 6034 if (requiredsize > oldsize) { 6035 /* remember old output position */ 6036 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 6037 /* exponentially overallocate to minimize reallocations */ 6038 if (requiredsize < 2 * oldsize) 6039 requiredsize = 2 * oldsize; 6040 if (PyUnicode_Resize(outobj, requiredsize) < 0) 6041 return -1; 6042 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 6043 } 6044 return 0; 6045} 6046/* lookup the character, put the result in the output string and adjust 6047 various state variables. Return a new reference to the object that 6048 was put in the output buffer in *result, or Py_None, if the mapping was 6049 undefined (in which case no character was written). 6050 The called must decref result. 6051 Return 0 on success, -1 on error. */ 6052static 6053int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 6054 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 6055 PyObject **res) 6056{ 6057 if (charmaptranslate_lookup(*curinp, mapping, res)) 6058 return -1; 6059 if (*res==NULL) { 6060 /* not found => default to 1:1 mapping */ 6061 *(*outp)++ = *curinp; 6062 } 6063 else if (*res==Py_None) 6064 ; 6065 else if (PyLong_Check(*res)) { 6066 /* no overflow check, because we know that the space is enough */ 6067 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 6068 } 6069 else if (PyUnicode_Check(*res)) { 6070 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 6071 if (repsize==1) { 6072 /* no overflow check, because we know that the space is enough */ 6073 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 6074 } 6075 else if (repsize!=0) { 6076 /* more than one character */ 6077 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 6078 (insize - (curinp-startinp)) + 6079 
repsize - 1; 6080 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 6081 return -1; 6082 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 6083 *outp += repsize; 6084 } 6085 } 6086 else 6087 return -1; 6088 return 0; 6089} 6090 6091PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 6092 Py_ssize_t size, 6093 PyObject *mapping, 6094 const char *errors) 6095{ 6096 /* output object */ 6097 PyObject *res = NULL; 6098 /* pointers to the beginning and end+1 of input */ 6099 const Py_UNICODE *startp = p; 6100 const Py_UNICODE *endp = p + size; 6101 /* pointer into the output */ 6102 Py_UNICODE *str; 6103 /* current output position */ 6104 Py_ssize_t respos = 0; 6105 char *reason = "character maps to <undefined>"; 6106 PyObject *errorHandler = NULL; 6107 PyObject *exc = NULL; 6108 /* the following variable is used for caching string comparisons 6109 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 6110 * 3=ignore, 4=xmlcharrefreplace */ 6111 int known_errorHandler = -1; 6112 6113 if (mapping == NULL) { 6114 PyErr_BadArgument(); 6115 return NULL; 6116 } 6117 6118 /* allocate enough for a simple 1:1 translation without 6119 replacements, if we need more, we'll resize */ 6120 res = PyUnicode_FromUnicode(NULL, size); 6121 if (res == NULL) 6122 goto onError; 6123 if (size == 0) 6124 return res; 6125 str = PyUnicode_AS_UNICODE(res); 6126 6127 while (p<endp) { 6128 /* try to encode it */ 6129 PyObject *x = NULL; 6130 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 6131 Py_XDECREF(x); 6132 goto onError; 6133 } 6134 Py_XDECREF(x); 6135 if (x!=Py_None) /* it worked => adjust input pointer */ 6136 ++p; 6137 else { /* untranslatable character */ 6138 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 6139 Py_ssize_t repsize; 6140 Py_ssize_t newpos; 6141 Py_UNICODE *uni2; 6142 /* startpos for collecting untranslatable chars */ 6143 const Py_UNICODE *collstart = p; 6144 const Py_UNICODE *collend = p+1; 
6145 const Py_UNICODE *coll; 6146 6147 /* find all untranslatable characters */ 6148 while (collend < endp) { 6149 if (charmaptranslate_lookup(*collend, mapping, &x)) 6150 goto onError; 6151 Py_XDECREF(x); 6152 if (x!=Py_None) 6153 break; 6154 ++collend; 6155 } 6156 /* cache callback name lookup 6157 * (if not done yet, i.e. it's the first error) */ 6158 if (known_errorHandler==-1) { 6159 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6160 known_errorHandler = 1; 6161 else if (!strcmp(errors, "replace")) 6162 known_errorHandler = 2; 6163 else if (!strcmp(errors, "ignore")) 6164 known_errorHandler = 3; 6165 else if (!strcmp(errors, "xmlcharrefreplace")) 6166 known_errorHandler = 4; 6167 else 6168 known_errorHandler = 0; 6169 } 6170 switch (known_errorHandler) { 6171 case 1: /* strict */ 6172 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 6173 goto onError; 6174 case 2: /* replace */ 6175 /* No need to check for space, this is a 1:1 replacement */ 6176 for (coll = collstart; coll<collend; ++coll) 6177 *str++ = '?'; 6178 /* fall through */ 6179 case 3: /* ignore */ 6180 p = collend; 6181 break; 6182 case 4: /* xmlcharrefreplace */ 6183 /* generate replacement (temporarily (mis)uses p) */ 6184 for (p = collstart; p < collend; ++p) { 6185 char buffer[2+29+1+1]; 6186 char *cp; 6187 sprintf(buffer, "&#%d;", (int)*p); 6188 if (charmaptranslate_makespace(&res, &str, 6189 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 6190 goto onError; 6191 for (cp = buffer; *cp; ++cp) 6192 *str++ = *cp; 6193 } 6194 p = collend; 6195 break; 6196 default: 6197 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 6198 reason, startp, size, &exc, 6199 collstart-startp, collend-startp, &newpos); 6200 if (repunicode == NULL) 6201 goto onError; 6202 /* generate replacement */ 6203 repsize = PyUnicode_GET_SIZE(repunicode); 6204 if (charmaptranslate_makespace(&res, &str, 6205 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 6206 Py_DECREF(repunicode); 6207 goto onError; 6208 } 6209 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 6210 *str++ = *uni2; 6211 p = startp + newpos; 6212 Py_DECREF(repunicode); 6213 } 6214 } 6215 } 6216 /* Resize if we allocated to much */ 6217 respos = str-PyUnicode_AS_UNICODE(res); 6218 if (respos<PyUnicode_GET_SIZE(res)) { 6219 if (PyUnicode_Resize(&res, respos) < 0) 6220 goto onError; 6221 } 6222 Py_XDECREF(exc); 6223 Py_XDECREF(errorHandler); 6224 return res; 6225 6226 onError: 6227 Py_XDECREF(res); 6228 Py_XDECREF(exc); 6229 Py_XDECREF(errorHandler); 6230 return NULL; 6231} 6232 6233PyObject *PyUnicode_Translate(PyObject *str, 6234 PyObject *mapping, 6235 const char *errors) 6236{ 6237 PyObject *result; 6238 6239 str = PyUnicode_FromObject(str); 6240 if (str == NULL) 6241 goto onError; 6242 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 6243 PyUnicode_GET_SIZE(str), 6244 mapping, 6245 errors); 6246 Py_DECREF(str); 6247 return result; 6248 6249 onError: 6250 Py_XDECREF(str); 6251 return NULL; 6252} 6253 6254PyObject * 6255PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 6256 Py_ssize_t length) 6257{ 6258 PyObject *result; 6259 Py_UNICODE *p; /* write pointer into result */ 6260 Py_ssize_t i; 6261 /* Copy to a new string */ 6262 result = (PyObject *)_PyUnicode_New(length); 6263 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 6264 if (result == NULL) 6265 return result; 6266 p = PyUnicode_AS_UNICODE(result); 6267 /* Iterate over code points */ 6268 for (i = 0; i < length; i++) { 6269 Py_UNICODE ch =s[i]; 6270 if (ch > 127) { 6271 int decimal = Py_UNICODE_TODECIMAL(ch); 6272 if (decimal >= 0) 6273 p[i] = '0' + decimal; 6274 } 6275 } 6276 return result; 6277} 6278/* --- Decimal Encoder ---------------------------------------------------- */ 6279 6280int PyUnicode_EncodeDecimal(Py_UNICODE *s, 6281 Py_ssize_t length, 6282 char *output, 6283 const char 
*errors) 6284{ 6285 Py_UNICODE *p, *end; 6286 PyObject *errorHandler = NULL; 6287 PyObject *exc = NULL; 6288 const char *encoding = "decimal"; 6289 const char *reason = "invalid decimal Unicode string"; 6290 /* the following variable is used for caching string comparisons 6291 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6292 int known_errorHandler = -1; 6293 6294 if (output == NULL) { 6295 PyErr_BadArgument(); 6296 return -1; 6297 } 6298 6299 p = s; 6300 end = s + length; 6301 while (p < end) { 6302 register Py_UNICODE ch = *p; 6303 int decimal; 6304 PyObject *repunicode; 6305 Py_ssize_t repsize; 6306 Py_ssize_t newpos; 6307 Py_UNICODE *uni2; 6308 Py_UNICODE *collstart; 6309 Py_UNICODE *collend; 6310 6311 if (Py_UNICODE_ISSPACE(ch)) { 6312 *output++ = ' '; 6313 ++p; 6314 continue; 6315 } 6316 decimal = Py_UNICODE_TODECIMAL(ch); 6317 if (decimal >= 0) { 6318 *output++ = '0' + decimal; 6319 ++p; 6320 continue; 6321 } 6322 if (0 < ch && ch < 256) { 6323 *output++ = (char)ch; 6324 ++p; 6325 continue; 6326 } 6327 /* All other characters are considered unencodable */ 6328 collstart = p; 6329 for (collend = p+1; collend < end; collend++) { 6330 if ((0 < *collend && *collend < 256) || 6331 Py_UNICODE_ISSPACE(*collend) || 6332 0 <= Py_UNICODE_TODECIMAL(*collend)) 6333 break; 6334 } 6335 /* cache callback name lookup 6336 * (if not done yet, i.e. 
it's the first error) */ 6337 if (known_errorHandler==-1) { 6338 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6339 known_errorHandler = 1; 6340 else if (!strcmp(errors, "replace")) 6341 known_errorHandler = 2; 6342 else if (!strcmp(errors, "ignore")) 6343 known_errorHandler = 3; 6344 else if (!strcmp(errors, "xmlcharrefreplace")) 6345 known_errorHandler = 4; 6346 else 6347 known_errorHandler = 0; 6348 } 6349 switch (known_errorHandler) { 6350 case 1: /* strict */ 6351 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 6352 goto onError; 6353 case 2: /* replace */ 6354 for (p = collstart; p < collend; ++p) 6355 *output++ = '?'; 6356 /* fall through */ 6357 case 3: /* ignore */ 6358 p = collend; 6359 break; 6360 case 4: /* xmlcharrefreplace */ 6361 /* generate replacement (temporarily (mis)uses p) */ 6362 for (p = collstart; p < collend; ++p) 6363 output += sprintf(output, "&#%d;", (int)*p); 6364 p = collend; 6365 break; 6366 default: 6367 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6368 encoding, reason, s, length, &exc, 6369 collstart-s, collend-s, &newpos); 6370 if (repunicode == NULL) 6371 goto onError; 6372 if (!PyUnicode_Check(repunicode)) { 6373 /* Byte results not supported, since they have no decimal property. 
*/ 6374 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 6375 Py_DECREF(repunicode); 6376 goto onError; 6377 } 6378 /* generate replacement */ 6379 repsize = PyUnicode_GET_SIZE(repunicode); 6380 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 6381 Py_UNICODE ch = *uni2; 6382 if (Py_UNICODE_ISSPACE(ch)) 6383 *output++ = ' '; 6384 else { 6385 decimal = Py_UNICODE_TODECIMAL(ch); 6386 if (decimal >= 0) 6387 *output++ = '0' + decimal; 6388 else if (0 < ch && ch < 256) 6389 *output++ = (char)ch; 6390 else { 6391 Py_DECREF(repunicode); 6392 raise_encode_exception(&exc, encoding, 6393 s, length, collstart-s, collend-s, reason); 6394 goto onError; 6395 } 6396 } 6397 } 6398 p = s + newpos; 6399 Py_DECREF(repunicode); 6400 } 6401 } 6402 /* 0-terminate the output string */ 6403 *output++ = '\0'; 6404 Py_XDECREF(exc); 6405 Py_XDECREF(errorHandler); 6406 return 0; 6407 6408 onError: 6409 Py_XDECREF(exc); 6410 Py_XDECREF(errorHandler); 6411 return -1; 6412} 6413 6414/* --- Helpers ------------------------------------------------------------ */ 6415 6416#include "stringlib/unicodedefs.h" 6417#include "stringlib/fastsearch.h" 6418 6419#include "stringlib/count.h" 6420#include "stringlib/find.h" 6421#include "stringlib/partition.h" 6422#include "stringlib/split.h" 6423 6424#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 6425#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale 6426#include "stringlib/localeutil.h" 6427 6428/* helper macro to fixup start/end slice values */ 6429#define ADJUST_INDICES(start, end, len) \ 6430 if (end > len) \ 6431 end = len; \ 6432 else if (end < 0) { \ 6433 end += len; \ 6434 if (end < 0) \ 6435 end = 0; \ 6436 } \ 6437 if (start < 0) { \ 6438 start += len; \ 6439 if (start < 0) \ 6440 start = 0; \ 6441 } 6442 6443/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed 6444 * by 'ptr', possibly combining surrogate pairs on narrow 
builds. 6445 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character 6446 * that should be returned and 'end' pointing to the end of the buffer. 6447 * ('end' is used on narrow builds to detect a lone surrogate at the 6448 * end of the buffer that should be returned unchanged.) 6449 * The ptr and end arguments should be side-effect free and ptr must an lvalue. 6450 * The type of the returned char is always Py_UCS4. 6451 * 6452 * Note: the macro advances ptr to next char, so it might have side-effects 6453 * (especially if used with other macros). 6454 */ 6455 6456/* helper macros used by _Py_UNICODE_NEXT */ 6457#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 6458#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 6459/* Join two surrogate characters and return a single Py_UCS4 value. */ 6460#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ 6461 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 6462 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 6463 6464#ifdef Py_UNICODE_WIDE 6465#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ 6466#else 6467#define _Py_UNICODE_NEXT(ptr, end) \ 6468 (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ 6469 _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? 
\ 6470 ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ 6471 (Py_UCS4)*(ptr)++) 6472#endif 6473 6474Py_ssize_t PyUnicode_Count(PyObject *str, 6475 PyObject *substr, 6476 Py_ssize_t start, 6477 Py_ssize_t end) 6478{ 6479 Py_ssize_t result; 6480 PyUnicodeObject* str_obj; 6481 PyUnicodeObject* sub_obj; 6482 6483 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6484 if (!str_obj) 6485 return -1; 6486 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6487 if (!sub_obj) { 6488 Py_DECREF(str_obj); 6489 return -1; 6490 } 6491 6492 ADJUST_INDICES(start, end, str_obj->length); 6493 result = stringlib_count( 6494 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6495 PY_SSIZE_T_MAX 6496 ); 6497 6498 Py_DECREF(sub_obj); 6499 Py_DECREF(str_obj); 6500 6501 return result; 6502} 6503 6504Py_ssize_t PyUnicode_Find(PyObject *str, 6505 PyObject *sub, 6506 Py_ssize_t start, 6507 Py_ssize_t end, 6508 int direction) 6509{ 6510 Py_ssize_t result; 6511 6512 str = PyUnicode_FromObject(str); 6513 if (!str) 6514 return -2; 6515 sub = PyUnicode_FromObject(sub); 6516 if (!sub) { 6517 Py_DECREF(str); 6518 return -2; 6519 } 6520 6521 if (direction > 0) 6522 result = stringlib_find_slice( 6523 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6524 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6525 start, end 6526 ); 6527 else 6528 result = stringlib_rfind_slice( 6529 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6530 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6531 start, end 6532 ); 6533 6534 Py_DECREF(str); 6535 Py_DECREF(sub); 6536 6537 return result; 6538} 6539 6540static 6541int tailmatch(PyUnicodeObject *self, 6542 PyUnicodeObject *substring, 6543 Py_ssize_t start, 6544 Py_ssize_t end, 6545 int direction) 6546{ 6547 if (substring->length == 0) 6548 return 1; 6549 6550 ADJUST_INDICES(start, end, self->length); 6551 end -= substring->length; 6552 if (end < start) 6553 return 0; 6554 6555 if (direction > 0) { 6556 if 
(Py_UNICODE_MATCH(self, end, substring)) 6557 return 1; 6558 } else { 6559 if (Py_UNICODE_MATCH(self, start, substring)) 6560 return 1; 6561 } 6562 6563 return 0; 6564} 6565 6566Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 6567 PyObject *substr, 6568 Py_ssize_t start, 6569 Py_ssize_t end, 6570 int direction) 6571{ 6572 Py_ssize_t result; 6573 6574 str = PyUnicode_FromObject(str); 6575 if (str == NULL) 6576 return -1; 6577 substr = PyUnicode_FromObject(substr); 6578 if (substr == NULL) { 6579 Py_DECREF(str); 6580 return -1; 6581 } 6582 6583 result = tailmatch((PyUnicodeObject *)str, 6584 (PyUnicodeObject *)substr, 6585 start, end, direction); 6586 Py_DECREF(str); 6587 Py_DECREF(substr); 6588 return result; 6589} 6590 6591/* Apply fixfct filter to the Unicode object self and return a 6592 reference to the modified object */ 6593 6594static 6595PyObject *fixup(PyUnicodeObject *self, 6596 int (*fixfct)(PyUnicodeObject *s)) 6597{ 6598 6599 PyUnicodeObject *u; 6600 6601 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6602 if (u == NULL) 6603 return NULL; 6604 6605 Py_UNICODE_COPY(u->str, self->str, self->length); 6606 6607 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6608 /* fixfct should return TRUE if it modified the buffer. 
If 6609 FALSE, return a reference to the original buffer instead 6610 (to save space, not time) */ 6611 Py_INCREF(self); 6612 Py_DECREF(u); 6613 return (PyObject*) self; 6614 } 6615 return (PyObject*) u; 6616} 6617 6618static 6619int fixupper(PyUnicodeObject *self) 6620{ 6621 Py_ssize_t len = self->length; 6622 Py_UNICODE *s = self->str; 6623 int status = 0; 6624 6625 while (len-- > 0) { 6626 register Py_UNICODE ch; 6627 6628 ch = Py_UNICODE_TOUPPER(*s); 6629 if (ch != *s) { 6630 status = 1; 6631 *s = ch; 6632 } 6633 s++; 6634 } 6635 6636 return status; 6637} 6638 6639static 6640int fixlower(PyUnicodeObject *self) 6641{ 6642 Py_ssize_t len = self->length; 6643 Py_UNICODE *s = self->str; 6644 int status = 0; 6645 6646 while (len-- > 0) { 6647 register Py_UNICODE ch; 6648 6649 ch = Py_UNICODE_TOLOWER(*s); 6650 if (ch != *s) { 6651 status = 1; 6652 *s = ch; 6653 } 6654 s++; 6655 } 6656 6657 return status; 6658} 6659 6660static 6661int fixswapcase(PyUnicodeObject *self) 6662{ 6663 Py_ssize_t len = self->length; 6664 Py_UNICODE *s = self->str; 6665 int status = 0; 6666 6667 while (len-- > 0) { 6668 if (Py_UNICODE_ISUPPER(*s)) { 6669 *s = Py_UNICODE_TOLOWER(*s); 6670 status = 1; 6671 } else if (Py_UNICODE_ISLOWER(*s)) { 6672 *s = Py_UNICODE_TOUPPER(*s); 6673 status = 1; 6674 } 6675 s++; 6676 } 6677 6678 return status; 6679} 6680 6681static 6682int fixcapitalize(PyUnicodeObject *self) 6683{ 6684 Py_ssize_t len = self->length; 6685 Py_UNICODE *s = self->str; 6686 int status = 0; 6687 6688 if (len == 0) 6689 return 0; 6690 if (!Py_UNICODE_ISUPPER(*s)) { 6691 *s = Py_UNICODE_TOUPPER(*s); 6692 status = 1; 6693 } 6694 s++; 6695 while (--len > 0) { 6696 if (!Py_UNICODE_ISLOWER(*s)) { 6697 *s = Py_UNICODE_TOLOWER(*s); 6698 status = 1; 6699 } 6700 s++; 6701 } 6702 return status; 6703} 6704 6705static 6706int fixtitle(PyUnicodeObject *self) 6707{ 6708 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6709 register Py_UNICODE *e; 6710 int previous_is_cased; 6711 6712 /* Shortcut 
for single character strings */ 6713 if (PyUnicode_GET_SIZE(self) == 1) { 6714 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6715 if (*p != ch) { 6716 *p = ch; 6717 return 1; 6718 } 6719 else 6720 return 0; 6721 } 6722 6723 e = p + PyUnicode_GET_SIZE(self); 6724 previous_is_cased = 0; 6725 for (; p < e; p++) { 6726 register const Py_UNICODE ch = *p; 6727 6728 if (previous_is_cased) 6729 *p = Py_UNICODE_TOLOWER(ch); 6730 else 6731 *p = Py_UNICODE_TOTITLE(ch); 6732 6733 if (Py_UNICODE_ISLOWER(ch) || 6734 Py_UNICODE_ISUPPER(ch) || 6735 Py_UNICODE_ISTITLE(ch)) 6736 previous_is_cased = 1; 6737 else 6738 previous_is_cased = 0; 6739 } 6740 return 1; 6741} 6742 6743PyObject * 6744PyUnicode_Join(PyObject *separator, PyObject *seq) 6745{ 6746 const Py_UNICODE blank = ' '; 6747 const Py_UNICODE *sep = ␣ 6748 Py_ssize_t seplen = 1; 6749 PyUnicodeObject *res = NULL; /* the result */ 6750 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6751 PyObject *fseq; /* PySequence_Fast(seq) */ 6752 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6753 PyObject **items; 6754 PyObject *item; 6755 Py_ssize_t sz, i; 6756 6757 fseq = PySequence_Fast(seq, ""); 6758 if (fseq == NULL) { 6759 return NULL; 6760 } 6761 6762 /* NOTE: the following code can't call back into Python code, 6763 * so we are sure that fseq won't be mutated. 6764 */ 6765 6766 seqlen = PySequence_Fast_GET_SIZE(fseq); 6767 /* If empty sequence, return u"". */ 6768 if (seqlen == 0) { 6769 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6770 goto Done; 6771 } 6772 items = PySequence_Fast_ITEMS(fseq); 6773 /* If singleton sequence with an exact Unicode, return that. 
*/ 6774 if (seqlen == 1) { 6775 item = items[0]; 6776 if (PyUnicode_CheckExact(item)) { 6777 Py_INCREF(item); 6778 res = (PyUnicodeObject *)item; 6779 goto Done; 6780 } 6781 } 6782 else { 6783 /* Set up sep and seplen */ 6784 if (separator == NULL) { 6785 sep = ␣ 6786 seplen = 1; 6787 } 6788 else { 6789 if (!PyUnicode_Check(separator)) { 6790 PyErr_Format(PyExc_TypeError, 6791 "separator: expected str instance," 6792 " %.80s found", 6793 Py_TYPE(separator)->tp_name); 6794 goto onError; 6795 } 6796 sep = PyUnicode_AS_UNICODE(separator); 6797 seplen = PyUnicode_GET_SIZE(separator); 6798 } 6799 } 6800 6801 /* There are at least two things to join, or else we have a subclass 6802 * of str in the sequence. 6803 * Do a pre-pass to figure out the total amount of space we'll 6804 * need (sz), and see whether all argument are strings. 6805 */ 6806 sz = 0; 6807 for (i = 0; i < seqlen; i++) { 6808 const Py_ssize_t old_sz = sz; 6809 item = items[i]; 6810 if (!PyUnicode_Check(item)) { 6811 PyErr_Format(PyExc_TypeError, 6812 "sequence item %zd: expected str instance," 6813 " %.80s found", 6814 i, Py_TYPE(item)->tp_name); 6815 goto onError; 6816 } 6817 sz += PyUnicode_GET_SIZE(item); 6818 if (i != 0) 6819 sz += seplen; 6820 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6821 PyErr_SetString(PyExc_OverflowError, 6822 "join() result is too long for a Python string"); 6823 goto onError; 6824 } 6825 } 6826 6827 res = _PyUnicode_New(sz); 6828 if (res == NULL) 6829 goto onError; 6830 6831 /* Catenate everything. */ 6832 res_p = PyUnicode_AS_UNICODE(res); 6833 for (i = 0; i < seqlen; ++i) { 6834 Py_ssize_t itemlen; 6835 item = items[i]; 6836 itemlen = PyUnicode_GET_SIZE(item); 6837 /* Copy item, and maybe the separator. 
*/ 6838 if (i) { 6839 Py_UNICODE_COPY(res_p, sep, seplen); 6840 res_p += seplen; 6841 } 6842 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6843 res_p += itemlen; 6844 } 6845 6846 Done: 6847 Py_DECREF(fseq); 6848 return (PyObject *)res; 6849 6850 onError: 6851 Py_DECREF(fseq); 6852 Py_XDECREF(res); 6853 return NULL; 6854} 6855 6856static 6857PyUnicodeObject *pad(PyUnicodeObject *self, 6858 Py_ssize_t left, 6859 Py_ssize_t right, 6860 Py_UNICODE fill) 6861{ 6862 PyUnicodeObject *u; 6863 6864 if (left < 0) 6865 left = 0; 6866 if (right < 0) 6867 right = 0; 6868 6869 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6870 Py_INCREF(self); 6871 return self; 6872 } 6873 6874 if (left > PY_SSIZE_T_MAX - self->length || 6875 right > PY_SSIZE_T_MAX - (left + self->length)) { 6876 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6877 return NULL; 6878 } 6879 u = _PyUnicode_New(left + self->length + right); 6880 if (u) { 6881 if (left) 6882 Py_UNICODE_FILL(u->str, fill, left); 6883 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6884 if (right) 6885 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6886 } 6887 6888 return u; 6889} 6890 6891PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 6892{ 6893 PyObject *list; 6894 6895 string = PyUnicode_FromObject(string); 6896 if (string == NULL) 6897 return NULL; 6898 6899 list = stringlib_splitlines( 6900 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6901 PyUnicode_GET_SIZE(string), keepends); 6902 6903 Py_DECREF(string); 6904 return list; 6905} 6906 6907static 6908PyObject *split(PyUnicodeObject *self, 6909 PyUnicodeObject *substring, 6910 Py_ssize_t maxcount) 6911{ 6912 if (maxcount < 0) 6913 maxcount = PY_SSIZE_T_MAX; 6914 6915 if (substring == NULL) 6916 return stringlib_split_whitespace( 6917 (PyObject*) self, self->str, self->length, maxcount 6918 ); 6919 6920 return stringlib_split( 6921 (PyObject*) self, self->str, self->length, 6922 
substring->str, substring->length, 6923 maxcount 6924 ); 6925} 6926 6927static 6928PyObject *rsplit(PyUnicodeObject *self, 6929 PyUnicodeObject *substring, 6930 Py_ssize_t maxcount) 6931{ 6932 if (maxcount < 0) 6933 maxcount = PY_SSIZE_T_MAX; 6934 6935 if (substring == NULL) 6936 return stringlib_rsplit_whitespace( 6937 (PyObject*) self, self->str, self->length, maxcount 6938 ); 6939 6940 return stringlib_rsplit( 6941 (PyObject*) self, self->str, self->length, 6942 substring->str, substring->length, 6943 maxcount 6944 ); 6945} 6946 6947static 6948PyObject *replace(PyUnicodeObject *self, 6949 PyUnicodeObject *str1, 6950 PyUnicodeObject *str2, 6951 Py_ssize_t maxcount) 6952{ 6953 PyUnicodeObject *u; 6954 6955 if (maxcount < 0) 6956 maxcount = PY_SSIZE_T_MAX; 6957 else if (maxcount == 0 || self->length == 0) 6958 goto nothing; 6959 6960 if (str1->length == str2->length) { 6961 Py_ssize_t i; 6962 /* same length */ 6963 if (str1->length == 0) 6964 goto nothing; 6965 if (str1->length == 1) { 6966 /* replace characters */ 6967 Py_UNICODE u1, u2; 6968 if (!findchar(self->str, self->length, str1->str[0])) 6969 goto nothing; 6970 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6971 if (!u) 6972 return NULL; 6973 Py_UNICODE_COPY(u->str, self->str, self->length); 6974 u1 = str1->str[0]; 6975 u2 = str2->str[0]; 6976 for (i = 0; i < u->length; i++) 6977 if (u->str[i] == u1) { 6978 if (--maxcount < 0) 6979 break; 6980 u->str[i] = u2; 6981 } 6982 } else { 6983 i = stringlib_find( 6984 self->str, self->length, str1->str, str1->length, 0 6985 ); 6986 if (i < 0) 6987 goto nothing; 6988 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6989 if (!u) 6990 return NULL; 6991 Py_UNICODE_COPY(u->str, self->str, self->length); 6992 6993 /* change everything in-place, starting with this one */ 6994 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6995 i += str1->length; 6996 6997 while ( --maxcount > 0) { 6998 i = stringlib_find(self->str+i, 
self->length-i, 6999 str1->str, str1->length, 7000 i); 7001 if (i == -1) 7002 break; 7003 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 7004 i += str1->length; 7005 } 7006 } 7007 } else { 7008 7009 Py_ssize_t n, i, j; 7010 Py_ssize_t product, new_size, delta; 7011 Py_UNICODE *p; 7012 7013 /* replace strings */ 7014 n = stringlib_count(self->str, self->length, str1->str, str1->length, 7015 maxcount); 7016 if (n == 0) 7017 goto nothing; 7018 /* new_size = self->length + n * (str2->length - str1->length)); */ 7019 delta = (str2->length - str1->length); 7020 if (delta == 0) { 7021 new_size = self->length; 7022 } else { 7023 product = n * (str2->length - str1->length); 7024 if ((product / (str2->length - str1->length)) != n) { 7025 PyErr_SetString(PyExc_OverflowError, 7026 "replace string is too long"); 7027 return NULL; 7028 } 7029 new_size = self->length + product; 7030 if (new_size < 0) { 7031 PyErr_SetString(PyExc_OverflowError, 7032 "replace string is too long"); 7033 return NULL; 7034 } 7035 } 7036 u = _PyUnicode_New(new_size); 7037 if (!u) 7038 return NULL; 7039 i = 0; 7040 p = u->str; 7041 if (str1->length > 0) { 7042 while (n-- > 0) { 7043 /* look for next match */ 7044 j = stringlib_find(self->str+i, self->length-i, 7045 str1->str, str1->length, 7046 i); 7047 if (j == -1) 7048 break; 7049 else if (j > i) { 7050 /* copy unchanged part [i:j] */ 7051 Py_UNICODE_COPY(p, self->str+i, j-i); 7052 p += j - i; 7053 } 7054 /* copy substitution string */ 7055 if (str2->length > 0) { 7056 Py_UNICODE_COPY(p, str2->str, str2->length); 7057 p += str2->length; 7058 } 7059 i = j + str1->length; 7060 } 7061 if (i < self->length) 7062 /* copy tail [i:] */ 7063 Py_UNICODE_COPY(p, self->str+i, self->length-i); 7064 } else { 7065 /* interleave */ 7066 while (n > 0) { 7067 Py_UNICODE_COPY(p, str2->str, str2->length); 7068 p += str2->length; 7069 if (--n <= 0) 7070 break; 7071 *p++ = self->str[i++]; 7072 } 7073 Py_UNICODE_COPY(p, self->str+i, self->length-i); 7074 } 7075 } 7076 
return (PyObject *) u; 7077 7078 nothing: 7079 /* nothing to replace; return original string (when possible) */ 7080 if (PyUnicode_CheckExact(self)) { 7081 Py_INCREF(self); 7082 return (PyObject *) self; 7083 } 7084 return PyUnicode_FromUnicode(self->str, self->length); 7085} 7086 7087/* --- Unicode Object Methods --------------------------------------------- */ 7088 7089PyDoc_STRVAR(title__doc__, 7090 "S.title() -> str\n\ 7091\n\ 7092Return a titlecased version of S, i.e. words start with title case\n\ 7093characters, all remaining cased characters have lower case."); 7094 7095static PyObject* 7096unicode_title(PyUnicodeObject *self) 7097{ 7098 return fixup(self, fixtitle); 7099} 7100 7101PyDoc_STRVAR(capitalize__doc__, 7102 "S.capitalize() -> str\n\ 7103\n\ 7104Return a capitalized version of S, i.e. make the first character\n\ 7105have upper case and the rest lower case."); 7106 7107static PyObject* 7108unicode_capitalize(PyUnicodeObject *self) 7109{ 7110 return fixup(self, fixcapitalize); 7111} 7112 7113#if 0 7114PyDoc_STRVAR(capwords__doc__, 7115 "S.capwords() -> str\n\ 7116\n\ 7117Apply .capitalize() to all words in S and return the result with\n\ 7118normalized whitespace (all whitespace strings are replaced by ' ')."); 7119 7120static PyObject* 7121unicode_capwords(PyUnicodeObject *self) 7122{ 7123 PyObject *list; 7124 PyObject *item; 7125 Py_ssize_t i; 7126 7127 /* Split into words */ 7128 list = split(self, NULL, -1); 7129 if (!list) 7130 return NULL; 7131 7132 /* Capitalize each word */ 7133 for (i = 0; i < PyList_GET_SIZE(list); i++) { 7134 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 7135 fixcapitalize); 7136 if (item == NULL) 7137 goto onError; 7138 Py_DECREF(PyList_GET_ITEM(list, i)); 7139 PyList_SET_ITEM(list, i, item); 7140 } 7141 7142 /* Join the words to form a new string */ 7143 item = PyUnicode_Join(NULL, list); 7144 7145 onError: 7146 Py_DECREF(list); 7147 return (PyObject *)item; 7148} 7149#endif 7150 7151/* Argument converter. 
Coerces to a single unicode character */ 7152 7153static int 7154convert_uc(PyObject *obj, void *addr) 7155{ 7156 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 7157 PyObject *uniobj; 7158 Py_UNICODE *unistr; 7159 7160 uniobj = PyUnicode_FromObject(obj); 7161 if (uniobj == NULL) { 7162 PyErr_SetString(PyExc_TypeError, 7163 "The fill character cannot be converted to Unicode"); 7164 return 0; 7165 } 7166 if (PyUnicode_GET_SIZE(uniobj) != 1) { 7167 PyErr_SetString(PyExc_TypeError, 7168 "The fill character must be exactly one character long"); 7169 Py_DECREF(uniobj); 7170 return 0; 7171 } 7172 unistr = PyUnicode_AS_UNICODE(uniobj); 7173 *fillcharloc = unistr[0]; 7174 Py_DECREF(uniobj); 7175 return 1; 7176} 7177 7178PyDoc_STRVAR(center__doc__, 7179 "S.center(width[, fillchar]) -> str\n\ 7180\n\ 7181Return S centered in a string of length width. Padding is\n\ 7182done using the specified fill character (default is a space)"); 7183 7184static PyObject * 7185unicode_center(PyUnicodeObject *self, PyObject *args) 7186{ 7187 Py_ssize_t marg, left; 7188 Py_ssize_t width; 7189 Py_UNICODE fillchar = ' '; 7190 7191 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 7192 return NULL; 7193 7194 if (self->length >= width && PyUnicode_CheckExact(self)) { 7195 Py_INCREF(self); 7196 return (PyObject*) self; 7197 } 7198 7199 marg = width - self->length; 7200 left = marg / 2 + (marg & width & 1); 7201 7202 return (PyObject*) pad(self, left, marg - left, fillchar); 7203} 7204 7205#if 0 7206 7207/* This code should go into some future Unicode collation support 7208 module. The basic comparison should compare ordinals on a naive 7209 basis (this is what Java does and thus Jython too). 
*/ 7210 7211/* speedy UTF-16 code point order comparison */ 7212/* gleaned from: */ 7213/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 7214 7215static short utf16Fixup[32] = 7216{ 7217 0, 0, 0, 0, 0, 0, 0, 0, 7218 0, 0, 0, 0, 0, 0, 0, 0, 7219 0, 0, 0, 0, 0, 0, 0, 0, 7220 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 7221}; 7222 7223static int 7224unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7225{ 7226 Py_ssize_t len1, len2; 7227 7228 Py_UNICODE *s1 = str1->str; 7229 Py_UNICODE *s2 = str2->str; 7230 7231 len1 = str1->length; 7232 len2 = str2->length; 7233 7234 while (len1 > 0 && len2 > 0) { 7235 Py_UNICODE c1, c2; 7236 7237 c1 = *s1++; 7238 c2 = *s2++; 7239 7240 if (c1 > (1<<11) * 26) 7241 c1 += utf16Fixup[c1>>11]; 7242 if (c2 > (1<<11) * 26) 7243 c2 += utf16Fixup[c2>>11]; 7244 /* now c1 and c2 are in UTF-32-compatible order */ 7245 7246 if (c1 != c2) 7247 return (c1 < c2) ? -1 : 1; 7248 7249 len1--; len2--; 7250 } 7251 7252 return (len1 < len2) ? -1 : (len1 != len2); 7253} 7254 7255#else 7256 7257static int 7258unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7259{ 7260 register Py_ssize_t len1, len2; 7261 7262 Py_UNICODE *s1 = str1->str; 7263 Py_UNICODE *s2 = str2->str; 7264 7265 len1 = str1->length; 7266 len2 = str2->length; 7267 7268 while (len1 > 0 && len2 > 0) { 7269 Py_UNICODE c1, c2; 7270 7271 c1 = *s1++; 7272 c2 = *s2++; 7273 7274 if (c1 != c2) 7275 return (c1 < c2) ? -1 : 1; 7276 7277 len1--; len2--; 7278 } 7279 7280 return (len1 < len2) ? 
-1 : (len1 != len2); 7281} 7282 7283#endif 7284 7285int PyUnicode_Compare(PyObject *left, 7286 PyObject *right) 7287{ 7288 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 7289 return unicode_compare((PyUnicodeObject *)left, 7290 (PyUnicodeObject *)right); 7291 PyErr_Format(PyExc_TypeError, 7292 "Can't compare %.100s and %.100s", 7293 left->ob_type->tp_name, 7294 right->ob_type->tp_name); 7295 return -1; 7296} 7297 7298int 7299PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 7300{ 7301 int i; 7302 Py_UNICODE *id; 7303 assert(PyUnicode_Check(uni)); 7304 id = PyUnicode_AS_UNICODE(uni); 7305 /* Compare Unicode string and source character set string */ 7306 for (i = 0; id[i] && str[i]; i++) 7307 if (id[i] != str[i]) 7308 return ((int)id[i] < (int)str[i]) ? -1 : 1; 7309 /* This check keeps Python strings that end in '\0' from comparing equal 7310 to C strings identical up to that point. */ 7311 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 7312 return 1; /* uni is longer */ 7313 if (str[i]) 7314 return -1; /* str is longer */ 7315 return 0; 7316} 7317 7318 7319#define TEST_COND(cond) \ 7320 ((cond) ? 
Py_True : Py_False) 7321 7322PyObject *PyUnicode_RichCompare(PyObject *left, 7323 PyObject *right, 7324 int op) 7325{ 7326 int result; 7327 7328 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 7329 PyObject *v; 7330 if (((PyUnicodeObject *) left)->length != 7331 ((PyUnicodeObject *) right)->length) { 7332 if (op == Py_EQ) { 7333 Py_INCREF(Py_False); 7334 return Py_False; 7335 } 7336 if (op == Py_NE) { 7337 Py_INCREF(Py_True); 7338 return Py_True; 7339 } 7340 } 7341 if (left == right) 7342 result = 0; 7343 else 7344 result = unicode_compare((PyUnicodeObject *)left, 7345 (PyUnicodeObject *)right); 7346 7347 /* Convert the return value to a Boolean */ 7348 switch (op) { 7349 case Py_EQ: 7350 v = TEST_COND(result == 0); 7351 break; 7352 case Py_NE: 7353 v = TEST_COND(result != 0); 7354 break; 7355 case Py_LE: 7356 v = TEST_COND(result <= 0); 7357 break; 7358 case Py_GE: 7359 v = TEST_COND(result >= 0); 7360 break; 7361 case Py_LT: 7362 v = TEST_COND(result == -1); 7363 break; 7364 case Py_GT: 7365 v = TEST_COND(result == 1); 7366 break; 7367 default: 7368 PyErr_BadArgument(); 7369 return NULL; 7370 } 7371 Py_INCREF(v); 7372 return v; 7373 } 7374 7375 Py_INCREF(Py_NotImplemented); 7376 return Py_NotImplemented; 7377} 7378 7379int PyUnicode_Contains(PyObject *container, 7380 PyObject *element) 7381{ 7382 PyObject *str, *sub; 7383 int result; 7384 7385 /* Coerce the two arguments */ 7386 sub = PyUnicode_FromObject(element); 7387 if (!sub) { 7388 PyErr_Format(PyExc_TypeError, 7389 "'in <string>' requires string as left operand, not %s", 7390 element->ob_type->tp_name); 7391 return -1; 7392 } 7393 7394 str = PyUnicode_FromObject(container); 7395 if (!str) { 7396 Py_DECREF(sub); 7397 return -1; 7398 } 7399 7400 result = stringlib_contains_obj(str, sub); 7401 7402 Py_DECREF(str); 7403 Py_DECREF(sub); 7404 7405 return result; 7406} 7407 7408/* Concat to string or Unicode object giving a new Unicode object. 
*/ 7409 7410PyObject *PyUnicode_Concat(PyObject *left, 7411 PyObject *right) 7412{ 7413 PyUnicodeObject *u = NULL, *v = NULL, *w; 7414 7415 /* Coerce the two arguments */ 7416 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7417 if (u == NULL) 7418 goto onError; 7419 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7420 if (v == NULL) 7421 goto onError; 7422 7423 /* Shortcuts */ 7424 if (v == unicode_empty) { 7425 Py_DECREF(v); 7426 return (PyObject *)u; 7427 } 7428 if (u == unicode_empty) { 7429 Py_DECREF(u); 7430 return (PyObject *)v; 7431 } 7432 7433 /* Concat the two Unicode strings */ 7434 w = _PyUnicode_New(u->length + v->length); 7435 if (w == NULL) 7436 goto onError; 7437 Py_UNICODE_COPY(w->str, u->str, u->length); 7438 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7439 7440 Py_DECREF(u); 7441 Py_DECREF(v); 7442 return (PyObject *)w; 7443 7444 onError: 7445 Py_XDECREF(u); 7446 Py_XDECREF(v); 7447 return NULL; 7448} 7449 7450void 7451PyUnicode_Append(PyObject **pleft, PyObject *right) 7452{ 7453 PyObject *new; 7454 if (*pleft == NULL) 7455 return; 7456 if (right == NULL || !PyUnicode_Check(*pleft)) { 7457 Py_DECREF(*pleft); 7458 *pleft = NULL; 7459 return; 7460 } 7461 new = PyUnicode_Concat(*pleft, right); 7462 Py_DECREF(*pleft); 7463 *pleft = new; 7464} 7465 7466void 7467PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7468{ 7469 PyUnicode_Append(pleft, right); 7470 Py_XDECREF(right); 7471} 7472 7473PyDoc_STRVAR(count__doc__, 7474 "S.count(sub[, start[, end]]) -> int\n\ 7475\n\ 7476Return the number of non-overlapping occurrences of substring sub in\n\ 7477string S[start:end]. 
Optional arguments start and end are\n\ 7478interpreted as in slice notation."); 7479 7480static PyObject * 7481unicode_count(PyUnicodeObject *self, PyObject *args) 7482{ 7483 PyUnicodeObject *substring; 7484 Py_ssize_t start = 0; 7485 Py_ssize_t end = PY_SSIZE_T_MAX; 7486 PyObject *result; 7487 7488 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 7489 &start, &end)) 7490 return NULL; 7491 7492 ADJUST_INDICES(start, end, self->length); 7493 result = PyLong_FromSsize_t( 7494 stringlib_count(self->str + start, end - start, 7495 substring->str, substring->length, 7496 PY_SSIZE_T_MAX) 7497 ); 7498 7499 Py_DECREF(substring); 7500 7501 return result; 7502} 7503 7504PyDoc_STRVAR(encode__doc__, 7505 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 7506\n\ 7507Encode S using the codec registered for encoding. Default encoding\n\ 7508is 'utf-8'. errors may be given to set a different error\n\ 7509handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7510a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 7511'xmlcharrefreplace' as well as any other name registered with\n\ 7512codecs.register_error that can handle UnicodeEncodeErrors."); 7513 7514static PyObject * 7515unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7516{ 7517 static char *kwlist[] = {"encoding", "errors", 0}; 7518 char *encoding = NULL; 7519 char *errors = NULL; 7520 7521 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7522 kwlist, &encoding, &errors)) 7523 return NULL; 7524 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7525} 7526 7527PyDoc_STRVAR(expandtabs__doc__, 7528 "S.expandtabs([tabsize]) -> str\n\ 7529\n\ 7530Return a copy of S where all tab characters are expanded using spaces.\n\ 7531If tabsize is not given, a tab size of 8 characters is assumed."); 7532 7533static PyObject* 7534unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7535{ 7536 Py_UNICODE *e; 7537 Py_UNICODE *p; 7538 Py_UNICODE *q; 7539 Py_UNICODE *qe; 7540 Py_ssize_t i, j, incr; 7541 PyUnicodeObject *u; 7542 int tabsize = 8; 7543 7544 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7545 return NULL; 7546 7547 /* First pass: determine size of output string */ 7548 i = 0; /* chars up to and including most recent \n or \r */ 7549 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7550 e = self->str + self->length; /* end of input */ 7551 for (p = self->str; p < e; p++) 7552 if (*p == '\t') { 7553 if (tabsize > 0) { 7554 incr = tabsize - (j % tabsize); /* cannot overflow */ 7555 if (j > PY_SSIZE_T_MAX - incr) 7556 goto overflow1; 7557 j += incr; 7558 } 7559 } 7560 else { 7561 if (j > PY_SSIZE_T_MAX - 1) 7562 goto overflow1; 7563 j++; 7564 if (*p == '\n' || *p == '\r') { 7565 if (i > PY_SSIZE_T_MAX - j) 7566 goto overflow1; 7567 i += j; 7568 j = 0; 7569 } 7570 } 7571 7572 if (i > PY_SSIZE_T_MAX - j) 7573 goto overflow1; 7574 7575 /* Second pass: create output string and fill it */ 
7576 u = _PyUnicode_New(i + j); 7577 if (!u) 7578 return NULL; 7579 7580 j = 0; /* same as in first pass */ 7581 q = u->str; /* next output char */ 7582 qe = u->str + u->length; /* end of output */ 7583 7584 for (p = self->str; p < e; p++) 7585 if (*p == '\t') { 7586 if (tabsize > 0) { 7587 i = tabsize - (j % tabsize); 7588 j += i; 7589 while (i--) { 7590 if (q >= qe) 7591 goto overflow2; 7592 *q++ = ' '; 7593 } 7594 } 7595 } 7596 else { 7597 if (q >= qe) 7598 goto overflow2; 7599 *q++ = *p; 7600 j++; 7601 if (*p == '\n' || *p == '\r') 7602 j = 0; 7603 } 7604 7605 return (PyObject*) u; 7606 7607 overflow2: 7608 Py_DECREF(u); 7609 overflow1: 7610 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7611 return NULL; 7612} 7613 7614PyDoc_STRVAR(find__doc__, 7615 "S.find(sub[, start[, end]]) -> int\n\ 7616\n\ 7617Return the lowest index in S where substring sub is found,\n\ 7618such that sub is contained within S[start:end]. Optional\n\ 7619arguments start and end are interpreted as in slice notation.\n\ 7620\n\ 7621Return -1 on failure."); 7622 7623static PyObject * 7624unicode_find(PyUnicodeObject *self, PyObject *args) 7625{ 7626 PyUnicodeObject *substring; 7627 Py_ssize_t start; 7628 Py_ssize_t end; 7629 Py_ssize_t result; 7630 7631 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 7632 &start, &end)) 7633 return NULL; 7634 7635 result = stringlib_find_slice( 7636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7638 start, end 7639 ); 7640 7641 Py_DECREF(substring); 7642 7643 return PyLong_FromSsize_t(result); 7644} 7645 7646static PyObject * 7647unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7648{ 7649 if (index < 0 || index >= self->length) { 7650 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7651 return NULL; 7652 } 7653 7654 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7655} 7656 7657/* Believe it or not, this 
produces the same value for ASCII strings 7658 as string_hash(). */ 7659static Py_hash_t 7660unicode_hash(PyUnicodeObject *self) 7661{ 7662 Py_ssize_t len; 7663 Py_UNICODE *p; 7664 Py_hash_t x; 7665 7666#ifdef Py_DEBUG 7667 assert(_Py_HashSecret_Initialized); 7668#endif 7669 if (self->hash != -1) 7670 return self->hash; 7671 len = Py_SIZE(self); 7672 /* 7673 We make the hash of the empty string be 0, rather than using 7674 (prefix ^ suffix), since this slightly obfuscates the hash secret 7675 */ 7676 if (len == 0) { 7677 self->hash = 0; 7678 return 0; 7679 } 7680 p = self->str; 7681 x = _Py_HashSecret.prefix; 7682 x ^= *p << 7; 7683 while (--len >= 0) 7684 x = (_PyHASH_MULTIPLIER*x) ^ *p++; 7685 x ^= Py_SIZE(self); 7686 x ^= _Py_HashSecret.suffix; 7687 if (x == -1) 7688 x = -2; 7689 self->hash = x; 7690 return x; 7691} 7692 7693PyDoc_STRVAR(index__doc__, 7694 "S.index(sub[, start[, end]]) -> int\n\ 7695\n\ 7696Like S.find() but raise ValueError when the substring is not found."); 7697 7698static PyObject * 7699unicode_index(PyUnicodeObject *self, PyObject *args) 7700{ 7701 Py_ssize_t result; 7702 PyUnicodeObject *substring; 7703 Py_ssize_t start; 7704 Py_ssize_t end; 7705 7706 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 7707 &start, &end)) 7708 return NULL; 7709 7710 result = stringlib_find_slice( 7711 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7712 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7713 start, end 7714 ); 7715 7716 Py_DECREF(substring); 7717 7718 if (result < 0) { 7719 PyErr_SetString(PyExc_ValueError, "substring not found"); 7720 return NULL; 7721 } 7722 7723 return PyLong_FromSsize_t(result); 7724} 7725 7726PyDoc_STRVAR(islower__doc__, 7727 "S.islower() -> bool\n\ 7728\n\ 7729Return True if all cased characters in S are lowercase and there is\n\ 7730at least one cased character in S, False otherwise."); 7731 7732static PyObject* 7733unicode_islower(PyUnicodeObject *self) 7734{ 7735 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7736 register const Py_UNICODE *e; 7737 int cased; 7738 7739 /* Shortcut for single character strings */ 7740 if (PyUnicode_GET_SIZE(self) == 1) 7741 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7742 7743 /* Special case for empty strings */ 7744 if (PyUnicode_GET_SIZE(self) == 0) 7745 return PyBool_FromLong(0); 7746 7747 e = p + PyUnicode_GET_SIZE(self); 7748 cased = 0; 7749 while (p < e) { 7750 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); 7751 7752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7753 return PyBool_FromLong(0); 7754 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7755 cased = 1; 7756 } 7757 return PyBool_FromLong(cased); 7758} 7759 7760PyDoc_STRVAR(isupper__doc__, 7761 "S.isupper() -> bool\n\ 7762\n\ 7763Return True if all cased characters in S are uppercase and there is\n\ 7764at least one cased character in S, False otherwise."); 7765 7766static PyObject* 7767unicode_isupper(PyUnicodeObject *self) 7768{ 7769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7770 register const Py_UNICODE *e; 7771 int cased; 7772 7773 /* Shortcut for single character strings */ 7774 if (PyUnicode_GET_SIZE(self) == 1) 7775 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7776 7777 /* Special case for empty strings */ 7778 if (PyUnicode_GET_SIZE(self) == 0) 7779 return PyBool_FromLong(0); 7780 7781 e = p + PyUnicode_GET_SIZE(self); 7782 cased = 0; 7783 while (p < e) { 7784 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); 7785 7786 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7787 return PyBool_FromLong(0); 7788 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7789 cased = 1; 7790 } 7791 return PyBool_FromLong(cased); 7792} 7793 7794PyDoc_STRVAR(istitle__doc__, 7795 "S.istitle() -> bool\n\ 7796\n\ 7797Return True if S is a titlecased string and there is at least one\n\ 7798character in S, i.e. 
upper- and titlecase characters may only\n\ 7799follow uncased characters and lowercase characters only cased ones.\n\ 7800Return False otherwise."); 7801 7802static PyObject* 7803unicode_istitle(PyUnicodeObject *self) 7804{ 7805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7806 register const Py_UNICODE *e; 7807 int cased, previous_is_cased; 7808 7809 /* Shortcut for single character strings */ 7810 if (PyUnicode_GET_SIZE(self) == 1) 7811 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7812 (Py_UNICODE_ISUPPER(*p) != 0)); 7813 7814 /* Special case for empty strings */ 7815 if (PyUnicode_GET_SIZE(self) == 0) 7816 return PyBool_FromLong(0); 7817 7818 e = p + PyUnicode_GET_SIZE(self); 7819 cased = 0; 7820 previous_is_cased = 0; 7821 while (p < e) { 7822 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); 7823 7824 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7825 if (previous_is_cased) 7826 return PyBool_FromLong(0); 7827 previous_is_cased = 1; 7828 cased = 1; 7829 } 7830 else if (Py_UNICODE_ISLOWER(ch)) { 7831 if (!previous_is_cased) 7832 return PyBool_FromLong(0); 7833 previous_is_cased = 1; 7834 cased = 1; 7835 } 7836 else 7837 previous_is_cased = 0; 7838 } 7839 return PyBool_FromLong(cased); 7840} 7841 7842PyDoc_STRVAR(isspace__doc__, 7843 "S.isspace() -> bool\n\ 7844\n\ 7845Return True if all characters in S are whitespace\n\ 7846and there is at least one character in S, False otherwise."); 7847 7848static PyObject* 7849unicode_isspace(PyUnicodeObject *self) 7850{ 7851 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7852 register const Py_UNICODE *e; 7853 7854 /* Shortcut for single character strings */ 7855 if (PyUnicode_GET_SIZE(self) == 1 && 7856 Py_UNICODE_ISSPACE(*p)) 7857 return PyBool_FromLong(1); 7858 7859 /* Special case for empty strings */ 7860 if (PyUnicode_GET_SIZE(self) == 0) 7861 return PyBool_FromLong(0); 7862 7863 e = p + PyUnicode_GET_SIZE(self); 7864 while (p < e) { 7865 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, 
e); 7866 if (!Py_UNICODE_ISSPACE(ch)) 7867 return PyBool_FromLong(0); 7868 } 7869 return PyBool_FromLong(1); 7870} 7871 7872PyDoc_STRVAR(isalpha__doc__, 7873 "S.isalpha() -> bool\n\ 7874\n\ 7875Return True if all characters in S are alphabetic\n\ 7876and there is at least one character in S, False otherwise."); 7877 7878static PyObject* 7879unicode_isalpha(PyUnicodeObject *self) 7880{ 7881 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7882 register const Py_UNICODE *e; 7883 7884 /* Shortcut for single character strings */ 7885 if (PyUnicode_GET_SIZE(self) == 1 && 7886 Py_UNICODE_ISALPHA(*p)) 7887 return PyBool_FromLong(1); 7888 7889 /* Special case for empty strings */ 7890 if (PyUnicode_GET_SIZE(self) == 0) 7891 return PyBool_FromLong(0); 7892 7893 e = p + PyUnicode_GET_SIZE(self); 7894 while (p < e) { 7895 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e))) 7896 return PyBool_FromLong(0); 7897 } 7898 return PyBool_FromLong(1); 7899} 7900 7901PyDoc_STRVAR(isalnum__doc__, 7902 "S.isalnum() -> bool\n\ 7903\n\ 7904Return True if all characters in S are alphanumeric\n\ 7905and there is at least one character in S, False otherwise."); 7906 7907static PyObject* 7908unicode_isalnum(PyUnicodeObject *self) 7909{ 7910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7911 register const Py_UNICODE *e; 7912 7913 /* Shortcut for single character strings */ 7914 if (PyUnicode_GET_SIZE(self) == 1 && 7915 Py_UNICODE_ISALNUM(*p)) 7916 return PyBool_FromLong(1); 7917 7918 /* Special case for empty strings */ 7919 if (PyUnicode_GET_SIZE(self) == 0) 7920 return PyBool_FromLong(0); 7921 7922 e = p + PyUnicode_GET_SIZE(self); 7923 while (p < e) { 7924 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); 7925 if (!Py_UNICODE_ISALNUM(ch)) 7926 return PyBool_FromLong(0); 7927 } 7928 return PyBool_FromLong(1); 7929} 7930 7931PyDoc_STRVAR(isdecimal__doc__, 7932 "S.isdecimal() -> bool\n\ 7933\n\ 7934Return True if there are only decimal characters in S,\n\ 7935False otherwise."); 
7936 7937static PyObject* 7938unicode_isdecimal(PyUnicodeObject *self) 7939{ 7940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7941 register const Py_UNICODE *e; 7942 7943 /* Shortcut for single character strings */ 7944 if (PyUnicode_GET_SIZE(self) == 1 && 7945 Py_UNICODE_ISDECIMAL(*p)) 7946 return PyBool_FromLong(1); 7947 7948 /* Special case for empty strings */ 7949 if (PyUnicode_GET_SIZE(self) == 0) 7950 return PyBool_FromLong(0); 7951 7952 e = p + PyUnicode_GET_SIZE(self); 7953 while (p < e) { 7954 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e))) 7955 return PyBool_FromLong(0); 7956 } 7957 return PyBool_FromLong(1); 7958} 7959 7960PyDoc_STRVAR(isdigit__doc__, 7961 "S.isdigit() -> bool\n\ 7962\n\ 7963Return True if all characters in S are digits\n\ 7964and there is at least one character in S, False otherwise."); 7965 7966static PyObject* 7967unicode_isdigit(PyUnicodeObject *self) 7968{ 7969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7970 register const Py_UNICODE *e; 7971 7972 /* Shortcut for single character strings */ 7973 if (PyUnicode_GET_SIZE(self) == 1 && 7974 Py_UNICODE_ISDIGIT(*p)) 7975 return PyBool_FromLong(1); 7976 7977 /* Special case for empty strings */ 7978 if (PyUnicode_GET_SIZE(self) == 0) 7979 return PyBool_FromLong(0); 7980 7981 e = p + PyUnicode_GET_SIZE(self); 7982 while (p < e) { 7983 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e))) 7984 return PyBool_FromLong(0); 7985 } 7986 return PyBool_FromLong(1); 7987} 7988 7989PyDoc_STRVAR(isnumeric__doc__, 7990 "S.isnumeric() -> bool\n\ 7991\n\ 7992Return True if there are only numeric characters in S,\n\ 7993False otherwise."); 7994 7995static PyObject* 7996unicode_isnumeric(PyUnicodeObject *self) 7997{ 7998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7999 register const Py_UNICODE *e; 8000 8001 /* Shortcut for single character strings */ 8002 if (PyUnicode_GET_SIZE(self) == 1 && 8003 Py_UNICODE_ISNUMERIC(*p)) 8004 return PyBool_FromLong(1); 8005 
8006 /* Special case for empty strings */ 8007 if (PyUnicode_GET_SIZE(self) == 0) 8008 return PyBool_FromLong(0); 8009 8010 e = p + PyUnicode_GET_SIZE(self); 8011 while (p < e) { 8012 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e))) 8013 return PyBool_FromLong(0); 8014 } 8015 return PyBool_FromLong(1); 8016} 8017 8018int 8019PyUnicode_IsIdentifier(PyObject *self) 8020{ 8021 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 8022 const Py_UNICODE *e; 8023 Py_UCS4 first; 8024 8025 /* Special case for empty strings */ 8026 if (PyUnicode_GET_SIZE(self) == 0) 8027 return 0; 8028 8029 /* PEP 3131 says that the first character must be in 8030 XID_Start and subsequent characters in XID_Continue, 8031 and for the ASCII range, the 2.x rules apply (i.e 8032 start with letters and underscore, continue with 8033 letters, digits, underscore). However, given the current 8034 definition of XID_Start and XID_Continue, it is sufficient 8035 to check just for these, except that _ must be allowed 8036 as starting an identifier. 
*/ 8037 e = p + PyUnicode_GET_SIZE(self); 8038 first = _Py_UNICODE_NEXT(p, e); 8039 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 8040 return 0; 8041 8042 while (p < e) 8043 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e))) 8044 return 0; 8045 return 1; 8046} 8047 8048PyDoc_STRVAR(isidentifier__doc__, 8049 "S.isidentifier() -> bool\n\ 8050\n\ 8051Return True if S is a valid identifier according\n\ 8052to the language definition."); 8053 8054static PyObject* 8055unicode_isidentifier(PyObject *self) 8056{ 8057 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 8058} 8059 8060PyDoc_STRVAR(isprintable__doc__, 8061 "S.isprintable() -> bool\n\ 8062\n\ 8063Return True if all characters in S are considered\n\ 8064printable in repr() or S is empty, False otherwise."); 8065 8066static PyObject* 8067unicode_isprintable(PyObject *self) 8068{ 8069 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 8070 register const Py_UNICODE *e; 8071 8072 /* Shortcut for single character strings */ 8073 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 8074 Py_RETURN_TRUE; 8075 } 8076 8077 e = p + PyUnicode_GET_SIZE(self); 8078 while (p < e) { 8079 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) { 8080 Py_RETURN_FALSE; 8081 } 8082 } 8083 Py_RETURN_TRUE; 8084} 8085 8086PyDoc_STRVAR(join__doc__, 8087 "S.join(iterable) -> str\n\ 8088\n\ 8089Return a string which is the concatenation of the strings in the\n\ 8090iterable. The separator between elements is S."); 8091 8092static PyObject* 8093unicode_join(PyObject *self, PyObject *data) 8094{ 8095 return PyUnicode_Join(self, data); 8096} 8097 8098static Py_ssize_t 8099unicode_length(PyUnicodeObject *self) 8100{ 8101 return self->length; 8102} 8103 8104PyDoc_STRVAR(ljust__doc__, 8105 "S.ljust(width[, fillchar]) -> str\n\ 8106\n\ 8107Return S left-justified in a Unicode string of length width. 
Padding is\n\ 8108done using the specified fill character (default is a space)."); 8109 8110static PyObject * 8111unicode_ljust(PyUnicodeObject *self, PyObject *args) 8112{ 8113 Py_ssize_t width; 8114 Py_UNICODE fillchar = ' '; 8115 8116 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 8117 return NULL; 8118 8119 if (self->length >= width && PyUnicode_CheckExact(self)) { 8120 Py_INCREF(self); 8121 return (PyObject*) self; 8122 } 8123 8124 return (PyObject*) pad(self, 0, width - self->length, fillchar); 8125} 8126 8127PyDoc_STRVAR(lower__doc__, 8128 "S.lower() -> str\n\ 8129\n\ 8130Return a copy of the string S converted to lowercase."); 8131 8132static PyObject* 8133unicode_lower(PyUnicodeObject *self) 8134{ 8135 return fixup(self, fixlower); 8136} 8137 8138#define LEFTSTRIP 0 8139#define RIGHTSTRIP 1 8140#define BOTHSTRIP 2 8141 8142/* Arrays indexed by above */ 8143static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 8144 8145#define STRIPNAME(i) (stripformat[i]+3) 8146 8147/* externally visible for str.strip(unicode) */ 8148PyObject * 8149_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 8150{ 8151 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8152 Py_ssize_t len = PyUnicode_GET_SIZE(self); 8153 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 8154 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 8155 Py_ssize_t i, j; 8156 8157 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 8158 8159 i = 0; 8160 if (striptype != RIGHTSTRIP) { 8161 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 8162 i++; 8163 } 8164 } 8165 8166 j = len; 8167 if (striptype != LEFTSTRIP) { 8168 do { 8169 j--; 8170 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 8171 j++; 8172 } 8173 8174 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8175 Py_INCREF(self); 8176 return (PyObject*)self; 8177 } 8178 else 8179 return PyUnicode_FromUnicode(s+i, j-i); 8180} 8181 8182 8183static PyObject * 
8184do_strip(PyUnicodeObject *self, int striptype) 8185{ 8186 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8187 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 8188 8189 i = 0; 8190 if (striptype != RIGHTSTRIP) { 8191 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 8192 i++; 8193 } 8194 } 8195 8196 j = len; 8197 if (striptype != LEFTSTRIP) { 8198 do { 8199 j--; 8200 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 8201 j++; 8202 } 8203 8204 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8205 Py_INCREF(self); 8206 return (PyObject*)self; 8207 } 8208 else 8209 return PyUnicode_FromUnicode(s+i, j-i); 8210} 8211 8212 8213static PyObject * 8214do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 8215{ 8216 PyObject *sep = NULL; 8217 8218 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 8219 return NULL; 8220 8221 if (sep != NULL && sep != Py_None) { 8222 if (PyUnicode_Check(sep)) 8223 return _PyUnicode_XStrip(self, striptype, sep); 8224 else { 8225 PyErr_Format(PyExc_TypeError, 8226 "%s arg must be None or str", 8227 STRIPNAME(striptype)); 8228 return NULL; 8229 } 8230 } 8231 8232 return do_strip(self, striptype); 8233} 8234 8235 8236PyDoc_STRVAR(strip__doc__, 8237 "S.strip([chars]) -> str\n\ 8238\n\ 8239Return a copy of the string S with leading and trailing\n\ 8240whitespace removed.\n\ 8241If chars is given and not None, remove characters in chars instead."); 8242 8243static PyObject * 8244unicode_strip(PyUnicodeObject *self, PyObject *args) 8245{ 8246 if (PyTuple_GET_SIZE(args) == 0) 8247 return do_strip(self, BOTHSTRIP); /* Common case */ 8248 else 8249 return do_argstrip(self, BOTHSTRIP, args); 8250} 8251 8252 8253PyDoc_STRVAR(lstrip__doc__, 8254 "S.lstrip([chars]) -> str\n\ 8255\n\ 8256Return a copy of the string S with leading whitespace removed.\n\ 8257If chars is given and not None, remove characters in chars instead."); 8258 8259static PyObject * 8260unicode_lstrip(PyUnicodeObject *self, PyObject *args) 8261{ 8262 if 
(PyTuple_GET_SIZE(args) == 0) 8263 return do_strip(self, LEFTSTRIP); /* Common case */ 8264 else 8265 return do_argstrip(self, LEFTSTRIP, args); 8266} 8267 8268 8269PyDoc_STRVAR(rstrip__doc__, 8270 "S.rstrip([chars]) -> str\n\ 8271\n\ 8272Return a copy of the string S with trailing whitespace removed.\n\ 8273If chars is given and not None, remove characters in chars instead."); 8274 8275static PyObject * 8276unicode_rstrip(PyUnicodeObject *self, PyObject *args) 8277{ 8278 if (PyTuple_GET_SIZE(args) == 0) 8279 return do_strip(self, RIGHTSTRIP); /* Common case */ 8280 else 8281 return do_argstrip(self, RIGHTSTRIP, args); 8282} 8283 8284 8285static PyObject* 8286unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 8287{ 8288 PyUnicodeObject *u; 8289 Py_UNICODE *p; 8290 Py_ssize_t nchars; 8291 size_t nbytes; 8292 8293 if (len < 1) { 8294 Py_INCREF(unicode_empty); 8295 return (PyObject *)unicode_empty; 8296 } 8297 8298 if (len == 1 && PyUnicode_CheckExact(str)) { 8299 /* no repeat, return original string */ 8300 Py_INCREF(str); 8301 return (PyObject*) str; 8302 } 8303 8304 /* ensure # of chars needed doesn't overflow int and # of bytes 8305 * needed doesn't overflow size_t 8306 */ 8307 nchars = len * str->length; 8308 if (nchars / len != str->length) { 8309 PyErr_SetString(PyExc_OverflowError, 8310 "repeated string is too long"); 8311 return NULL; 8312 } 8313 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 8314 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 8315 PyErr_SetString(PyExc_OverflowError, 8316 "repeated string is too long"); 8317 return NULL; 8318 } 8319 u = _PyUnicode_New(nchars); 8320 if (!u) 8321 return NULL; 8322 8323 p = u->str; 8324 8325 if (str->length == 1) { 8326 Py_UNICODE_FILL(p, str->str[0], len); 8327 } else { 8328 Py_ssize_t done = str->length; /* number of characters copied this far */ 8329 Py_UNICODE_COPY(p, str->str, str->length); 8330 while (done < nchars) { 8331 Py_ssize_t n = (done <= nchars-done) ? 
done : nchars-done; 8332 Py_UNICODE_COPY(p+done, p, n); 8333 done += n; 8334 } 8335 } 8336 8337 return (PyObject*) u; 8338} 8339 8340PyObject *PyUnicode_Replace(PyObject *obj, 8341 PyObject *subobj, 8342 PyObject *replobj, 8343 Py_ssize_t maxcount) 8344{ 8345 PyObject *self; 8346 PyObject *str1; 8347 PyObject *str2; 8348 PyObject *result; 8349 8350 self = PyUnicode_FromObject(obj); 8351 if (self == NULL) 8352 return NULL; 8353 str1 = PyUnicode_FromObject(subobj); 8354 if (str1 == NULL) { 8355 Py_DECREF(self); 8356 return NULL; 8357 } 8358 str2 = PyUnicode_FromObject(replobj); 8359 if (str2 == NULL) { 8360 Py_DECREF(self); 8361 Py_DECREF(str1); 8362 return NULL; 8363 } 8364 result = replace((PyUnicodeObject *)self, 8365 (PyUnicodeObject *)str1, 8366 (PyUnicodeObject *)str2, 8367 maxcount); 8368 Py_DECREF(self); 8369 Py_DECREF(str1); 8370 Py_DECREF(str2); 8371 return result; 8372} 8373 8374PyDoc_STRVAR(replace__doc__, 8375 "S.replace(old, new[, count]) -> str\n\ 8376\n\ 8377Return a copy of S with all occurrences of substring\n\ 8378old replaced by new. 
If the optional argument count is\n\ 8379given, only the first count occurrences are replaced."); 8380 8381static PyObject* 8382unicode_replace(PyUnicodeObject *self, PyObject *args) 8383{ 8384 PyUnicodeObject *str1; 8385 PyUnicodeObject *str2; 8386 Py_ssize_t maxcount = -1; 8387 PyObject *result; 8388 8389 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8390 return NULL; 8391 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8392 if (str1 == NULL) 8393 return NULL; 8394 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8395 if (str2 == NULL) { 8396 Py_DECREF(str1); 8397 return NULL; 8398 } 8399 8400 result = replace(self, str1, str2, maxcount); 8401 8402 Py_DECREF(str1); 8403 Py_DECREF(str2); 8404 return result; 8405} 8406 8407static 8408PyObject *unicode_repr(PyObject *unicode) 8409{ 8410 PyObject *repr; 8411 Py_UNICODE *p; 8412 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8413 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8414 8415 /* XXX(nnorwitz): rather than over-allocating, it would be 8416 better to choose a different scheme. Perhaps scan the 8417 first N-chars of the string and allocate based on that size. 8418 */ 8419 /* Initial allocation is based on the longest-possible unichr 8420 escape. 8421 8422 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8423 unichr, so in this case it's the longest unichr escape. In 8424 narrow (UTF-16) builds this is five chars per source unichr 8425 since there are two unichrs in the surrogate pair, so in narrow 8426 (UTF-16) builds it's not the longest unichr escape. 8427 8428 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8429 so in the narrow (UTF-16) build case it's the longest unichr 8430 escape. 
8431 */ 8432 8433 repr = PyUnicode_FromUnicode(NULL, 8434 2 /* quotes */ 8435#ifdef Py_UNICODE_WIDE 8436 + 10*size 8437#else 8438 + 6*size 8439#endif 8440 + 1); 8441 if (repr == NULL) 8442 return NULL; 8443 8444 p = PyUnicode_AS_UNICODE(repr); 8445 8446 /* Add quote */ 8447 *p++ = (findchar(s, size, '\'') && 8448 !findchar(s, size, '"')) ? '"' : '\''; 8449 while (size-- > 0) { 8450 Py_UNICODE ch = *s++; 8451 8452 /* Escape quotes and backslashes */ 8453 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8454 *p++ = '\\'; 8455 *p++ = ch; 8456 continue; 8457 } 8458 8459 /* Map special whitespace to '\t', \n', '\r' */ 8460 if (ch == '\t') { 8461 *p++ = '\\'; 8462 *p++ = 't'; 8463 } 8464 else if (ch == '\n') { 8465 *p++ = '\\'; 8466 *p++ = 'n'; 8467 } 8468 else if (ch == '\r') { 8469 *p++ = '\\'; 8470 *p++ = 'r'; 8471 } 8472 8473 /* Map non-printable US ASCII to '\xhh' */ 8474 else if (ch < ' ' || ch == 0x7F) { 8475 *p++ = '\\'; 8476 *p++ = 'x'; 8477 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8478 *p++ = hexdigits[ch & 0x000F]; 8479 } 8480 8481 /* Copy ASCII characters as-is */ 8482 else if (ch < 0x7F) { 8483 *p++ = ch; 8484 } 8485 8486 /* Non-ASCII characters */ 8487 else { 8488 Py_UCS4 ucs = ch; 8489 8490#ifndef Py_UNICODE_WIDE 8491 Py_UNICODE ch2 = 0; 8492 /* Get code point from surrogate pair */ 8493 if (size > 0) { 8494 ch2 = *s; 8495 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8496 && ch2 <= 0xDFFF) { 8497 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8498 + 0x00010000; 8499 s++; 8500 size--; 8501 } 8502 } 8503#endif 8504 /* Map Unicode whitespace and control characters 8505 (categories Z* and C* except ASCII space) 8506 */ 8507 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8508 /* Map 8-bit characters to '\xhh' */ 8509 if (ucs <= 0xff) { 8510 *p++ = '\\'; 8511 *p++ = 'x'; 8512 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8513 *p++ = hexdigits[ch & 0x000F]; 8514 } 8515 /* Map 21-bit characters to '\U00xxxxxx' */ 8516 else if (ucs >= 0x10000) { 8517 *p++ = '\\'; 8518 
*p++ = 'U'; 8519 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8520 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8521 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8522 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8523 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8524 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8525 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8526 *p++ = hexdigits[ucs & 0x0000000F]; 8527 } 8528 /* Map 16-bit characters to '\uxxxx' */ 8529 else { 8530 *p++ = '\\'; 8531 *p++ = 'u'; 8532 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8533 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8534 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8535 *p++ = hexdigits[ucs & 0x000F]; 8536 } 8537 } 8538 /* Copy characters as-is */ 8539 else { 8540 *p++ = ch; 8541#ifndef Py_UNICODE_WIDE 8542 if (ucs >= 0x10000) 8543 *p++ = ch2; 8544#endif 8545 } 8546 } 8547 } 8548 /* Add quote */ 8549 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8550 8551 *p = '\0'; 8552 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8553 return repr; 8554} 8555 8556PyDoc_STRVAR(rfind__doc__, 8557 "S.rfind(sub[, start[, end]]) -> int\n\ 8558\n\ 8559Return the highest index in S where substring sub is found,\n\ 8560such that sub is contained within S[start:end]. 
Optional\n\ 8561arguments start and end are interpreted as in slice notation.\n\ 8562\n\ 8563Return -1 on failure."); 8564 8565static PyObject * 8566unicode_rfind(PyUnicodeObject *self, PyObject *args) 8567{ 8568 PyUnicodeObject *substring; 8569 Py_ssize_t start; 8570 Py_ssize_t end; 8571 Py_ssize_t result; 8572 8573 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 8574 &start, &end)) 8575 return NULL; 8576 8577 result = stringlib_rfind_slice( 8578 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8579 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8580 start, end 8581 ); 8582 8583 Py_DECREF(substring); 8584 8585 return PyLong_FromSsize_t(result); 8586} 8587 8588PyDoc_STRVAR(rindex__doc__, 8589 "S.rindex(sub[, start[, end]]) -> int\n\ 8590\n\ 8591Like S.rfind() but raise ValueError when the substring is not found."); 8592 8593static PyObject * 8594unicode_rindex(PyUnicodeObject *self, PyObject *args) 8595{ 8596 PyUnicodeObject *substring; 8597 Py_ssize_t start; 8598 Py_ssize_t end; 8599 Py_ssize_t result; 8600 8601 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 8602 &start, &end)) 8603 return NULL; 8604 8605 result = stringlib_rfind_slice( 8606 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8607 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8608 start, end 8609 ); 8610 8611 Py_DECREF(substring); 8612 8613 if (result < 0) { 8614 PyErr_SetString(PyExc_ValueError, "substring not found"); 8615 return NULL; 8616 } 8617 return PyLong_FromSsize_t(result); 8618} 8619 8620PyDoc_STRVAR(rjust__doc__, 8621 "S.rjust(width[, fillchar]) -> str\n\ 8622\n\ 8623Return S right-justified in a string of length width. 
Padding is\n\ 8624done using the specified fill character (default is a space)."); 8625 8626static PyObject * 8627unicode_rjust(PyUnicodeObject *self, PyObject *args) 8628{ 8629 Py_ssize_t width; 8630 Py_UNICODE fillchar = ' '; 8631 8632 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8633 return NULL; 8634 8635 if (self->length >= width && PyUnicode_CheckExact(self)) { 8636 Py_INCREF(self); 8637 return (PyObject*) self; 8638 } 8639 8640 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8641} 8642 8643PyObject *PyUnicode_Split(PyObject *s, 8644 PyObject *sep, 8645 Py_ssize_t maxsplit) 8646{ 8647 PyObject *result; 8648 8649 s = PyUnicode_FromObject(s); 8650 if (s == NULL) 8651 return NULL; 8652 if (sep != NULL) { 8653 sep = PyUnicode_FromObject(sep); 8654 if (sep == NULL) { 8655 Py_DECREF(s); 8656 return NULL; 8657 } 8658 } 8659 8660 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8661 8662 Py_DECREF(s); 8663 Py_XDECREF(sep); 8664 return result; 8665} 8666 8667PyDoc_STRVAR(split__doc__, 8668 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8669\n\ 8670Return a list of the words in S, using sep as the\n\ 8671delimiter string. If maxsplit is given, at most maxsplit\n\ 8672splits are done. 
If sep is not specified or is None, any\n\ 8673whitespace string is a separator and empty strings are\n\ 8674removed from the result."); 8675 8676static PyObject* 8677unicode_split(PyUnicodeObject *self, PyObject *args) 8678{ 8679 PyObject *substring = Py_None; 8680 Py_ssize_t maxcount = -1; 8681 8682 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8683 return NULL; 8684 8685 if (substring == Py_None) 8686 return split(self, NULL, maxcount); 8687 else if (PyUnicode_Check(substring)) 8688 return split(self, (PyUnicodeObject *)substring, maxcount); 8689 else 8690 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8691} 8692 8693PyObject * 8694PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8695{ 8696 PyObject* str_obj; 8697 PyObject* sep_obj; 8698 PyObject* out; 8699 8700 str_obj = PyUnicode_FromObject(str_in); 8701 if (!str_obj) 8702 return NULL; 8703 sep_obj = PyUnicode_FromObject(sep_in); 8704 if (!sep_obj) { 8705 Py_DECREF(str_obj); 8706 return NULL; 8707 } 8708 8709 out = stringlib_partition( 8710 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8711 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8712 ); 8713 8714 Py_DECREF(sep_obj); 8715 Py_DECREF(str_obj); 8716 8717 return out; 8718} 8719 8720 8721PyObject * 8722PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8723{ 8724 PyObject* str_obj; 8725 PyObject* sep_obj; 8726 PyObject* out; 8727 8728 str_obj = PyUnicode_FromObject(str_in); 8729 if (!str_obj) 8730 return NULL; 8731 sep_obj = PyUnicode_FromObject(sep_in); 8732 if (!sep_obj) { 8733 Py_DECREF(str_obj); 8734 return NULL; 8735 } 8736 8737 out = stringlib_rpartition( 8738 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8739 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8740 ); 8741 8742 Py_DECREF(sep_obj); 8743 Py_DECREF(str_obj); 8744 8745 return out; 8746} 8747 8748PyDoc_STRVAR(partition__doc__, 8749 "S.partition(sep) -> (head, 
sep, tail)\n\ 8750\n\ 8751Search for the separator sep in S, and return the part before it,\n\ 8752the separator itself, and the part after it. If the separator is not\n\ 8753found, return S and two empty strings."); 8754 8755static PyObject* 8756unicode_partition(PyUnicodeObject *self, PyObject *separator) 8757{ 8758 return PyUnicode_Partition((PyObject *)self, separator); 8759} 8760 8761PyDoc_STRVAR(rpartition__doc__, 8762 "S.rpartition(sep) -> (head, sep, tail)\n\ 8763\n\ 8764Search for the separator sep in S, starting at the end of S, and return\n\ 8765the part before it, the separator itself, and the part after it. If the\n\ 8766separator is not found, return two empty strings and S."); 8767 8768static PyObject* 8769unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8770{ 8771 return PyUnicode_RPartition((PyObject *)self, separator); 8772} 8773 8774PyObject *PyUnicode_RSplit(PyObject *s, 8775 PyObject *sep, 8776 Py_ssize_t maxsplit) 8777{ 8778 PyObject *result; 8779 8780 s = PyUnicode_FromObject(s); 8781 if (s == NULL) 8782 return NULL; 8783 if (sep != NULL) { 8784 sep = PyUnicode_FromObject(sep); 8785 if (sep == NULL) { 8786 Py_DECREF(s); 8787 return NULL; 8788 } 8789 } 8790 8791 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8792 8793 Py_DECREF(s); 8794 Py_XDECREF(sep); 8795 return result; 8796} 8797 8798PyDoc_STRVAR(rsplit__doc__, 8799 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8800\n\ 8801Return a list of the words in S, using sep as the\n\ 8802delimiter string, starting at the end of the string and\n\ 8803working to the front. If maxsplit is given, at most maxsplit\n\ 8804splits are done. 
If sep is not specified, any whitespace string\n\ 8805is a separator."); 8806 8807static PyObject* 8808unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8809{ 8810 PyObject *substring = Py_None; 8811 Py_ssize_t maxcount = -1; 8812 8813 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8814 return NULL; 8815 8816 if (substring == Py_None) 8817 return rsplit(self, NULL, maxcount); 8818 else if (PyUnicode_Check(substring)) 8819 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8820 else 8821 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8822} 8823 8824PyDoc_STRVAR(splitlines__doc__, 8825 "S.splitlines([keepends]) -> list of strings\n\ 8826\n\ 8827Return a list of the lines in S, breaking at line boundaries.\n\ 8828Line breaks are not included in the resulting list unless keepends\n\ 8829is given and true."); 8830 8831static PyObject* 8832unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8833{ 8834 int keepends = 0; 8835 8836 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8837 return NULL; 8838 8839 return PyUnicode_Splitlines((PyObject *)self, keepends); 8840} 8841 8842static 8843PyObject *unicode_str(PyObject *self) 8844{ 8845 if (PyUnicode_CheckExact(self)) { 8846 Py_INCREF(self); 8847 return self; 8848 } else 8849 /* Subtype -- return genuine unicode string with the same value. 
*/ 8850 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8851 PyUnicode_GET_SIZE(self)); 8852} 8853 8854PyDoc_STRVAR(swapcase__doc__, 8855 "S.swapcase() -> str\n\ 8856\n\ 8857Return a copy of S with uppercase characters converted to lowercase\n\ 8858and vice versa."); 8859 8860static PyObject* 8861unicode_swapcase(PyUnicodeObject *self) 8862{ 8863 return fixup(self, fixswapcase); 8864} 8865 8866PyDoc_STRVAR(maketrans__doc__, 8867 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8868\n\ 8869Return a translation table usable for str.translate().\n\ 8870If there is only one argument, it must be a dictionary mapping Unicode\n\ 8871ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8872Character keys will be then converted to ordinals.\n\ 8873If there are two arguments, they must be strings of equal length, and\n\ 8874in the resulting dictionary, each character in x will be mapped to the\n\ 8875character at the same position in y. If there is a third argument, it\n\ 8876must be a string, whose characters will be mapped to None in the result."); 8877 8878static PyObject* 8879unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8880{ 8881 PyObject *x, *y = NULL, *z = NULL; 8882 PyObject *new = NULL, *key, *value; 8883 Py_ssize_t i = 0; 8884 int res; 8885 8886 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8887 return NULL; 8888 new = PyDict_New(); 8889 if (!new) 8890 return NULL; 8891 if (y != NULL) { 8892 /* x must be a string too, of equal length */ 8893 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8894 if (!PyUnicode_Check(x)) { 8895 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8896 "be a string if there is a second argument"); 8897 goto err; 8898 } 8899 if (PyUnicode_GET_SIZE(x) != ylen) { 8900 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8901 "arguments must have equal length"); 8902 goto err; 8903 } 8904 /* create entries for translating chars in x to those in y */ 8905 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 8906 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8907 if (!key) 8908 goto err; 8909 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8910 if (!value) { 8911 Py_DECREF(key); 8912 goto err; 8913 } 8914 res = PyDict_SetItem(new, key, value); 8915 Py_DECREF(key); 8916 Py_DECREF(value); 8917 if (res < 0) 8918 goto err; 8919 } 8920 /* create entries for deleting chars in z */ 8921 if (z != NULL) { 8922 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8923 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8924 if (!key) 8925 goto err; 8926 res = PyDict_SetItem(new, key, Py_None); 8927 Py_DECREF(key); 8928 if (res < 0) 8929 goto err; 8930 } 8931 } 8932 } else { 8933 /* x must be a dict */ 8934 if (!PyDict_CheckExact(x)) { 8935 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8936 "to maketrans it must be a dict"); 8937 goto err; 8938 } 8939 /* copy entries into the new dict, converting string keys to int keys */ 8940 while (PyDict_Next(x, &i, &key, &value)) { 8941 if (PyUnicode_Check(key)) { 8942 /* convert string keys to integer keys */ 8943 PyObject *newkey; 8944 if (PyUnicode_GET_SIZE(key) != 1) { 8945 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8946 "table must be of length 1"); 8947 goto err; 8948 } 8949 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8950 if (!newkey) 8951 goto err; 8952 res = PyDict_SetItem(new, newkey, value); 8953 Py_DECREF(newkey); 8954 if (res < 0) 8955 goto err; 8956 } else if (PyLong_Check(key)) { 8957 /* just keep integer keys */ 8958 if (PyDict_SetItem(new, key, value) < 0) 8959 goto err; 8960 } else { 8961 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8962 "be strings or integers"); 8963 goto err; 8964 } 8965 } 8966 } 8967 return new; 8968 err: 8969 Py_DECREF(new); 8970 return NULL; 8971} 8972 8973PyDoc_STRVAR(translate__doc__, 8974 "S.translate(table) -> str\n\ 8975\n\ 8976Return a copy of the string S, where all characters have been 
mapped\n\ 8977through the given translation table, which must be a mapping of\n\ 8978Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8979Unmapped characters are left untouched. Characters mapped to None\n\ 8980are deleted."); 8981 8982static PyObject* 8983unicode_translate(PyUnicodeObject *self, PyObject *table) 8984{ 8985 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8986} 8987 8988PyDoc_STRVAR(upper__doc__, 8989 "S.upper() -> str\n\ 8990\n\ 8991Return a copy of S converted to uppercase."); 8992 8993static PyObject* 8994unicode_upper(PyUnicodeObject *self) 8995{ 8996 return fixup(self, fixupper); 8997} 8998 8999PyDoc_STRVAR(zfill__doc__, 9000 "S.zfill(width) -> str\n\ 9001\n\ 9002Pad a numeric string S with zeros on the left, to fill a field\n\ 9003of the specified width. The string S is never truncated."); 9004 9005static PyObject * 9006unicode_zfill(PyUnicodeObject *self, PyObject *args) 9007{ 9008 Py_ssize_t fill; 9009 PyUnicodeObject *u; 9010 9011 Py_ssize_t width; 9012 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 9013 return NULL; 9014 9015 if (self->length >= width) { 9016 if (PyUnicode_CheckExact(self)) { 9017 Py_INCREF(self); 9018 return (PyObject*) self; 9019 } 9020 else 9021 return PyUnicode_FromUnicode( 9022 PyUnicode_AS_UNICODE(self), 9023 PyUnicode_GET_SIZE(self) 9024 ); 9025 } 9026 9027 fill = width - self->length; 9028 9029 u = pad(self, fill, 0, '0'); 9030 9031 if (u == NULL) 9032 return NULL; 9033 9034 if (u->str[fill] == '+' || u->str[fill] == '-') { 9035 /* move sign to beginning of string */ 9036 u->str[0] = u->str[fill]; 9037 u->str[fill] = '0'; 9038 } 9039 9040 return (PyObject*) u; 9041} 9042 9043#if 0 9044static PyObject* 9045unicode_freelistsize(PyUnicodeObject *self) 9046{ 9047 return PyLong_FromLong(numfree); 9048} 9049 9050static PyObject * 9051unicode__decimal2ascii(PyObject *self) 9052{ 9053 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), 9054 PyUnicode_GET_SIZE(self)); 
9055} 9056#endif 9057 9058PyDoc_STRVAR(startswith__doc__, 9059 "S.startswith(prefix[, start[, end]]) -> bool\n\ 9060\n\ 9061Return True if S starts with the specified prefix, False otherwise.\n\ 9062With optional start, test S beginning at that position.\n\ 9063With optional end, stop comparing S at that position.\n\ 9064prefix can also be a tuple of strings to try."); 9065 9066static PyObject * 9067unicode_startswith(PyUnicodeObject *self, 9068 PyObject *args) 9069{ 9070 PyObject *subobj; 9071 PyUnicodeObject *substring; 9072 Py_ssize_t start = 0; 9073 Py_ssize_t end = PY_SSIZE_T_MAX; 9074 int result; 9075 9076 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 9077 return NULL; 9078 if (PyTuple_Check(subobj)) { 9079 Py_ssize_t i; 9080 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9081 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9082 PyTuple_GET_ITEM(subobj, i)); 9083 if (substring == NULL) 9084 return NULL; 9085 result = tailmatch(self, substring, start, end, -1); 9086 Py_DECREF(substring); 9087 if (result) { 9088 Py_RETURN_TRUE; 9089 } 9090 } 9091 /* nothing matched */ 9092 Py_RETURN_FALSE; 9093 } 9094 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9095 if (substring == NULL) { 9096 if (PyErr_ExceptionMatches(PyExc_TypeError)) 9097 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 9098 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 9099 return NULL; 9100 } 9101 result = tailmatch(self, substring, start, end, -1); 9102 Py_DECREF(substring); 9103 return PyBool_FromLong(result); 9104} 9105 9106 9107PyDoc_STRVAR(endswith__doc__, 9108 "S.endswith(suffix[, start[, end]]) -> bool\n\ 9109\n\ 9110Return True if S ends with the specified suffix, False otherwise.\n\ 9111With optional start, test S beginning at that position.\n\ 9112With optional end, stop comparing S at that position.\n\ 9113suffix can also be a tuple of strings to try."); 9114 9115static PyObject * 
9116unicode_endswith(PyUnicodeObject *self, 9117 PyObject *args) 9118{ 9119 PyObject *subobj; 9120 PyUnicodeObject *substring; 9121 Py_ssize_t start = 0; 9122 Py_ssize_t end = PY_SSIZE_T_MAX; 9123 int result; 9124 9125 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 9126 return NULL; 9127 if (PyTuple_Check(subobj)) { 9128 Py_ssize_t i; 9129 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 9130 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9131 PyTuple_GET_ITEM(subobj, i)); 9132 if (substring == NULL) 9133 return NULL; 9134 result = tailmatch(self, substring, start, end, +1); 9135 Py_DECREF(substring); 9136 if (result) { 9137 Py_RETURN_TRUE; 9138 } 9139 } 9140 Py_RETURN_FALSE; 9141 } 9142 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9143 if (substring == NULL) { 9144 if (PyErr_ExceptionMatches(PyExc_TypeError)) 9145 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 9146 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 9147 return NULL; 9148 } 9149 result = tailmatch(self, substring, start, end, +1); 9150 Py_DECREF(substring); 9151 return PyBool_FromLong(result); 9152} 9153 9154#include "stringlib/string_format.h" 9155 9156PyDoc_STRVAR(format__doc__, 9157 "S.format(*args, **kwargs) -> str\n\ 9158\n\ 9159Return a formatted version of S, using substitutions from args and kwargs.\n\ 9160The substitutions are identified by braces ('{' and '}')."); 9161 9162PyDoc_STRVAR(format_map__doc__, 9163 "S.format_map(mapping) -> str\n\ 9164\n\ 9165Return a formatted version of S, using substitutions from mapping.\n\ 9166The substitutions are identified by braces ('{' and '}')."); 9167 9168static PyObject * 9169unicode__format__(PyObject* self, PyObject* args) 9170{ 9171 PyObject *format_spec; 9172 9173 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 9174 return NULL; 9175 9176 return _PyUnicode_FormatAdvanced(self, 9177 PyUnicode_AS_UNICODE(format_spec), 9178 PyUnicode_GET_SIZE(format_spec)); 9179} 
9180 9181PyDoc_STRVAR(p_format__doc__, 9182 "S.__format__(format_spec) -> str\n\ 9183\n\ 9184Return a formatted version of S as described by format_spec."); 9185 9186static PyObject * 9187unicode__sizeof__(PyUnicodeObject *v) 9188{ 9189 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 9190 sizeof(Py_UNICODE) * (v->length + 1)); 9191} 9192 9193PyDoc_STRVAR(sizeof__doc__, 9194 "S.__sizeof__() -> size of S in memory, in bytes"); 9195 9196static PyObject * 9197unicode_getnewargs(PyUnicodeObject *v) 9198{ 9199 return Py_BuildValue("(u#)", v->str, v->length); 9200} 9201 9202static PyMethodDef unicode_methods[] = { 9203 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 9204 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 9205 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 9206 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 9207 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 9208 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 9209 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 9210 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 9211 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 9212 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 9213 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 9214 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 9215 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 9216 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 9217 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 9218 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 9219 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 9220 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 9221 
{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 9222 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 9223 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 9224 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 9225 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 9226 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 9227 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 9228 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 9229 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 9230 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 9231 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 9232 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 9233 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 9234 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 9235 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 9236 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 9237 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 9238 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 9239 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 9240 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 9241 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 9242 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 9243 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 9244 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 9245 {"__format__", (PyCFunction) unicode__format__, 
METH_VARARGS, p_format__doc__}, 9246 {"maketrans", (PyCFunction) unicode_maketrans, 9247 METH_VARARGS | METH_STATIC, maketrans__doc__}, 9248 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 9249#if 0 9250 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 9251#endif 9252 9253#if 0 9254 /* These methods are just used for debugging the implementation. */ 9255 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 9256 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 9257#endif 9258 9259 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 9260 {NULL, NULL} 9261}; 9262 9263static PyObject * 9264unicode_mod(PyObject *v, PyObject *w) 9265{ 9266 if (!PyUnicode_Check(v)) { 9267 Py_INCREF(Py_NotImplemented); 9268 return Py_NotImplemented; 9269 } 9270 return PyUnicode_Format(v, w); 9271} 9272 9273static PyNumberMethods unicode_as_number = { 9274 0, /*nb_add*/ 9275 0, /*nb_subtract*/ 9276 0, /*nb_multiply*/ 9277 unicode_mod, /*nb_remainder*/ 9278}; 9279 9280static PySequenceMethods unicode_as_sequence = { 9281 (lenfunc) unicode_length, /* sq_length */ 9282 PyUnicode_Concat, /* sq_concat */ 9283 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 9284 (ssizeargfunc) unicode_getitem, /* sq_item */ 9285 0, /* sq_slice */ 9286 0, /* sq_ass_item */ 9287 0, /* sq_ass_slice */ 9288 PyUnicode_Contains, /* sq_contains */ 9289}; 9290 9291static PyObject* 9292unicode_subscript(PyUnicodeObject* self, PyObject* item) 9293{ 9294 if (PyIndex_Check(item)) { 9295 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 9296 if (i == -1 && PyErr_Occurred()) 9297 return NULL; 9298 if (i < 0) 9299 i += PyUnicode_GET_SIZE(self); 9300 return unicode_getitem(self, i); 9301 } else if (PySlice_Check(item)) { 9302 Py_ssize_t start, stop, step, slicelength, cur, i; 9303 Py_UNICODE* source_buf; 9304 Py_UNICODE* result_buf; 9305 PyObject* result; 9306 9307 if (PySlice_GetIndicesEx(item, 
PyUnicode_GET_SIZE(self), 9308 &start, &stop, &step, &slicelength) < 0) { 9309 return NULL; 9310 } 9311 9312 if (slicelength <= 0) { 9313 return PyUnicode_FromUnicode(NULL, 0); 9314 } else if (start == 0 && step == 1 && slicelength == self->length && 9315 PyUnicode_CheckExact(self)) { 9316 Py_INCREF(self); 9317 return (PyObject *)self; 9318 } else if (step == 1) { 9319 return PyUnicode_FromUnicode(self->str + start, slicelength); 9320 } else { 9321 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 9322 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 9323 sizeof(Py_UNICODE)); 9324 9325 if (result_buf == NULL) 9326 return PyErr_NoMemory(); 9327 9328 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 9329 result_buf[i] = source_buf[cur]; 9330 } 9331 9332 result = PyUnicode_FromUnicode(result_buf, slicelength); 9333 PyObject_FREE(result_buf); 9334 return result; 9335 } 9336 } else { 9337 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 9338 return NULL; 9339 } 9340} 9341 9342static PyMappingMethods unicode_as_mapping = { 9343 (lenfunc)unicode_length, /* mp_length */ 9344 (binaryfunc)unicode_subscript, /* mp_subscript */ 9345 (objobjargproc)0, /* mp_ass_subscript */ 9346}; 9347 9348 9349/* Helpers for PyUnicode_Format() */ 9350 9351static PyObject * 9352getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 9353{ 9354 Py_ssize_t argidx = *p_argidx; 9355 if (argidx < arglen) { 9356 (*p_argidx)++; 9357 if (arglen < 0) 9358 return args; 9359 else 9360 return PyTuple_GetItem(args, argidx); 9361 } 9362 PyErr_SetString(PyExc_TypeError, 9363 "not enough arguments for format string"); 9364 return NULL; 9365} 9366 9367/* Returns a new reference to a PyUnicode object, or NULL on failure. 
*/ 9368 9369static PyObject * 9370formatfloat(PyObject *v, int flags, int prec, int type) 9371{ 9372 char *p; 9373 PyObject *result; 9374 double x; 9375 9376 x = PyFloat_AsDouble(v); 9377 if (x == -1.0 && PyErr_Occurred()) 9378 return NULL; 9379 9380 if (prec < 0) 9381 prec = 6; 9382 9383 p = PyOS_double_to_string(x, type, prec, 9384 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 9385 if (p == NULL) 9386 return NULL; 9387 result = PyUnicode_FromStringAndSize(p, strlen(p)); 9388 PyMem_Free(p); 9389 return result; 9390} 9391 9392static PyObject* 9393formatlong(PyObject *val, int flags, int prec, int type) 9394{ 9395 char *buf; 9396 int len; 9397 PyObject *str; /* temporary string object. */ 9398 PyObject *result; 9399 9400 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 9401 if (!str) 9402 return NULL; 9403 result = PyUnicode_FromStringAndSize(buf, len); 9404 Py_DECREF(str); 9405 return result; 9406} 9407 9408static int 9409formatchar(Py_UNICODE *buf, 9410 size_t buflen, 9411 PyObject *v) 9412{ 9413 /* presume that the buffer is at least 3 characters long */ 9414 if (PyUnicode_Check(v)) { 9415 if (PyUnicode_GET_SIZE(v) == 1) { 9416 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9417 buf[1] = '\0'; 9418 return 1; 9419 } 9420#ifndef Py_UNICODE_WIDE 9421 if (PyUnicode_GET_SIZE(v) == 2) { 9422 /* Decode a valid surrogate pair */ 9423 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9424 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9425 if (0xD800 <= c0 && c0 <= 0xDBFF && 9426 0xDC00 <= c1 && c1 <= 0xDFFF) { 9427 buf[0] = c0; 9428 buf[1] = c1; 9429 buf[2] = '\0'; 9430 return 2; 9431 } 9432 } 9433#endif 9434 goto onError; 9435 } 9436 else { 9437 /* Integer input truncated to a character */ 9438 long x; 9439 x = PyLong_AsLong(v); 9440 if (x == -1 && PyErr_Occurred()) 9441 goto onError; 9442 9443 if (x < 0 || x > 0x10ffff) { 9444 PyErr_SetString(PyExc_OverflowError, 9445 "%c arg not in range(0x110000)"); 9446 return -1; 9447 } 9448 9449#ifndef Py_UNICODE_WIDE 9450 if (x > 0xffff) { 9451 x -= 
0x10000; 9452 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9453 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9454 return 2; 9455 } 9456#endif 9457 buf[0] = (Py_UNICODE) x; 9458 buf[1] = '\0'; 9459 return 1; 9460 } 9461 9462 onError: 9463 PyErr_SetString(PyExc_TypeError, 9464 "%c requires int or char"); 9465 return -1; 9466} 9467 9468/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 9469 FORMATBUFLEN is the length of the buffer in which chars are formatted. 9470*/ 9471#define FORMATBUFLEN (size_t)10 9472 9473PyObject *PyUnicode_Format(PyObject *format, 9474 PyObject *args) 9475{ 9476 Py_UNICODE *fmt, *res; 9477 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9478 int args_owned = 0; 9479 PyUnicodeObject *result = NULL; 9480 PyObject *dict = NULL; 9481 PyObject *uformat; 9482 9483 if (format == NULL || args == NULL) { 9484 PyErr_BadInternalCall(); 9485 return NULL; 9486 } 9487 uformat = PyUnicode_FromObject(format); 9488 if (uformat == NULL) 9489 return NULL; 9490 fmt = PyUnicode_AS_UNICODE(uformat); 9491 fmtcnt = PyUnicode_GET_SIZE(uformat); 9492 9493 reslen = rescnt = fmtcnt + 100; 9494 result = _PyUnicode_New(reslen); 9495 if (result == NULL) 9496 goto onError; 9497 res = PyUnicode_AS_UNICODE(result); 9498 9499 if (PyTuple_Check(args)) { 9500 arglen = PyTuple_Size(args); 9501 argidx = 0; 9502 } 9503 else { 9504 arglen = -1; 9505 argidx = -2; 9506 } 9507 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 9508 dict = args; 9509 9510 while (--fmtcnt >= 0) { 9511 if (*fmt != '%') { 9512 if (--rescnt < 0) { 9513 rescnt = fmtcnt + 100; 9514 reslen += rescnt; 9515 if (_PyUnicode_Resize(&result, reslen) < 0) 9516 goto onError; 9517 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9518 --rescnt; 9519 } 9520 *res++ = *fmt++; 9521 } 9522 else { 9523 /* Got a format specifier */ 9524 int flags = 0; 9525 Py_ssize_t width = -1; 9526 int prec = -1; 9527 Py_UNICODE c = '\0'; 9528 Py_UNICODE fill; 9529 int isnumok; 9530 PyObject 
*v = NULL; 9531 PyObject *temp = NULL; 9532 Py_UNICODE *pbuf; 9533 Py_UNICODE sign; 9534 Py_ssize_t len; 9535 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9536 9537 fmt++; 9538 if (*fmt == '(') { 9539 Py_UNICODE *keystart; 9540 Py_ssize_t keylen; 9541 PyObject *key; 9542 int pcount = 1; 9543 9544 if (dict == NULL) { 9545 PyErr_SetString(PyExc_TypeError, 9546 "format requires a mapping"); 9547 goto onError; 9548 } 9549 ++fmt; 9550 --fmtcnt; 9551 keystart = fmt; 9552 /* Skip over balanced parentheses */ 9553 while (pcount > 0 && --fmtcnt >= 0) { 9554 if (*fmt == ')') 9555 --pcount; 9556 else if (*fmt == '(') 9557 ++pcount; 9558 fmt++; 9559 } 9560 keylen = fmt - keystart - 1; 9561 if (fmtcnt < 0 || pcount > 0) { 9562 PyErr_SetString(PyExc_ValueError, 9563 "incomplete format key"); 9564 goto onError; 9565 } 9566#if 0 9567 /* keys are converted to strings using UTF-8 and 9568 then looked up since Python uses strings to hold 9569 variables names etc. in its namespaces and we 9570 wouldn't want to break common idioms. 
*/ 9571 key = PyUnicode_EncodeUTF8(keystart, 9572 keylen, 9573 NULL); 9574#else 9575 key = PyUnicode_FromUnicode(keystart, keylen); 9576#endif 9577 if (key == NULL) 9578 goto onError; 9579 if (args_owned) { 9580 Py_DECREF(args); 9581 args_owned = 0; 9582 } 9583 args = PyObject_GetItem(dict, key); 9584 Py_DECREF(key); 9585 if (args == NULL) { 9586 goto onError; 9587 } 9588 args_owned = 1; 9589 arglen = -1; 9590 argidx = -2; 9591 } 9592 while (--fmtcnt >= 0) { 9593 switch (c = *fmt++) { 9594 case '-': flags |= F_LJUST; continue; 9595 case '+': flags |= F_SIGN; continue; 9596 case ' ': flags |= F_BLANK; continue; 9597 case '#': flags |= F_ALT; continue; 9598 case '0': flags |= F_ZERO; continue; 9599 } 9600 break; 9601 } 9602 if (c == '*') { 9603 v = getnextarg(args, arglen, &argidx); 9604 if (v == NULL) 9605 goto onError; 9606 if (!PyLong_Check(v)) { 9607 PyErr_SetString(PyExc_TypeError, 9608 "* wants int"); 9609 goto onError; 9610 } 9611 width = PyLong_AsLong(v); 9612 if (width == -1 && PyErr_Occurred()) 9613 goto onError; 9614 if (width < 0) { 9615 flags |= F_LJUST; 9616 width = -width; 9617 } 9618 if (--fmtcnt >= 0) 9619 c = *fmt++; 9620 } 9621 else if (c >= '0' && c <= '9') { 9622 width = c - '0'; 9623 while (--fmtcnt >= 0) { 9624 c = *fmt++; 9625 if (c < '0' || c > '9') 9626 break; 9627 if ((width*10) / 10 != width) { 9628 PyErr_SetString(PyExc_ValueError, 9629 "width too big"); 9630 goto onError; 9631 } 9632 width = width*10 + (c - '0'); 9633 } 9634 } 9635 if (c == '.') { 9636 prec = 0; 9637 if (--fmtcnt >= 0) 9638 c = *fmt++; 9639 if (c == '*') { 9640 v = getnextarg(args, arglen, &argidx); 9641 if (v == NULL) 9642 goto onError; 9643 if (!PyLong_Check(v)) { 9644 PyErr_SetString(PyExc_TypeError, 9645 "* wants int"); 9646 goto onError; 9647 } 9648 prec = PyLong_AsLong(v); 9649 if (prec == -1 && PyErr_Occurred()) 9650 goto onError; 9651 if (prec < 0) 9652 prec = 0; 9653 if (--fmtcnt >= 0) 9654 c = *fmt++; 9655 } 9656 else if (c >= '0' && c <= '9') { 9657 prec = c - 
'0'; 9658 while (--fmtcnt >= 0) { 9659 c = *fmt++; 9660 if (c < '0' || c > '9') 9661 break; 9662 if ((prec*10) / 10 != prec) { 9663 PyErr_SetString(PyExc_ValueError, 9664 "prec too big"); 9665 goto onError; 9666 } 9667 prec = prec*10 + (c - '0'); 9668 } 9669 } 9670 } /* prec */ 9671 if (fmtcnt >= 0) { 9672 if (c == 'h' || c == 'l' || c == 'L') { 9673 if (--fmtcnt >= 0) 9674 c = *fmt++; 9675 } 9676 } 9677 if (fmtcnt < 0) { 9678 PyErr_SetString(PyExc_ValueError, 9679 "incomplete format"); 9680 goto onError; 9681 } 9682 if (c != '%') { 9683 v = getnextarg(args, arglen, &argidx); 9684 if (v == NULL) 9685 goto onError; 9686 } 9687 sign = 0; 9688 fill = ' '; 9689 switch (c) { 9690 9691 case '%': 9692 pbuf = formatbuf; 9693 /* presume that buffer length is at least 1 */ 9694 pbuf[0] = '%'; 9695 len = 1; 9696 break; 9697 9698 case 's': 9699 case 'r': 9700 case 'a': 9701 if (PyUnicode_CheckExact(v) && c == 's') { 9702 temp = v; 9703 Py_INCREF(temp); 9704 } 9705 else { 9706 if (c == 's') 9707 temp = PyObject_Str(v); 9708 else if (c == 'r') 9709 temp = PyObject_Repr(v); 9710 else 9711 temp = PyObject_ASCII(v); 9712 if (temp == NULL) 9713 goto onError; 9714 if (PyUnicode_Check(temp)) 9715 /* nothing to do */; 9716 else { 9717 Py_DECREF(temp); 9718 PyErr_SetString(PyExc_TypeError, 9719 "%s argument has non-string str()"); 9720 goto onError; 9721 } 9722 } 9723 pbuf = PyUnicode_AS_UNICODE(temp); 9724 len = PyUnicode_GET_SIZE(temp); 9725 if (prec >= 0 && len > prec) 9726 len = prec; 9727 break; 9728 9729 case 'i': 9730 case 'd': 9731 case 'u': 9732 case 'o': 9733 case 'x': 9734 case 'X': 9735 isnumok = 0; 9736 if (PyNumber_Check(v)) { 9737 PyObject *iobj=NULL; 9738 9739 if (PyLong_Check(v)) { 9740 iobj = v; 9741 Py_INCREF(iobj); 9742 } 9743 else { 9744 iobj = PyNumber_Long(v); 9745 } 9746 if (iobj!=NULL) { 9747 if (PyLong_Check(iobj)) { 9748 isnumok = 1; 9749 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 9750 Py_DECREF(iobj); 9751 if (!temp) 9752 goto onError; 9753 pbuf = PyUnicode_AS_UNICODE(temp); 9754 len = PyUnicode_GET_SIZE(temp); 9755 sign = 1; 9756 } 9757 else { 9758 Py_DECREF(iobj); 9759 } 9760 } 9761 } 9762 if (!isnumok) { 9763 PyErr_Format(PyExc_TypeError, 9764 "%%%c format: a number is required, " 9765 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9766 goto onError; 9767 } 9768 if (flags & F_ZERO) 9769 fill = '0'; 9770 break; 9771 9772 case 'e': 9773 case 'E': 9774 case 'f': 9775 case 'F': 9776 case 'g': 9777 case 'G': 9778 temp = formatfloat(v, flags, prec, c); 9779 if (!temp) 9780 goto onError; 9781 pbuf = PyUnicode_AS_UNICODE(temp); 9782 len = PyUnicode_GET_SIZE(temp); 9783 sign = 1; 9784 if (flags & F_ZERO) 9785 fill = '0'; 9786 break; 9787 9788 case 'c': 9789 pbuf = formatbuf; 9790 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9791 if (len < 0) 9792 goto onError; 9793 break; 9794 9795 default: 9796 PyErr_Format(PyExc_ValueError, 9797 "unsupported format character '%c' (0x%x) " 9798 "at index %zd", 9799 (31<=c && c<=126) ? 
(char)c : '?', 9800 (int)c, 9801 (Py_ssize_t)(fmt - 1 - 9802 PyUnicode_AS_UNICODE(uformat))); 9803 goto onError; 9804 } 9805 if (sign) { 9806 if (*pbuf == '-' || *pbuf == '+') { 9807 sign = *pbuf++; 9808 len--; 9809 } 9810 else if (flags & F_SIGN) 9811 sign = '+'; 9812 else if (flags & F_BLANK) 9813 sign = ' '; 9814 else 9815 sign = 0; 9816 } 9817 if (width < len) 9818 width = len; 9819 if (rescnt - (sign != 0) < width) { 9820 reslen -= rescnt; 9821 rescnt = width + fmtcnt + 100; 9822 reslen += rescnt; 9823 if (reslen < 0) { 9824 Py_XDECREF(temp); 9825 PyErr_NoMemory(); 9826 goto onError; 9827 } 9828 if (_PyUnicode_Resize(&result, reslen) < 0) { 9829 Py_XDECREF(temp); 9830 goto onError; 9831 } 9832 res = PyUnicode_AS_UNICODE(result) 9833 + reslen - rescnt; 9834 } 9835 if (sign) { 9836 if (fill != ' ') 9837 *res++ = sign; 9838 rescnt--; 9839 if (width > len) 9840 width--; 9841 } 9842 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9843 assert(pbuf[0] == '0'); 9844 assert(pbuf[1] == c); 9845 if (fill != ' ') { 9846 *res++ = *pbuf++; 9847 *res++ = *pbuf++; 9848 } 9849 rescnt -= 2; 9850 width -= 2; 9851 if (width < 0) 9852 width = 0; 9853 len -= 2; 9854 } 9855 if (width > len && !(flags & F_LJUST)) { 9856 do { 9857 --rescnt; 9858 *res++ = fill; 9859 } while (--width > len); 9860 } 9861 if (fill == ' ') { 9862 if (sign) 9863 *res++ = sign; 9864 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9865 assert(pbuf[0] == '0'); 9866 assert(pbuf[1] == c); 9867 *res++ = *pbuf++; 9868 *res++ = *pbuf++; 9869 } 9870 } 9871 Py_UNICODE_COPY(res, pbuf, len); 9872 res += len; 9873 rescnt -= len; 9874 while (--width >= len) { 9875 --rescnt; 9876 *res++ = ' '; 9877 } 9878 if (dict && (argidx < arglen) && c != '%') { 9879 PyErr_SetString(PyExc_TypeError, 9880 "not all arguments converted during string formatting"); 9881 Py_XDECREF(temp); 9882 goto onError; 9883 } 9884 Py_XDECREF(temp); 9885 } /* '%' */ 9886 } /* until end */ 9887 if (argidx < arglen && !dict) { 
9888 PyErr_SetString(PyExc_TypeError, 9889 "not all arguments converted during string formatting"); 9890 goto onError; 9891 } 9892 9893 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9894 goto onError; 9895 if (args_owned) { 9896 Py_DECREF(args); 9897 } 9898 Py_DECREF(uformat); 9899 return (PyObject *)result; 9900 9901 onError: 9902 Py_XDECREF(result); 9903 Py_DECREF(uformat); 9904 if (args_owned) { 9905 Py_DECREF(args); 9906 } 9907 return NULL; 9908} 9909 9910static PyObject * 9911unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9912 9913static PyObject * 9914unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9915{ 9916 PyObject *x = NULL; 9917 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9918 char *encoding = NULL; 9919 char *errors = NULL; 9920 9921 if (type != &PyUnicode_Type) 9922 return unicode_subtype_new(type, args, kwds); 9923 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9924 kwlist, &x, &encoding, &errors)) 9925 return NULL; 9926 if (x == NULL) 9927 return (PyObject *)_PyUnicode_New(0); 9928 if (encoding == NULL && errors == NULL) 9929 return PyObject_Str(x); 9930 else 9931 return PyUnicode_FromEncodedObject(x, encoding, errors); 9932} 9933 9934static PyObject * 9935unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9936{ 9937 PyUnicodeObject *tmp, *pnew; 9938 Py_ssize_t n; 9939 9940 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9941 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9942 if (tmp == NULL) 9943 return NULL; 9944 assert(PyUnicode_Check(tmp)); 9945 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9946 if (pnew == NULL) { 9947 Py_DECREF(tmp); 9948 return NULL; 9949 } 9950 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9951 if (pnew->str == NULL) { 9952 _Py_ForgetReference((PyObject *)pnew); 9953 PyObject_Del(pnew); 9954 Py_DECREF(tmp); 9955 return PyErr_NoMemory(); 9956 } 9957 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9958 pnew->length = n; 9959 pnew->hash = tmp->hash; 9960 Py_DECREF(tmp); 9961 return (PyObject *)pnew; 9962} 9963 9964PyDoc_STRVAR(unicode_doc, 9965 "str(object[, encoding[, errors]]) -> str\n\ 9966\n\ 9967Create a new string object from the given object. If encoding or\n\ 9968errors is specified, then the object must expose a data buffer\n\ 9969that will be decoded using the given encoding and error handler.\n\ 9970Otherwise, returns the result of object.__str__() (if defined)\n\ 9971or repr(object).\n\ 9972encoding defaults to sys.getdefaultencoding().\n\ 9973errors defaults to 'strict'."); 9974 9975static PyObject *unicode_iter(PyObject *seq); 9976 9977PyTypeObject PyUnicode_Type = { 9978 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9979 "str", /* tp_name */ 9980 sizeof(PyUnicodeObject), /* tp_size */ 9981 0, /* tp_itemsize */ 9982 /* Slots */ 9983 (destructor)unicode_dealloc, /* tp_dealloc */ 9984 0, /* tp_print */ 9985 0, /* tp_getattr */ 9986 0, /* tp_setattr */ 9987 0, /* tp_reserved */ 9988 unicode_repr, /* tp_repr */ 9989 &unicode_as_number, /* tp_as_number */ 9990 &unicode_as_sequence, /* tp_as_sequence */ 9991 &unicode_as_mapping, /* tp_as_mapping */ 9992 (hashfunc) unicode_hash, /* tp_hash*/ 9993 0, /* tp_call*/ 9994 (reprfunc) unicode_str, /* tp_str */ 9995 PyObject_GenericGetAttr, /* tp_getattro */ 9996 0, /* tp_setattro */ 9997 0, /* tp_as_buffer */ 9998 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9999 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 10000 unicode_doc, /* tp_doc */ 10001 0, /* tp_traverse */ 10002 0, /* tp_clear */ 10003 PyUnicode_RichCompare, /* tp_richcompare */ 10004 0, /* tp_weaklistoffset */ 10005 unicode_iter, /* tp_iter */ 10006 0, /* tp_iternext */ 10007 unicode_methods, /* tp_methods */ 10008 0, /* tp_members */ 10009 0, /* tp_getset */ 10010 &PyBaseObject_Type, /* tp_base */ 10011 0, /* tp_dict */ 10012 0, /* tp_descr_get */ 10013 0, /* tp_descr_set */ 10014 0, /* tp_dictoffset */ 10015 0, /* 
tp_init */ 10016 0, /* tp_alloc */ 10017 unicode_new, /* tp_new */ 10018 PyObject_Del, /* tp_free */ 10019}; 10020 10021/* Initialize the Unicode implementation */ 10022 10023void _PyUnicode_Init(void) 10024{ 10025 int i; 10026 10027 /* XXX - move this array to unicodectype.c ? */ 10028 Py_UNICODE linebreak[] = { 10029 0x000A, /* LINE FEED */ 10030 0x000D, /* CARRIAGE RETURN */ 10031 0x001C, /* FILE SEPARATOR */ 10032 0x001D, /* GROUP SEPARATOR */ 10033 0x001E, /* RECORD SEPARATOR */ 10034 0x0085, /* NEXT LINE */ 10035 0x2028, /* LINE SEPARATOR */ 10036 0x2029, /* PARAGRAPH SEPARATOR */ 10037 }; 10038 10039 /* Init the implementation */ 10040 free_list = NULL; 10041 numfree = 0; 10042 unicode_empty = _PyUnicode_New(0); 10043 if (!unicode_empty) 10044 return; 10045 10046 for (i = 0; i < 256; i++) 10047 unicode_latin1[i] = NULL; 10048 if (PyType_Ready(&PyUnicode_Type) < 0) 10049 Py_FatalError("Can't initialize 'unicode'"); 10050 10051 /* initialize the linebreak bloom filter */ 10052 bloom_linebreak = make_bloom_mask( 10053 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 10054 ); 10055 10056 PyType_Ready(&EncodingMapType); 10057} 10058 10059/* Finalize the Unicode implementation */ 10060 10061int 10062PyUnicode_ClearFreeList(void) 10063{ 10064 int freelist_size = numfree; 10065 PyUnicodeObject *u; 10066 10067 for (u = free_list; u != NULL;) { 10068 PyUnicodeObject *v = u; 10069 u = *(PyUnicodeObject **)u; 10070 if (v->str) 10071 PyObject_DEL(v->str); 10072 Py_XDECREF(v->defenc); 10073 PyObject_Del(v); 10074 numfree--; 10075 } 10076 free_list = NULL; 10077 assert(numfree == 0); 10078 return freelist_size; 10079} 10080 10081void 10082_PyUnicode_Fini(void) 10083{ 10084 int i; 10085 10086 Py_XDECREF(unicode_empty); 10087 unicode_empty = NULL; 10088 10089 for (i = 0; i < 256; i++) { 10090 if (unicode_latin1[i]) { 10091 Py_DECREF(unicode_latin1[i]); 10092 unicode_latin1[i] = NULL; 10093 } 10094 } 10095 (void)PyUnicode_ClearFreeList(); 10096} 10097 10098void 
10099PyUnicode_InternInPlace(PyObject **p) 10100{ 10101 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 10102 PyObject *t; 10103 if (s == NULL || !PyUnicode_Check(s)) 10104 Py_FatalError( 10105 "PyUnicode_InternInPlace: unicode strings only please!"); 10106 /* If it's a subclass, we don't really know what putting 10107 it in the interned dict might do. */ 10108 if (!PyUnicode_CheckExact(s)) 10109 return; 10110 if (PyUnicode_CHECK_INTERNED(s)) 10111 return; 10112 if (interned == NULL) { 10113 interned = PyDict_New(); 10114 if (interned == NULL) { 10115 PyErr_Clear(); /* Don't leave an exception */ 10116 return; 10117 } 10118 } 10119 /* It might be that the GetItem call fails even 10120 though the key is present in the dictionary, 10121 namely when this happens during a stack overflow. */ 10122 Py_ALLOW_RECURSION 10123 t = PyDict_GetItem(interned, (PyObject *)s); 10124 Py_END_ALLOW_RECURSION 10125 10126 if (t) { 10127 Py_INCREF(t); 10128 Py_DECREF(*p); 10129 *p = t; 10130 return; 10131 } 10132 10133 PyThreadState_GET()->recursion_critical = 1; 10134 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 10135 PyErr_Clear(); 10136 PyThreadState_GET()->recursion_critical = 0; 10137 return; 10138 } 10139 PyThreadState_GET()->recursion_critical = 0; 10140 /* The two references in interned are not counted by refcnt. 
10141 The deallocator will take care of this */ 10142 Py_REFCNT(s) -= 2; 10143 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 10144} 10145 10146void 10147PyUnicode_InternImmortal(PyObject **p) 10148{ 10149 PyUnicode_InternInPlace(p); 10150 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 10151 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 10152 Py_INCREF(*p); 10153 } 10154} 10155 10156PyObject * 10157PyUnicode_InternFromString(const char *cp) 10158{ 10159 PyObject *s = PyUnicode_FromString(cp); 10160 if (s == NULL) 10161 return NULL; 10162 PyUnicode_InternInPlace(&s); 10163 return s; 10164} 10165 10166void _Py_ReleaseInternedUnicodeStrings(void) 10167{ 10168 PyObject *keys; 10169 PyUnicodeObject *s; 10170 Py_ssize_t i, n; 10171 Py_ssize_t immortal_size = 0, mortal_size = 0; 10172 10173 if (interned == NULL || !PyDict_Check(interned)) 10174 return; 10175 keys = PyDict_Keys(interned); 10176 if (keys == NULL || !PyList_Check(keys)) { 10177 PyErr_Clear(); 10178 return; 10179 } 10180 10181 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 10182 detector, interned unicode strings are not forcibly deallocated; 10183 rather, we give them their stolen references back, and then clear 10184 and DECREF the interned dict. 
*/ 10185 10186 n = PyList_GET_SIZE(keys); 10187 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 10188 n); 10189 for (i = 0; i < n; i++) { 10190 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 10191 switch (s->state) { 10192 case SSTATE_NOT_INTERNED: 10193 /* XXX Shouldn't happen */ 10194 break; 10195 case SSTATE_INTERNED_IMMORTAL: 10196 Py_REFCNT(s) += 1; 10197 immortal_size += s->length; 10198 break; 10199 case SSTATE_INTERNED_MORTAL: 10200 Py_REFCNT(s) += 2; 10201 mortal_size += s->length; 10202 break; 10203 default: 10204 Py_FatalError("Inconsistent interned string state."); 10205 } 10206 s->state = SSTATE_NOT_INTERNED; 10207 } 10208 fprintf(stderr, "total size of all interned strings: " 10209 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 10210 "mortal/immortal\n", mortal_size, immortal_size); 10211 Py_DECREF(keys); 10212 PyDict_Clear(interned); 10213 Py_DECREF(interned); 10214 interned = NULL; 10215} 10216 10217 10218/********************* Unicode Iterator **************************/ 10219 10220typedef struct { 10221 PyObject_HEAD 10222 Py_ssize_t it_index; 10223 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 10224} unicodeiterobject; 10225 10226static void 10227unicodeiter_dealloc(unicodeiterobject *it) 10228{ 10229 _PyObject_GC_UNTRACK(it); 10230 Py_XDECREF(it->it_seq); 10231 PyObject_GC_Del(it); 10232} 10233 10234static int 10235unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 10236{ 10237 Py_VISIT(it->it_seq); 10238 return 0; 10239} 10240 10241static PyObject * 10242unicodeiter_next(unicodeiterobject *it) 10243{ 10244 PyUnicodeObject *seq; 10245 PyObject *item; 10246 10247 assert(it != NULL); 10248 seq = it->it_seq; 10249 if (seq == NULL) 10250 return NULL; 10251 assert(PyUnicode_Check(seq)); 10252 10253 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 10254 item = PyUnicode_FromUnicode( 10255 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 10256 if (item != NULL) 10257 ++it->it_index; 10258 
return item; 10259 } 10260 10261 Py_DECREF(seq); 10262 it->it_seq = NULL; 10263 return NULL; 10264} 10265 10266static PyObject * 10267unicodeiter_len(unicodeiterobject *it) 10268{ 10269 Py_ssize_t len = 0; 10270 if (it->it_seq) 10271 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 10272 return PyLong_FromSsize_t(len); 10273} 10274 10275PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 10276 10277static PyMethodDef unicodeiter_methods[] = { 10278 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 10279 length_hint_doc}, 10280 {NULL, NULL} /* sentinel */ 10281}; 10282 10283PyTypeObject PyUnicodeIter_Type = { 10284 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10285 "str_iterator", /* tp_name */ 10286 sizeof(unicodeiterobject), /* tp_basicsize */ 10287 0, /* tp_itemsize */ 10288 /* methods */ 10289 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 10290 0, /* tp_print */ 10291 0, /* tp_getattr */ 10292 0, /* tp_setattr */ 10293 0, /* tp_reserved */ 10294 0, /* tp_repr */ 10295 0, /* tp_as_number */ 10296 0, /* tp_as_sequence */ 10297 0, /* tp_as_mapping */ 10298 0, /* tp_hash */ 10299 0, /* tp_call */ 10300 0, /* tp_str */ 10301 PyObject_GenericGetAttr, /* tp_getattro */ 10302 0, /* tp_setattro */ 10303 0, /* tp_as_buffer */ 10304 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 10305 0, /* tp_doc */ 10306 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 10307 0, /* tp_clear */ 10308 0, /* tp_richcompare */ 10309 0, /* tp_weaklistoffset */ 10310 PyObject_SelfIter, /* tp_iter */ 10311 (iternextfunc)unicodeiter_next, /* tp_iternext */ 10312 unicodeiter_methods, /* tp_methods */ 10313 0, 10314}; 10315 10316static PyObject * 10317unicode_iter(PyObject *seq) 10318{ 10319 unicodeiterobject *it; 10320 10321 if (!PyUnicode_Check(seq)) { 10322 PyErr_BadInternalCall(); 10323 return NULL; 10324 } 10325 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 10326 if (it == NULL) 10327 return NULL; 10328 
it->it_index = 0; 10329 Py_INCREF(seq); 10330 it->it_seq = (PyUnicodeObject *)seq; 10331 _PyObject_GC_TRACK(it); 10332 return (PyObject *)it; 10333} 10334 10335size_t 10336Py_UNICODE_strlen(const Py_UNICODE *u) 10337{ 10338 int res = 0; 10339 while(*u++) 10340 res++; 10341 return res; 10342} 10343 10344Py_UNICODE* 10345Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 10346{ 10347 Py_UNICODE *u = s1; 10348 while ((*u++ = *s2++)); 10349 return s1; 10350} 10351 10352Py_UNICODE* 10353Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10354{ 10355 Py_UNICODE *u = s1; 10356 while ((*u++ = *s2++)) 10357 if (n-- == 0) 10358 break; 10359 return s1; 10360} 10361 10362Py_UNICODE* 10363Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 10364{ 10365 Py_UNICODE *u1 = s1; 10366 u1 += Py_UNICODE_strlen(u1); 10367 Py_UNICODE_strcpy(u1, s2); 10368 return s1; 10369} 10370 10371int 10372Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 10373{ 10374 while (*s1 && *s2 && *s1 == *s2) 10375 s1++, s2++; 10376 if (*s1 && *s2) 10377 return (*s1 < *s2) ? -1 : +1; 10378 if (*s1) 10379 return 1; 10380 if (*s2) 10381 return -1; 10382 return 0; 10383} 10384 10385int 10386Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10387{ 10388 register Py_UNICODE u1, u2; 10389 for (; n != 0; n--) { 10390 u1 = *s1; 10391 u2 = *s2; 10392 if (u1 != u2) 10393 return (u1 < u2) ? 
-1 : +1; 10394 if (u1 == '\0') 10395 return 0; 10396 s1++; 10397 s2++; 10398 } 10399 return 0; 10400} 10401 10402Py_UNICODE* 10403Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 10404{ 10405 const Py_UNICODE *p; 10406 for (p = s; *p; p++) 10407 if (*p == c) 10408 return (Py_UNICODE*)p; 10409 return NULL; 10410} 10411 10412Py_UNICODE* 10413Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10414{ 10415 const Py_UNICODE *p; 10416 p = s + Py_UNICODE_strlen(s); 10417 while (p != s) { 10418 p--; 10419 if (*p == c) 10420 return (Py_UNICODE*)p; 10421 } 10422 return NULL; 10423} 10424 10425Py_UNICODE* 10426PyUnicode_AsUnicodeCopy(PyObject *object) 10427{ 10428 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10429 Py_UNICODE *copy; 10430 Py_ssize_t size; 10431 10432 /* Ensure we won't overflow the size. */ 10433 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10434 PyErr_NoMemory(); 10435 return NULL; 10436 } 10437 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10438 size *= sizeof(Py_UNICODE); 10439 copy = PyMem_Malloc(size); 10440 if (copy == NULL) { 10441 PyErr_NoMemory(); 10442 return NULL; 10443 } 10444 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10445 return copy; 10446} 10447 10448/* A _string module, to export formatter_parser and formatter_field_name_split 10449 to the string.Formatter class implemented in Python. 
*/ 10450 10451static PyMethodDef _string_methods[] = { 10452 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 10453 METH_O, PyDoc_STR("split the argument as a field name")}, 10454 {"formatter_parser", (PyCFunction) formatter_parser, 10455 METH_O, PyDoc_STR("parse the argument as a format string")}, 10456 {NULL, NULL} 10457}; 10458 10459static struct PyModuleDef _string_module = { 10460 PyModuleDef_HEAD_INIT, 10461 "_string", 10462 PyDoc_STR("string helper module"), 10463 0, 10464 _string_methods, 10465 NULL, 10466 NULL, 10467 NULL, 10468 NULL 10469}; 10470 10471PyMODINIT_FUNC 10472PyInit__string(void) 10473{ 10474 return PyModule_Create(&_string_module); 10475} 10476 10477 10478#ifdef __cplusplus 10479} 10480#endif 10481