unicodeobject.c revision 46408606d80347108a6550805d29402d2771bda3
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Fast detection of the most frequent whitespace characters */ 118const unsigned char _Py_ascii_whitespace[] = { 119 0, 0, 0, 0, 0, 0, 0, 0, 120/* case 0x0009: * CHARACTER TABULATION */ 121/* case 0x000A: * LINE FEED */ 122/* case 0x000B: * LINE TABULATION */ 123/* case 0x000C: * FORM FEED */ 124/* case 0x000D: * CARRIAGE RETURN */ 125 0, 1, 1, 1, 1, 1, 0, 0, 126 0, 0, 0, 0, 0, 0, 0, 0, 127/* case 0x001C: * FILE SEPARATOR */ 128/* case 0x001D: * GROUP SEPARATOR */ 129/* case 0x001E: * RECORD SEPARATOR */ 130/* case 0x001F: * UNIT SEPARATOR */ 131 0, 0, 0, 0, 1, 1, 1, 1, 132/* case 0x0020: * SPACE */ 133 1, 0, 0, 0, 0, 0, 0, 0, 134 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0 146}; 147 148static PyObject *unicode_encode_call_errorhandler(const char *errors, 149 PyObject **errorHandler,const char *encoding, const char *reason, 150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 152 153static void raise_encode_exception(PyObject **exceptionObject, 154 const char *encoding, 155 const Py_UNICODE *unicode, Py_ssize_t size, 156 Py_ssize_t startpos, Py_ssize_t endpos, 157 const char *reason); 158 159/* Same for linebreaks */ 160static unsigned char ascii_linebreak[] = { 161 0, 0, 0, 0, 0, 0, 0, 0, 162/* 0x000A, * LINE FEED */ 163/* 0x000B, * LINE TABULATION */ 164/* 0x000C, * FORM FEED */ 165/* 0x000D, * CARRIAGE RETURN */ 166 0, 0, 1, 1, 1, 1, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168/* 0x001C, * FILE SEPARATOR */ 169/* 0x001D, * GROUP SEPARATOR */ 170/* 0x001E, * RECORD SEPARATOR */ 171 0, 0, 0, 0, 1, 1, 1, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 0, 0, 0, 0 185}; 186 187 188Py_UNICODE 189PyUnicode_GetMax(void) 190{ 191#ifdef Py_UNICODE_WIDE 192 return 0x10FFFF; 193#else 194 /* This is actually an illegal character, so it should 195 not be passed to unichr. */ 196 return 0xFFFF; 197#endif 198} 199 200/* --- Bloom Filters ----------------------------------------------------- */ 201 202/* stuff to implement simple "bloom filters" for Unicode characters. 203 to keep things simple, we use a single bitmask, using the least 5 204 bits from each unicode characters as the bit index. */ 205 206/* the linebreak mask is set up by Unicode_Init below */ 207 208#if LONG_BIT >= 128 209#define BLOOM_WIDTH 128 210#elif LONG_BIT >= 64 211#define BLOOM_WIDTH 64 212#elif LONG_BIT >= 32 213#define BLOOM_WIDTH 32 214#else 215#error "LONG_BIT is smaller than 32" 216#endif 217 218#define BLOOM_MASK unsigned long 219 220static BLOOM_MASK bloom_linebreak; 221 222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 224 225#define BLOOM_LINEBREAK(ch) \ 226 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 228 229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230{ 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241} 242 243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 244{ 245 Py_ssize_t i; 246 247 for (i = 0; i < setlen; i++) 248 if (set[i] == chr) 249 return 1; 250 251 return 0; 252} 253 254#define BLOOM_MEMBER(mask, chr, set, setlen) \ 255 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 256 257/* --- Unicode Object ----------------------------------------------------- */ 258 259static 260int unicode_resize(register PyUnicodeObject *unicode, 261 Py_ssize_t length) 262{ 263 void *oldstr; 264 265 /* Shortcut if there's nothing much to do. */ 266 if (unicode->length == length) 267 goto reset; 268 269 /* Resizing shared object (unicode_empty or single character 270 objects) in-place is not allowed. Use PyUnicode_Resize() 271 instead ! */ 272 273 if (unicode == unicode_empty || 274 (unicode->length == 1 && 275 unicode->str[0] < 256U && 276 unicode_latin1[unicode->str[0]] == unicode)) { 277 PyErr_SetString(PyExc_SystemError, 278 "can't resize shared str objects"); 279 return -1; 280 } 281 282 /* We allocate one more byte to make sure the string is Ux0000 terminated. 283 The overallocation is also used by fastsearch, which assumes that it's 284 safe to look at str[length] (without making any assumptions about what 285 it contains). */ 286 287 oldstr = unicode->str; 288 unicode->str = PyObject_REALLOC(unicode->str, 289 sizeof(Py_UNICODE) * (length + 1)); 290 if (!unicode->str) { 291 unicode->str = (Py_UNICODE *)oldstr; 292 PyErr_NoMemory(); 293 return -1; 294 } 295 unicode->str[length] = 0; 296 unicode->length = length; 297 298 reset: 299 /* Reset the object caches */ 300 if (unicode->defenc) { 301 Py_CLEAR(unicode->defenc); 302 } 303 unicode->hash = -1; 304 305 return 0; 306} 307 308/* We allocate one more byte to make sure the string is 309 Ux0000 terminated; some code (e.g. new_identifier) 310 relies on that. 311 312 XXX This allocator could further be enhanced by assuring that the 313 free list never reduces its size below 1. 314 315*/ 316 317static 318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 319{ 320 register PyUnicodeObject *unicode; 321 322 /* Optimization for empty strings */ 323 if (length == 0 && unicode_empty != NULL) { 324 Py_INCREF(unicode_empty); 325 return unicode_empty; 326 } 327 328 /* Ensure we won't overflow the size. */ 329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 330 return (PyUnicodeObject *)PyErr_NoMemory(); 331 } 332 333 /* Unicode freelist & memory allocation */ 334 if (free_list) { 335 unicode = free_list; 336 free_list = *(PyUnicodeObject **)unicode; 337 numfree--; 338 if (unicode->str) { 339 /* Keep-Alive optimization: we only upsize the buffer, 340 never downsize it. */ 341 if ((unicode->length < length) && 342 unicode_resize(unicode, length) < 0) { 343 PyObject_DEL(unicode->str); 344 unicode->str = NULL; 345 } 346 } 347 else { 348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 350 } 351 PyObject_INIT(unicode, &PyUnicode_Type); 352 } 353 else { 354 size_t new_size; 355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 356 if (unicode == NULL) 357 return NULL; 358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 360 } 361 362 if (!unicode->str) { 363 PyErr_NoMemory(); 364 goto onError; 365 } 366 /* Initialize the first element to guard against cases where 367 * the caller fails before initializing str -- unicode_resize() 368 * reads str[0], and the Keep-Alive optimization can keep memory 369 * allocated for str alive across a call to unicode_dealloc(unicode). 370 * We don't want unicode_resize to read uninitialized memory in 371 * that case. 372 */ 373 unicode->str[0] = 0; 374 unicode->str[length] = 0; 375 unicode->length = length; 376 unicode->hash = -1; 377 unicode->state = 0; 378 unicode->defenc = NULL; 379 return unicode; 380 381 onError: 382 /* XXX UNREF/NEWREF interface should be more symmetrical */ 383 _Py_DEC_REFTOTAL; 384 _Py_ForgetReference((PyObject *)unicode); 385 PyObject_Del(unicode); 386 return NULL; 387} 388 389static 390void unicode_dealloc(register PyUnicodeObject *unicode) 391{ 392 switch (PyUnicode_CHECK_INTERNED(unicode)) { 393 case SSTATE_NOT_INTERNED: 394 break; 395 396 case SSTATE_INTERNED_MORTAL: 397 /* revive dead object temporarily for DelItem */ 398 Py_REFCNT(unicode) = 3; 399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 400 Py_FatalError( 401 "deletion of interned string failed"); 402 break; 403 404 case SSTATE_INTERNED_IMMORTAL: 405 Py_FatalError("Immortal interned string died."); 406 407 default: 408 Py_FatalError("Inconsistent interned string state."); 409 } 410 411 if (PyUnicode_CheckExact(unicode) && 412 numfree < PyUnicode_MAXFREELIST) { 413 /* Keep-Alive optimization */ 414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 415 PyObject_DEL(unicode->str); 416 unicode->str = NULL; 417 unicode->length = 0; 418 } 419 if (unicode->defenc) { 420 Py_CLEAR(unicode->defenc); 421 } 422 /* Add to free list */ 423 *(PyUnicodeObject **)unicode = free_list; 424 free_list = unicode; 425 numfree++; 426 } 427 else { 428 PyObject_DEL(unicode->str); 429 Py_XDECREF(unicode->defenc); 430 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 431 } 432} 433 434static 435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 436{ 437 register PyUnicodeObject *v; 438 439 /* Argument checks */ 440 if (unicode == NULL) { 441 PyErr_BadInternalCall(); 442 return -1; 443 } 444 v = *unicode; 445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 446 PyErr_BadInternalCall(); 447 return -1; 448 } 449 450 /* Resizing unicode_empty and single character objects is not 451 possible since these are being shared. We simply return a fresh 452 copy with the same Unicode content. */ 453 if (v->length != length && 454 (v == unicode_empty || v->length == 1)) { 455 PyUnicodeObject *w = _PyUnicode_New(length); 456 if (w == NULL) 457 return -1; 458 Py_UNICODE_COPY(w->str, v->str, 459 length < v->length ? length : v->length); 460 Py_DECREF(*unicode); 461 *unicode = w; 462 return 0; 463 } 464 465 /* Note that we don't have to modify *unicode for unshared Unicode 466 objects, since we can modify them in-place. */ 467 return unicode_resize(v, length); 468} 469 470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 471{ 472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 473} 474 475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 476 Py_ssize_t size) 477{ 478 PyUnicodeObject *unicode; 479 480 /* If the Unicode data is known at construction time, we can apply 481 some optimizations which share commonly used objects. */ 482 if (u != NULL) { 483 484 /* Optimization for empty strings */ 485 if (size == 0 && unicode_empty != NULL) { 486 Py_INCREF(unicode_empty); 487 return (PyObject *)unicode_empty; 488 } 489 490 /* Single character Unicode objects in the Latin-1 range are 491 shared when using this constructor */ 492 if (size == 1 && *u < 256) { 493 unicode = unicode_latin1[*u]; 494 if (!unicode) { 495 unicode = _PyUnicode_New(1); 496 if (!unicode) 497 return NULL; 498 unicode->str[0] = *u; 499 unicode_latin1[*u] = unicode; 500 } 501 Py_INCREF(unicode); 502 return (PyObject *)unicode; 503 } 504 } 505 506 unicode = _PyUnicode_New(size); 507 if (!unicode) 508 return NULL; 509 510 /* Copy the Unicode data into the new object */ 511 if (u != NULL) 512 Py_UNICODE_COPY(unicode->str, u, size); 513 514 return (PyObject *)unicode; 515} 516 517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 518{ 519 PyUnicodeObject *unicode; 520 521 if (size < 0) { 522 PyErr_SetString(PyExc_SystemError, 523 "Negative size passed to PyUnicode_FromStringAndSize"); 524 return NULL; 525 } 526 527 /* If the Unicode data is known at construction time, we can apply 528 some optimizations which share commonly used objects. 529 Also, this means the input must be UTF-8, so fall back to the 530 UTF-8 decoder at the end. */ 531 if (u != NULL) { 532 533 /* Optimization for empty strings */ 534 if (size == 0 && unicode_empty != NULL) { 535 Py_INCREF(unicode_empty); 536 return (PyObject *)unicode_empty; 537 } 538 539 /* Single characters are shared when using this constructor. 540 Restrict to ASCII, since the input must be UTF-8. */ 541 if (size == 1 && Py_CHARMASK(*u) < 128) { 542 unicode = unicode_latin1[Py_CHARMASK(*u)]; 543 if (!unicode) { 544 unicode = _PyUnicode_New(1); 545 if (!unicode) 546 return NULL; 547 unicode->str[0] = Py_CHARMASK(*u); 548 unicode_latin1[Py_CHARMASK(*u)] = unicode; 549 } 550 Py_INCREF(unicode); 551 return (PyObject *)unicode; 552 } 553 554 return PyUnicode_DecodeUTF8(u, size, NULL); 555 } 556 557 unicode = _PyUnicode_New(size); 558 if (!unicode) 559 return NULL; 560 561 return (PyObject *)unicode; 562} 563 564PyObject *PyUnicode_FromString(const char *u) 565{ 566 size_t size = strlen(u); 567 if (size > PY_SSIZE_T_MAX) { 568 PyErr_SetString(PyExc_OverflowError, "input too long"); 569 return NULL; 570 } 571 572 return PyUnicode_FromStringAndSize(u, size); 573} 574 575#ifdef HAVE_WCHAR_H 576 577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 578# define CONVERT_WCHAR_TO_SURROGATES 579#endif 580 581#ifdef CONVERT_WCHAR_TO_SURROGATES 582 583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 584 to convert from UTF32 to UTF16. */ 585 586PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 587 Py_ssize_t size) 588{ 589 PyUnicodeObject *unicode; 590 register Py_ssize_t i; 591 Py_ssize_t alloc; 592 const wchar_t *orig_w; 593 594 if (w == NULL) { 595 if (size == 0) 596 return PyUnicode_FromStringAndSize(NULL, 0); 597 PyErr_BadInternalCall(); 598 return NULL; 599 } 600 601 if (size == -1) { 602 size = wcslen(w); 603 } 604 605 alloc = size; 606 orig_w = w; 607 for (i = size; i > 0; i--) { 608 if (*w > 0xFFFF) 609 alloc++; 610 w++; 611 } 612 w = orig_w; 613 unicode = _PyUnicode_New(alloc); 614 if (!unicode) 615 return NULL; 616 617 /* Copy the wchar_t data into the new object */ 618 { 619 register Py_UNICODE *u; 620 u = PyUnicode_AS_UNICODE(unicode); 621 for (i = size; i > 0; i--) { 622 if (*w > 0xFFFF) { 623 wchar_t ordinal = *w++; 624 ordinal -= 0x10000; 625 *u++ = 0xD800 | (ordinal >> 10); 626 *u++ = 0xDC00 | (ordinal & 0x3FF); 627 } 628 else 629 *u++ = *w++; 630 } 631 } 632 return (PyObject *)unicode; 633} 634 635#else 636 637PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 638 Py_ssize_t size) 639{ 640 PyUnicodeObject *unicode; 641 642 if (w == NULL) { 643 if (size == 0) 644 return PyUnicode_FromStringAndSize(NULL, 0); 645 PyErr_BadInternalCall(); 646 return NULL; 647 } 648 649 if (size == -1) { 650 size = wcslen(w); 651 } 652 653 unicode = _PyUnicode_New(size); 654 if (!unicode) 655 return NULL; 656 657 /* Copy the wchar_t data into the new object */ 658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 659 memcpy(unicode->str, w, size * sizeof(wchar_t)); 660#else 661 { 662 register Py_UNICODE *u; 663 register Py_ssize_t i; 664 u = PyUnicode_AS_UNICODE(unicode); 665 for (i = size; i > 0; i--) 666 *u++ = *w++; 667 } 668#endif 669 670 return (PyObject *)unicode; 671} 672 673#endif /* CONVERT_WCHAR_TO_SURROGATES */ 674 675#undef CONVERT_WCHAR_TO_SURROGATES 676 677static void 678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 679 int zeropad, int width, int precision, char c) 680{ 681 *fmt++ = '%'; 682 if (width) { 683 if (zeropad) 684 *fmt++ = '0'; 685 fmt += sprintf(fmt, "%d", width); 686 } 687 if (precision) 688 fmt += sprintf(fmt, ".%d", precision); 689 if (longflag) 690 *fmt++ = 'l'; 691 else if (longlongflag) { 692 /* longlongflag should only ever be nonzero on machines with 693 HAVE_LONG_LONG defined */ 694#ifdef HAVE_LONG_LONG 695 char *f = PY_FORMAT_LONG_LONG; 696 while (*f) 697 *fmt++ = *f++; 698#else 699 /* we shouldn't ever get here */ 700 assert(0); 701 *fmt++ = 'l'; 702#endif 703 } 704 else if (size_tflag) { 705 char *f = PY_FORMAT_SIZE_T; 706 while (*f) 707 *fmt++ = *f++; 708 } 709 *fmt++ = c; 710 *fmt = '\0'; 711} 712 713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 714 715/* size of fixed-size buffer for formatting single arguments */ 716#define ITEM_BUFFER_LEN 21 717/* maximum number of characters required for output of %ld. 21 characters 718 allows for 64-bit integers (in decimal) and an optional sign. */ 719#define MAX_LONG_CHARS 21 720/* maximum number of characters required for output of %lld. 721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 724 725PyObject * 726PyUnicode_FromFormatV(const char *format, va_list vargs) 727{ 728 va_list count; 729 Py_ssize_t callcount = 0; 730 PyObject **callresults = NULL; 731 PyObject **callresult = NULL; 732 Py_ssize_t n = 0; 733 int width = 0; 734 int precision = 0; 735 int zeropad; 736 const char* f; 737 Py_UNICODE *s; 738 PyObject *string; 739 /* used by sprintf */ 740 char buffer[ITEM_BUFFER_LEN+1]; 741 /* use abuffer instead of buffer, if we need more space 742 * (which can happen if there's a format specifier with width). */ 743 char *abuffer = NULL; 744 char *realbuffer; 745 Py_ssize_t abuffersize = 0; 746 char fmt[61]; /* should be enough for %0width.precisionlld */ 747 const char *copy; 748 749 Py_VA_COPY(count, vargs); 750 /* step 1: count the number of %S/%R/%A/%s format specifications 751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 753 * result in an array) */ 754 for (f = format; *f; f++) { 755 if (*f == '%') { 756 if (*(f+1)=='%') 757 continue; 758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') 759 ++callcount; 760 while (ISDIGIT((unsigned)*f)) 761 width = (width*10) + *f++ - '0'; 762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 763 ; 764 if (*f == 's') 765 ++callcount; 766 } 767 } 768 /* step 2: allocate memory for the results of 769 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 770 if (callcount) { 771 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 772 if (!callresults) { 773 PyErr_NoMemory(); 774 return NULL; 775 } 776 callresult = callresults; 777 } 778 /* step 3: figure out how large a buffer we need */ 779 for (f = format; *f; f++) { 780 if (*f == '%') { 781#ifdef HAVE_LONG_LONG 782 int longlongflag = 0; 783#endif 784 const char* p = f; 785 width = 0; 786 while (ISDIGIT((unsigned)*f)) 787 width = (width*10) + *f++ - '0'; 788 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 789 ; 790 791 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 792 * they don't affect the amount of space we reserve. 793 */ 794 if (*f == 'l') { 795 if (f[1] == 'd' || f[1] == 'u') { 796 ++f; 797 } 798#ifdef HAVE_LONG_LONG 799 else if (f[1] == 'l' && 800 (f[2] == 'd' || f[2] == 'u')) { 801 longlongflag = 1; 802 f += 2; 803 } 804#endif 805 } 806 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 807 ++f; 808 } 809 810 switch (*f) { 811 case 'c': 812 (void)va_arg(count, int); 813 /* fall through... */ 814 case '%': 815 n++; 816 break; 817 case 'd': case 'u': case 'i': case 'x': 818 (void) va_arg(count, int); 819#ifdef HAVE_LONG_LONG 820 if (longlongflag) { 821 if (width < MAX_LONG_LONG_CHARS) 822 width = MAX_LONG_LONG_CHARS; 823 } 824 else 825#endif 826 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 827 including sign. Decimal takes the most space. This 828 isn't enough for octal. If a width is specified we 829 need more (which we allocate later). */ 830 if (width < MAX_LONG_CHARS) 831 width = MAX_LONG_CHARS; 832 n += width; 833 /* XXX should allow for large precision here too. */ 834 if (abuffersize < width) 835 abuffersize = width; 836 break; 837 case 's': 838 { 839 /* UTF-8 */ 840 const char *s = va_arg(count, const char*); 841 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 842 if (!str) 843 goto fail; 844 n += PyUnicode_GET_SIZE(str); 845 /* Remember the str and switch to the next slot */ 846 *callresult++ = str; 847 break; 848 } 849 case 'U': 850 { 851 PyObject *obj = va_arg(count, PyObject *); 852 assert(obj && PyUnicode_Check(obj)); 853 n += PyUnicode_GET_SIZE(obj); 854 break; 855 } 856 case 'V': 857 { 858 PyObject *obj = va_arg(count, PyObject *); 859 const char *str = va_arg(count, const char *); 860 assert(obj || str); 861 assert(!obj || PyUnicode_Check(obj)); 862 if (obj) 863 n += PyUnicode_GET_SIZE(obj); 864 else 865 n += strlen(str); 866 break; 867 } 868 case 'S': 869 { 870 PyObject *obj = va_arg(count, PyObject *); 871 PyObject *str; 872 assert(obj); 873 str = PyObject_Str(obj); 874 if (!str) 875 goto fail; 876 n += PyUnicode_GET_SIZE(str); 877 /* Remember the str and switch to the next slot */ 878 *callresult++ = str; 879 break; 880 } 881 case 'R': 882 { 883 PyObject *obj = va_arg(count, PyObject *); 884 PyObject *repr; 885 assert(obj); 886 repr = PyObject_Repr(obj); 887 if (!repr) 888 goto fail; 889 n += PyUnicode_GET_SIZE(repr); 890 /* Remember the repr and switch to the next slot */ 891 *callresult++ = repr; 892 break; 893 } 894 case 'A': 895 { 896 PyObject *obj = va_arg(count, PyObject *); 897 PyObject *ascii; 898 assert(obj); 899 ascii = PyObject_ASCII(obj); 900 if (!ascii) 901 goto fail; 902 n += PyUnicode_GET_SIZE(ascii); 903 /* Remember the repr and switch to the next slot */ 904 *callresult++ = ascii; 905 break; 906 } 907 case 'p': 908 (void) va_arg(count, int); 909 /* maximum 64-bit pointer representation: 910 * 0xffffffffffffffff 911 * so 19 characters is enough. 912 * XXX I count 18 -- what's the extra for? 913 */ 914 n += 19; 915 break; 916 default: 917 /* if we stumble upon an unknown 918 formatting code, copy the rest of 919 the format string to the output 920 string. (we cannot just skip the 921 code, since there's no way to know 922 what's in the argument list) */ 923 n += strlen(p); 924 goto expand; 925 } 926 } else 927 n++; 928 } 929 expand: 930 if (abuffersize > ITEM_BUFFER_LEN) { 931 /* add 1 for sprintf's trailing null byte */ 932 abuffer = PyObject_Malloc(abuffersize + 1); 933 if (!abuffer) { 934 PyErr_NoMemory(); 935 goto fail; 936 } 937 realbuffer = abuffer; 938 } 939 else 940 realbuffer = buffer; 941 /* step 4: fill the buffer */ 942 /* Since we've analyzed how much space we need for the worst case, 943 we don't have to resize the string. 944 There can be no errors beyond this point. */ 945 string = PyUnicode_FromUnicode(NULL, n); 946 if (!string) 947 goto fail; 948 949 s = PyUnicode_AS_UNICODE(string); 950 callresult = callresults; 951 952 for (f = format; *f; f++) { 953 if (*f == '%') { 954 const char* p = f++; 955 int longflag = 0; 956 int longlongflag = 0; 957 int size_tflag = 0; 958 zeropad = (*f == '0'); 959 /* parse the width.precision part */ 960 width = 0; 961 while (ISDIGIT((unsigned)*f)) 962 width = (width*10) + *f++ - '0'; 963 precision = 0; 964 if (*f == '.') { 965 f++; 966 while (ISDIGIT((unsigned)*f)) 967 precision = (precision*10) + *f++ - '0'; 968 } 969 /* Handle %ld, %lu, %lld and %llu. */ 970 if (*f == 'l') { 971 if (f[1] == 'd' || f[1] == 'u') { 972 longflag = 1; 973 ++f; 974 } 975#ifdef HAVE_LONG_LONG 976 else if (f[1] == 'l' && 977 (f[2] == 'd' || f[2] == 'u')) { 978 longlongflag = 1; 979 f += 2; 980 } 981#endif 982 } 983 /* handle the size_t flag. */ 984 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 985 size_tflag = 1; 986 ++f; 987 } 988 989 switch (*f) { 990 case 'c': 991 *s++ = va_arg(vargs, int); 992 break; 993 case 'd': 994 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 995 width, precision, 'd'); 996 if (longflag) 997 sprintf(realbuffer, fmt, va_arg(vargs, long)); 998#ifdef HAVE_LONG_LONG 999 else if (longlongflag) 1000 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1001#endif 1002 else if (size_tflag) 1003 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1004 else 1005 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1006 appendstring(realbuffer); 1007 break; 1008 case 'u': 1009 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1010 width, precision, 'u'); 1011 if (longflag) 1012 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1013#ifdef HAVE_LONG_LONG 1014 else if (longlongflag) 1015 sprintf(realbuffer, fmt, va_arg(vargs, 1016 unsigned PY_LONG_LONG)); 1017#endif 1018 else if (size_tflag) 1019 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1020 else 1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1022 appendstring(realbuffer); 1023 break; 1024 case 'i': 1025 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i'); 1026 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1027 appendstring(realbuffer); 1028 break; 1029 case 'x': 1030 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1031 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1032 appendstring(realbuffer); 1033 break; 1034 case 's': 1035 { 1036 /* unused, since we already have the result */ 1037 (void) va_arg(vargs, char *); 1038 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1039 PyUnicode_GET_SIZE(*callresult)); 1040 s += PyUnicode_GET_SIZE(*callresult); 1041 /* We're done with the unicode()/repr() => forget it */ 1042 Py_DECREF(*callresult); 1043 /* switch to next unicode()/repr() result */ 1044 ++callresult; 1045 break; 1046 } 1047 case 'U': 1048 { 1049 PyObject *obj = va_arg(vargs, PyObject *); 1050 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1051 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1052 s += size; 1053 break; 1054 } 1055 case 'V': 1056 { 1057 PyObject *obj = va_arg(vargs, PyObject *); 1058 const char *str = va_arg(vargs, const char *); 1059 if (obj) { 1060 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1061 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1062 s += size; 1063 } else { 1064 appendstring(str); 1065 } 1066 break; 1067 } 1068 case 'S': 1069 case 'R': 1070 { 1071 Py_UNICODE *ucopy; 1072 Py_ssize_t usize; 1073 Py_ssize_t upos; 1074 /* unused, since we already have the result */ 1075 (void) va_arg(vargs, PyObject *); 1076 ucopy = PyUnicode_AS_UNICODE(*callresult); 1077 usize = PyUnicode_GET_SIZE(*callresult); 1078 for (upos = 0; upos<usize;) 1079 *s++ = ucopy[upos++]; 1080 /* We're done with the unicode()/repr() => forget it */ 1081 Py_DECREF(*callresult); 1082 /* switch to next unicode()/repr() result */ 1083 ++callresult; 1084 break; 1085 } 1086 case 'p': 1087 sprintf(buffer, "%p", va_arg(vargs, void*)); 1088 /* %p is ill-defined: ensure leading 0x. */ 1089 if (buffer[1] == 'X') 1090 buffer[1] = 'x'; 1091 else if (buffer[1] != 'x') { 1092 memmove(buffer+2, buffer, strlen(buffer)+1); 1093 buffer[0] = '0'; 1094 buffer[1] = 'x'; 1095 } 1096 appendstring(buffer); 1097 break; 1098 case '%': 1099 *s++ = '%'; 1100 break; 1101 default: 1102 appendstring(p); 1103 goto end; 1104 } 1105 } else 1106 *s++ = *f; 1107 } 1108 1109 end: 1110 if (callresults) 1111 PyObject_Free(callresults); 1112 if (abuffer) 1113 PyObject_Free(abuffer); 1114 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1115 return string; 1116 fail: 1117 if (callresults) { 1118 PyObject **callresult2 = callresults; 1119 while (callresult2 < callresult) { 1120 Py_DECREF(*callresult2); 1121 ++callresult2; 1122 } 1123 PyObject_Free(callresults); 1124 } 1125 if (abuffer) 1126 PyObject_Free(abuffer); 1127 return NULL; 1128} 1129 1130#undef appendstring 1131 1132PyObject * 1133PyUnicode_FromFormat(const char *format, ...) 1134{ 1135 PyObject* ret; 1136 va_list vargs; 1137 1138#ifdef HAVE_STDARG_PROTOTYPES 1139 va_start(vargs, format); 1140#else 1141 va_start(vargs); 1142#endif 1143 ret = PyUnicode_FromFormatV(format, vargs); 1144 va_end(vargs); 1145 return ret; 1146} 1147 1148Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1149 wchar_t *w, 1150 Py_ssize_t size) 1151{ 1152 if (unicode == NULL) { 1153 PyErr_BadInternalCall(); 1154 return -1; 1155 } 1156 1157 /* If possible, try to copy the 0-termination as well */ 1158 if (size > PyUnicode_GET_SIZE(unicode)) 1159 size = PyUnicode_GET_SIZE(unicode) + 1; 1160 1161#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1162 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1163#else 1164 { 1165 register Py_UNICODE *u; 1166 register Py_ssize_t i; 1167 u = PyUnicode_AS_UNICODE(unicode); 1168 for (i = size; i > 0; i--) 1169 *w++ = *u++; 1170 } 1171#endif 1172 1173 if (size > PyUnicode_GET_SIZE(unicode)) 1174 return PyUnicode_GET_SIZE(unicode); 1175 else 1176 return size; 1177} 1178 1179#endif 1180 1181PyObject *PyUnicode_FromOrdinal(int ordinal) 1182{ 1183 Py_UNICODE s[2]; 1184 1185 if (ordinal < 0 || ordinal > 0x10ffff) { 1186 PyErr_SetString(PyExc_ValueError, 1187 "chr() arg not in range(0x110000)"); 1188 return NULL; 1189 } 1190 1191#ifndef Py_UNICODE_WIDE 1192 if (ordinal > 0xffff) { 1193 ordinal -= 0x10000; 1194 s[0] = 0xD800 | (ordinal >> 10); 1195 s[1] = 0xDC00 | (ordinal & 0x3FF); 1196 return PyUnicode_FromUnicode(s, 2); 1197 } 1198#endif 1199 1200 s[0] = (Py_UNICODE)ordinal; 1201 return PyUnicode_FromUnicode(s, 1); 1202} 1203 1204PyObject *PyUnicode_FromObject(register PyObject *obj) 1205{ 1206 /* XXX Perhaps we should make this API an alias of 1207 PyObject_Str() instead ?! */ 1208 if (PyUnicode_CheckExact(obj)) { 1209 Py_INCREF(obj); 1210 return obj; 1211 } 1212 if (PyUnicode_Check(obj)) { 1213 /* For a Unicode subtype that's not a Unicode object, 1214 return a true Unicode object with the same data. */ 1215 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1216 PyUnicode_GET_SIZE(obj)); 1217 } 1218 PyErr_Format(PyExc_TypeError, 1219 "Can't convert '%.100s' object to str implicitly", 1220 Py_TYPE(obj)->tp_name); 1221 return NULL; 1222} 1223 1224PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1225 const char *encoding, 1226 const char *errors) 1227{ 1228 Py_buffer buffer; 1229 PyObject *v; 1230 1231 if (obj == NULL) { 1232 PyErr_BadInternalCall(); 1233 return NULL; 1234 } 1235 1236 /* Decoding bytes objects is the most common case and should be fast */ 1237 if (PyBytes_Check(obj)) { 1238 if (PyBytes_GET_SIZE(obj) == 0) { 1239 Py_INCREF(unicode_empty); 1240 v = (PyObject *) unicode_empty; 1241 } 1242 else { 1243 v = PyUnicode_Decode( 1244 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1245 encoding, errors); 1246 } 1247 return v; 1248 } 1249 1250 if (PyUnicode_Check(obj)) { 1251 PyErr_SetString(PyExc_TypeError, 1252 "decoding str is not supported"); 1253 return NULL; 1254 } 1255 1256 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1257 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1258 PyErr_Format(PyExc_TypeError, 1259 "coercing to str: need bytes, bytearray " 1260 "or buffer-like object, %.80s found", 1261 Py_TYPE(obj)->tp_name); 1262 return NULL; 1263 } 1264 1265 if (buffer.len == 0) { 1266 Py_INCREF(unicode_empty); 1267 v = (PyObject *) unicode_empty; 1268 } 1269 else 1270 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1271 1272 PyBuffer_Release(&buffer); 1273 return v; 1274} 1275 1276/* Convert encoding to lower case and replace '_' with '-' in order to 1277 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 1278 1 on success. */ 1279static int 1280normalize_encoding(const char *encoding, 1281 char *lower, 1282 size_t lower_len) 1283{ 1284 const char *e; 1285 char *l; 1286 char *l_end; 1287 1288 e = encoding; 1289 l = lower; 1290 l_end = &lower[lower_len - 1]; 1291 while (*e) { 1292 if (l == l_end) 1293 return 0; 1294 if (ISUPPER(*e)) { 1295 *l++ = TOLOWER(*e++); 1296 } 1297 else if (*e == '_') { 1298 *l++ = '-'; 1299 e++; 1300 } 1301 else { 1302 *l++ = *e++; 1303 } 1304 } 1305 *l = '\0'; 1306 return 1; 1307} 1308 1309PyObject *PyUnicode_Decode(const char *s, 1310 Py_ssize_t size, 1311 const char *encoding, 1312 const char *errors) 1313{ 1314 PyObject *buffer = NULL, *unicode; 1315 Py_buffer info; 1316 char lower[11]; /* Enough for any encoding shortcut */ 1317 1318 if (encoding == NULL) 1319 encoding = PyUnicode_GetDefaultEncoding(); 1320 1321 /* Shortcuts for common default encodings */ 1322 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1323 if (strcmp(lower, "utf-8") == 0) 1324 return PyUnicode_DecodeUTF8(s, size, errors); 1325 else if ((strcmp(lower, "latin-1") == 0) || 1326 (strcmp(lower, "iso-8859-1") == 0)) 1327 return PyUnicode_DecodeLatin1(s, size, errors); 1328#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1329 else if (strcmp(lower, "mbcs") == 0) 1330 return PyUnicode_DecodeMBCS(s, size, errors); 1331#endif 1332 else if (strcmp(lower, "ascii") == 0) 1333 return PyUnicode_DecodeASCII(s, size, errors); 1334 else if (strcmp(lower, "utf-16") == 0) 1335 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1336 else if (strcmp(lower, "utf-32") == 0) 1337 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1338 } 1339 1340 /* Decode via the codec registry */ 1341 buffer = NULL; 1342 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1343 goto onError; 1344 buffer = PyMemoryView_FromBuffer(&info); 1345 if (buffer == NULL) 1346 goto onError; 1347 unicode = PyCodec_Decode(buffer, encoding, errors); 1348 if (unicode == NULL) 1349 goto onError; 1350 if (!PyUnicode_Check(unicode)) { 1351 PyErr_Format(PyExc_TypeError, 1352 "decoder did not return a str object (type=%.400s)", 1353 Py_TYPE(unicode)->tp_name); 1354 Py_DECREF(unicode); 1355 goto onError; 1356 } 1357 Py_DECREF(buffer); 1358 return unicode; 1359 1360 onError: 1361 Py_XDECREF(buffer); 1362 return NULL; 1363} 1364 1365PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1366 const char *encoding, 1367 const char *errors) 1368{ 1369 PyObject *v; 1370 1371 if (!PyUnicode_Check(unicode)) { 1372 PyErr_BadArgument(); 1373 goto onError; 1374 } 1375 1376 if (encoding == NULL) 1377 encoding = PyUnicode_GetDefaultEncoding(); 1378 1379 /* Decode via the codec registry */ 1380 v = PyCodec_Decode(unicode, encoding, errors); 1381 if (v == NULL) 1382 goto onError; 1383 return v; 1384 1385 onError: 1386 return NULL; 1387} 1388 1389PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1390 const char *encoding, 1391 const char *errors) 1392{ 1393 PyObject *v; 1394 1395 if (!PyUnicode_Check(unicode)) { 1396 PyErr_BadArgument(); 1397 goto onError; 1398 } 1399 1400 if (encoding == NULL) 1401 encoding = PyUnicode_GetDefaultEncoding(); 1402 1403 /* Decode via the codec registry */ 1404 v = PyCodec_Decode(unicode, encoding, errors); 1405 if (v == NULL) 1406 goto onError; 1407 if (!PyUnicode_Check(v)) { 1408 PyErr_Format(PyExc_TypeError, 1409 "decoder did not return a str object (type=%.400s)", 1410 Py_TYPE(v)->tp_name); 1411 Py_DECREF(v); 1412 goto onError; 1413 } 1414 return v; 1415 1416 onError: 1417 return NULL; 1418} 1419 1420PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1421 Py_ssize_t size, 1422 const char *encoding, 1423 const char *errors) 1424{ 1425 PyObject *v, *unicode; 1426 1427 unicode = PyUnicode_FromUnicode(s, size); 1428 if (unicode == NULL) 1429 return NULL; 1430 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1431 Py_DECREF(unicode); 1432 return v; 1433} 1434 1435PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1436 const char *encoding, 1437 const char *errors) 1438{ 1439 PyObject *v; 1440 1441 if (!PyUnicode_Check(unicode)) { 1442 PyErr_BadArgument(); 1443 goto onError; 1444 } 1445 1446 if (encoding == NULL) 1447 encoding = PyUnicode_GetDefaultEncoding(); 1448 1449 /* Encode via the codec registry */ 1450 v = PyCodec_Encode(unicode, encoding, errors); 1451 if (v == NULL) 1452 goto onError; 1453 return v; 1454 1455 onError: 1456 return NULL; 1457} 1458 1459PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode) 1460{ 1461 if (Py_FileSystemDefaultEncoding) { 1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) 1464 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1465 PyUnicode_GET_SIZE(unicode), 1466 NULL); 1467#endif 1468 return PyUnicode_AsEncodedString(unicode, 1469 Py_FileSystemDefaultEncoding, 1470 "surrogateescape"); 1471 } else 1472 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1473 PyUnicode_GET_SIZE(unicode), 1474 "surrogateescape"); 1475} 1476 1477PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1478 const char *encoding, 1479 const char *errors) 1480{ 1481 PyObject *v; 1482 char lower[11]; /* Enough for any encoding shortcut */ 1483 1484 if (!PyUnicode_Check(unicode)) { 1485 PyErr_BadArgument(); 1486 return NULL; 1487 } 1488 1489 if (encoding == NULL) 1490 encoding = PyUnicode_GetDefaultEncoding(); 1491 1492 /* Shortcuts for common default encodings */ 1493 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1494 if (strcmp(lower, "utf-8") == 0) 1495 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1496 PyUnicode_GET_SIZE(unicode), 1497 errors); 1498 else if ((strcmp(lower, "latin-1") == 0) || 1499 (strcmp(lower, "iso-8859-1") == 0)) 1500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1501 PyUnicode_GET_SIZE(unicode), 1502 errors); 1503#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1504 else if (strcmp(lower, "mbcs") == 0) 1505 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1506 PyUnicode_GET_SIZE(unicode), 1507 errors); 1508#endif 1509 else if (strcmp(lower, "ascii") == 0) 1510 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1511 PyUnicode_GET_SIZE(unicode), 1512 errors); 1513 } 1514 /* During bootstrap, we may need to find the encodings 1515 package, to load the file system encoding, and require the 1516 file system encoding in order to load the encodings 1517 package. 1518 1519 Break out of this dependency by assuming that the path to 1520 the encodings module is ASCII-only. XXX could try wcstombs 1521 instead, if the file system encoding is the locale's 1522 encoding. */ 1523 if (Py_FileSystemDefaultEncoding && 1524 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && 1525 !PyThreadState_GET()->interp->codecs_initialized) 1526 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1527 PyUnicode_GET_SIZE(unicode), 1528 errors); 1529 1530 /* Encode via the codec registry */ 1531 v = PyCodec_Encode(unicode, encoding, errors); 1532 if (v == NULL) 1533 return NULL; 1534 1535 /* The normal path */ 1536 if (PyBytes_Check(v)) 1537 return v; 1538 1539 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1540 if (PyByteArray_Check(v)) { 1541 int error; 1542 PyObject *b; 1543 1544 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1545 "encoder %s returned bytearray instead of bytes", 1546 encoding); 1547 if (error) { 1548 Py_DECREF(v); 1549 return NULL; 1550 } 1551 1552 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1553 Py_DECREF(v); 1554 return b; 1555 } 1556 1557 PyErr_Format(PyExc_TypeError, 1558 "encoder did not return a bytes object (type=%.400s)", 1559 Py_TYPE(v)->tp_name); 1560 Py_DECREF(v); 1561 return NULL; 1562} 1563 1564PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1565 const char *encoding, 1566 const char *errors) 1567{ 1568 PyObject *v; 1569 1570 if (!PyUnicode_Check(unicode)) { 1571 PyErr_BadArgument(); 1572 goto onError; 1573 } 1574 1575 if (encoding == NULL) 1576 encoding = PyUnicode_GetDefaultEncoding(); 1577 1578 /* Encode via the codec registry */ 1579 v = PyCodec_Encode(unicode, encoding, errors); 1580 if (v == NULL) 1581 goto onError; 1582 if (!PyUnicode_Check(v)) { 1583 PyErr_Format(PyExc_TypeError, 1584 "encoder did not return an str object (type=%.400s)", 1585 Py_TYPE(v)->tp_name); 1586 Py_DECREF(v); 1587 goto onError; 1588 } 1589 return v; 1590 1591 onError: 1592 return NULL; 1593} 1594 1595PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1596 const char *errors) 1597{ 1598 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1599 if (v) 1600 return v; 1601 if (errors != NULL) 1602 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1603 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1604 PyUnicode_GET_SIZE(unicode), 1605 NULL); 1606 if (!v) 1607 return NULL; 1608 ((PyUnicodeObject *)unicode)->defenc = v; 1609 return v; 1610} 1611 1612PyObject* 1613PyUnicode_DecodeFSDefault(const char *s) { 1614 Py_ssize_t size = (Py_ssize_t)strlen(s); 1615 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1616} 1617 1618PyObject* 1619PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1620{ 1621 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1622 can be undefined. If it is case, decode using UTF-8. The following assumes 1623 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1624 bootstrapping process where the codecs aren't ready yet. 1625 */ 1626 if (Py_FileSystemDefaultEncoding) { 1627#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1628 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1629 return PyUnicode_DecodeMBCS(s, size, NULL); 1630 } 1631#elif defined(__APPLE__) 1632 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1633 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1634 } 1635#endif 1636 return PyUnicode_Decode(s, size, 1637 Py_FileSystemDefaultEncoding, 1638 "surrogateescape"); 1639 } 1640 else { 1641 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1642 } 1643} 1644 1645 1646int 1647PyUnicode_FSConverter(PyObject* arg, void* addr) 1648{ 1649 PyObject *output = NULL; 1650 Py_ssize_t size; 1651 void *data; 1652 if (arg == NULL) { 1653 Py_DECREF(*(PyObject**)addr); 1654 return 1; 1655 } 1656 if (PyBytes_Check(arg)) { 1657 output = arg; 1658 Py_INCREF(output); 1659 } 1660 else { 1661 arg = PyUnicode_FromObject(arg); 1662 if (!arg) 1663 return 0; 1664 output = PyUnicode_EncodeFSDefault(arg); 1665 Py_DECREF(arg); 1666 if (!output) 1667 return 0; 1668 if (!PyBytes_Check(output)) { 1669 Py_DECREF(output); 1670 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1671 return 0; 1672 } 1673 } 1674 size = PyBytes_GET_SIZE(output); 1675 data = PyBytes_AS_STRING(output); 1676 if (size != strlen(data)) { 1677 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1678 Py_DECREF(output); 1679 return 0; 1680 } 1681 *(PyObject**)addr = output; 1682 return Py_CLEANUP_SUPPORTED; 1683} 1684 1685 1686int 1687PyUnicode_FSDecoder(PyObject* arg, void* addr) 1688{ 1689 PyObject *output = NULL; 1690 Py_ssize_t size; 1691 void *data; 1692 if (arg == NULL) { 1693 Py_DECREF(*(PyObject**)addr); 1694 return 1; 1695 } 1696 if (PyUnicode_Check(arg)) { 1697 output = arg; 1698 Py_INCREF(output); 1699 } 1700 else { 1701 arg = PyBytes_FromObject(arg); 1702 if (!arg) 1703 return 0; 1704 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1705 PyBytes_GET_SIZE(arg)); 1706 Py_DECREF(arg); 1707 if (!output) 1708 return 0; 1709 if (!PyUnicode_Check(output)) { 1710 Py_DECREF(output); 1711 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1712 return 0; 1713 } 1714 } 1715 size = PyUnicode_GET_SIZE(output); 1716 data = PyUnicode_AS_UNICODE(output); 1717 if (size != Py_UNICODE_strlen(data)) { 1718 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1719 Py_DECREF(output); 1720 return 0; 1721 } 1722 *(PyObject**)addr = output; 1723 return Py_CLEANUP_SUPPORTED; 1724} 1725 1726 1727char* 1728_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1729{ 1730 PyObject *bytes; 1731 if (!PyUnicode_Check(unicode)) { 1732 PyErr_BadArgument(); 1733 return NULL; 1734 } 1735 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1736 if (bytes == NULL) 1737 return NULL; 1738 if (psize != NULL) 1739 *psize = PyBytes_GET_SIZE(bytes); 1740 return PyBytes_AS_STRING(bytes); 1741} 1742 1743char* 1744_PyUnicode_AsString(PyObject *unicode) 1745{ 1746 return _PyUnicode_AsStringAndSize(unicode, NULL); 1747} 1748 1749Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1750{ 1751 if (!PyUnicode_Check(unicode)) { 1752 PyErr_BadArgument(); 1753 goto onError; 1754 } 1755 return PyUnicode_AS_UNICODE(unicode); 1756 1757 onError: 1758 return NULL; 1759} 1760 1761Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1762{ 1763 if (!PyUnicode_Check(unicode)) { 1764 PyErr_BadArgument(); 1765 goto onError; 1766 } 1767 return PyUnicode_GET_SIZE(unicode); 1768 1769 onError: 1770 return -1; 1771} 1772 1773const char *PyUnicode_GetDefaultEncoding(void) 1774{ 1775 return "utf-8"; 1776} 1777 1778/* create or adjust a UnicodeDecodeError */ 1779static void 1780make_decode_exception(PyObject **exceptionObject, 1781 const char *encoding, 1782 const char *input, Py_ssize_t length, 1783 Py_ssize_t startpos, Py_ssize_t endpos, 1784 const char *reason) 1785{ 1786 if (*exceptionObject == NULL) { 1787 *exceptionObject = PyUnicodeDecodeError_Create( 1788 encoding, input, length, startpos, endpos, reason); 1789 } 1790 else { 1791 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 1792 goto onError; 1793 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 1794 goto onError; 1795 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1796 goto onError; 1797 } 1798 return; 1799 1800onError: 1801 Py_DECREF(*exceptionObject); 1802 *exceptionObject = NULL; 1803} 1804 1805/* error handling callback helper: 1806 build arguments, call the callback and check the arguments, 1807 if no exception occurred, copy the replacement to the output 1808 and adjust various state variables. 1809 return 0 on success, -1 on error 1810*/ 1811 1812static 1813int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1814 const char *encoding, const char *reason, 1815 const char **input, const char **inend, Py_ssize_t *startinpos, 1816 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1817 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1818{ 1819 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 1820 1821 PyObject *restuple = NULL; 1822 PyObject *repunicode = NULL; 1823 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1824 Py_ssize_t insize; 1825 Py_ssize_t requiredsize; 1826 Py_ssize_t newpos; 1827 Py_UNICODE *repptr; 1828 PyObject *inputobj = NULL; 1829 Py_ssize_t repsize; 1830 int res = -1; 1831 1832 if (*errorHandler == NULL) { 1833 *errorHandler = PyCodec_LookupError(errors); 1834 if (*errorHandler == NULL) 1835 goto onError; 1836 } 1837 1838 make_decode_exception(exceptionObject, 1839 encoding, 1840 *input, *inend - *input, 1841 *startinpos, *endinpos, 1842 reason); 1843 if (*exceptionObject == NULL) 1844 goto onError; 1845 1846 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1847 if (restuple == NULL) 1848 goto onError; 1849 if (!PyTuple_Check(restuple)) { 1850 PyErr_SetString(PyExc_TypeError, &argparse[4]); 1851 goto onError; 1852 } 1853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1854 goto onError; 1855 1856 /* Copy back the bytes variables, which might have been modified by the 1857 callback */ 1858 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 1859 if (!inputobj) 1860 goto onError; 1861 if (!PyBytes_Check(inputobj)) { 1862 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 1863 } 1864 *input = PyBytes_AS_STRING(inputobj); 1865 insize = PyBytes_GET_SIZE(inputobj); 1866 *inend = *input + insize; 1867 /* we can DECREF safely, as the exception has another reference, 1868 so the object won't go away. */ 1869 Py_DECREF(inputobj); 1870 1871 if (newpos<0) 1872 newpos = insize+newpos; 1873 if (newpos<0 || newpos>insize) { 1874 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1875 goto onError; 1876 } 1877 1878 /* need more space? (at least enough for what we 1879 have+the replacement+the rest of the string (starting 1880 at the new input position), so we won't have to check space 1881 when there are no errors in the rest of the string) */ 1882 repptr = PyUnicode_AS_UNICODE(repunicode); 1883 repsize = PyUnicode_GET_SIZE(repunicode); 1884 requiredsize = *outpos + repsize + insize-newpos; 1885 if (requiredsize > outsize) { 1886 if (requiredsize<2*outsize) 1887 requiredsize = 2*outsize; 1888 if (_PyUnicode_Resize(output, requiredsize) < 0) 1889 goto onError; 1890 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1891 } 1892 *endinpos = newpos; 1893 *inptr = *input + newpos; 1894 Py_UNICODE_COPY(*outptr, repptr, repsize); 1895 *outptr += repsize; 1896 *outpos += repsize; 1897 1898 /* we made it! */ 1899 res = 0; 1900 1901 onError: 1902 Py_XDECREF(restuple); 1903 return res; 1904} 1905 1906/* --- UTF-7 Codec -------------------------------------------------------- */ 1907 1908/* See RFC2152 for details. We encode conservatively and decode liberally. */ 1909 1910/* Three simple macros defining base-64. */ 1911 1912/* Is c a base-64 character? */ 1913 1914#define IS_BASE64(c) \ 1915 (((c) >= 'A' && (c) <= 'Z') || \ 1916 ((c) >= 'a' && (c) <= 'z') || \ 1917 ((c) >= '0' && (c) <= '9') || \ 1918 (c) == '+' || (c) == '/') 1919 1920/* given that c is a base-64 character, what is its base-64 value? */ 1921 1922#define FROM_BASE64(c) \ 1923 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 1924 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 1925 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 1926 (c) == '+' ? 62 : 63) 1927 1928/* What is the base-64 character of the bottom 6 bits of n? */ 1929 1930#define TO_BASE64(n) \ 1931 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1932 1933/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 1934 * decoded as itself. We are permissive on decoding; the only ASCII 1935 * byte not decoding to itself is the + which begins a base64 1936 * string. */ 1937 1938#define DECODE_DIRECT(c) \ 1939 ((c) <= 127 && (c) != '+') 1940 1941/* The UTF-7 encoder treats ASCII characters differently according to 1942 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 1943 * the above). See RFC2152. This array identifies these different 1944 * sets: 1945 * 0 : "Set D" 1946 * alphanumeric and '(),-./:? 1947 * 1 : "Set O" 1948 * !"#$%&*;<=>@[]^_`{|} 1949 * 2 : "whitespace" 1950 * ht nl cr sp 1951 * 3 : special (must be base64 encoded) 1952 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 1953 */ 1954 1955static 1956char utf7_category[128] = { 1957/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 1958 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 1959/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 1960 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1961/* sp ! " # $ % & ' ( ) * + , - . / */ 1962 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 1963/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 1964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1965/* @ A B C D E F G H I J K L M N O */ 1966 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1967/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 1968 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 1969/* ` a b c d e f g h i j k l m n o */ 1970 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1971/* p q r s t u v w x y z { | } ~ del */ 1972 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 1973}; 1974 1975/* ENCODE_DIRECT: this character should be encoded as itself. The 1976 * answer depends on whether we are encoding set O as itself, and also 1977 * on whether we are encoding whitespace as itself. RFC2152 makes it 1978 * clear that the answers to these questions vary between 1979 * applications, so this code needs to be flexible. */ 1980 1981#define ENCODE_DIRECT(c, directO, directWS) \ 1982 ((c) < 128 && (c) > 0 && \ 1983 ((utf7_category[(c)] == 0) || \ 1984 (directWS && (utf7_category[(c)] == 2)) || \ 1985 (directO && (utf7_category[(c)] == 1)))) 1986 1987PyObject *PyUnicode_DecodeUTF7(const char *s, 1988 Py_ssize_t size, 1989 const char *errors) 1990{ 1991 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1992} 1993 1994/* The decoder. The only state we preserve is our read position, 1995 * i.e. how many characters we have consumed. So if we end in the 1996 * middle of a shift sequence we have to back off the read position 1997 * and the output to the beginning of the sequence, otherwise we lose 1998 * all the shift state (seen bits, number of bits seen, high 1999 * surrogate). */ 2000 2001PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 2002 Py_ssize_t size, 2003 const char *errors, 2004 Py_ssize_t *consumed) 2005{ 2006 const char *starts = s; 2007 Py_ssize_t startinpos; 2008 Py_ssize_t endinpos; 2009 Py_ssize_t outpos; 2010 const char *e; 2011 PyUnicodeObject *unicode; 2012 Py_UNICODE *p; 2013 const char *errmsg = ""; 2014 int inShift = 0; 2015 Py_UNICODE *shiftOutStart; 2016 unsigned int base64bits = 0; 2017 unsigned long base64buffer = 0; 2018 Py_UNICODE surrogate = 0; 2019 PyObject *errorHandler = NULL; 2020 PyObject *exc = NULL; 2021 2022 unicode = _PyUnicode_New(size); 2023 if (!unicode) 2024 return NULL; 2025 if (size == 0) { 2026 if (consumed) 2027 *consumed = 0; 2028 return (PyObject *)unicode; 2029 } 2030 2031 p = unicode->str; 2032 shiftOutStart = p; 2033 e = s + size; 2034 2035 while (s < e) { 2036 Py_UNICODE ch; 2037 restart: 2038 ch = (unsigned char) *s; 2039 2040 if (inShift) { /* in a base-64 section */ 2041 if (IS_BASE64(ch)) { /* consume a base-64 character */ 2042 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 2043 base64bits += 6; 2044 s++; 2045 if (base64bits >= 16) { 2046 /* we have enough bits for a UTF-16 value */ 2047 Py_UNICODE outCh = (Py_UNICODE) 2048 (base64buffer >> (base64bits-16)); 2049 base64bits -= 16; 2050 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 2051 if (surrogate) { 2052 /* expecting a second surrogate */ 2053 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2054#ifdef Py_UNICODE_WIDE 2055 *p++ = (((surrogate & 0x3FF)<<10) 2056 | (outCh & 0x3FF)) + 0x10000; 2057#else 2058 *p++ = surrogate; 2059 *p++ = outCh; 2060#endif 2061 surrogate = 0; 2062 } 2063 else { 2064 surrogate = 0; 2065 errmsg = "second surrogate missing"; 2066 goto utf7Error; 2067 } 2068 } 2069 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 2070 /* first surrogate */ 2071 surrogate = outCh; 2072 } 2073 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2074 errmsg = "unexpected second surrogate"; 2075 goto utf7Error; 2076 } 2077 else { 2078 *p++ = outCh; 2079 } 2080 } 2081 } 2082 else { /* now leaving a base-64 section */ 2083 inShift = 0; 2084 s++; 2085 if (surrogate) { 2086 errmsg = "second surrogate missing at end of shift sequence"; 2087 goto utf7Error; 2088 } 2089 if (base64bits > 0) { /* left-over bits */ 2090 if (base64bits >= 6) { 2091 /* We've seen at least one base-64 character */ 2092 errmsg = "partial character in shift sequence"; 2093 goto utf7Error; 2094 } 2095 else { 2096 /* Some bits remain; they should be zero */ 2097 if (base64buffer != 0) { 2098 errmsg = "non-zero padding bits in shift sequence"; 2099 goto utf7Error; 2100 } 2101 } 2102 } 2103 if (ch != '-') { 2104 /* '-' is absorbed; other terminating 2105 characters are preserved */ 2106 *p++ = ch; 2107 } 2108 } 2109 } 2110 else if ( ch == '+' ) { 2111 startinpos = s-starts; 2112 s++; /* consume '+' */ 2113 if (s < e && *s == '-') { /* '+-' encodes '+' */ 2114 s++; 2115 *p++ = '+'; 2116 } 2117 else { /* begin base64-encoded section */ 2118 inShift = 1; 2119 shiftOutStart = p; 2120 base64bits = 0; 2121 } 2122 } 2123 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 2124 *p++ = ch; 2125 s++; 2126 } 2127 else { 2128 startinpos = s-starts; 2129 s++; 2130 errmsg = "unexpected special character"; 2131 goto utf7Error; 2132 } 2133 continue; 2134utf7Error: 2135 outpos = p-PyUnicode_AS_UNICODE(unicode); 2136 endinpos = s-starts; 2137 if (unicode_decode_call_errorhandler( 2138 errors, &errorHandler, 2139 "utf7", errmsg, 2140 &starts, &e, &startinpos, &endinpos, &exc, &s, 2141 &unicode, &outpos, &p)) 2142 goto onError; 2143 } 2144 2145 /* end of string */ 2146 2147 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 2148 /* if we're in an inconsistent state, that's an error */ 2149 if (surrogate || 2150 (base64bits >= 6) || 2151 (base64bits > 0 && base64buffer != 0)) { 2152 outpos = p-PyUnicode_AS_UNICODE(unicode); 2153 endinpos = size; 2154 if (unicode_decode_call_errorhandler( 2155 errors, &errorHandler, 2156 "utf7", "unterminated shift sequence", 2157 &starts, &e, &startinpos, &endinpos, &exc, &s, 2158 &unicode, &outpos, &p)) 2159 goto onError; 2160 if (s < e) 2161 goto restart; 2162 } 2163 } 2164 2165 /* return state */ 2166 if (consumed) { 2167 if (inShift) { 2168 p = shiftOutStart; /* back off output */ 2169 *consumed = startinpos; 2170 } 2171 else { 2172 *consumed = s-starts; 2173 } 2174 } 2175 2176 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 2177 goto onError; 2178 2179 Py_XDECREF(errorHandler); 2180 Py_XDECREF(exc); 2181 return (PyObject *)unicode; 2182 2183 onError: 2184 Py_XDECREF(errorHandler); 2185 Py_XDECREF(exc); 2186 Py_DECREF(unicode); 2187 return NULL; 2188} 2189 2190 2191PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2192 Py_ssize_t size, 2193 int base64SetO, 2194 int base64WhiteSpace, 2195 const char *errors) 2196{ 2197 PyObject *v; 2198 /* It might be possible to tighten this worst case */ 2199 Py_ssize_t allocated = 8 * size; 2200 int inShift = 0; 2201 Py_ssize_t i = 0; 2202 unsigned int base64bits = 0; 2203 unsigned long base64buffer = 0; 2204 char * out; 2205 char * start; 2206 2207 if (size == 0) 2208 return PyBytes_FromStringAndSize(NULL, 0); 2209 2210 if (allocated / 8 != size) 2211 return PyErr_NoMemory(); 2212 2213 v = PyBytes_FromStringAndSize(NULL, allocated); 2214 if (v == NULL) 2215 return NULL; 2216 2217 start = out = PyBytes_AS_STRING(v); 2218 for (;i < size; ++i) { 2219 Py_UNICODE ch = s[i]; 2220 2221 if (inShift) { 2222 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2223 /* shifting out */ 2224 if (base64bits) { /* output remaining bits */ 2225 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2226 base64buffer = 0; 2227 base64bits = 0; 2228 } 2229 inShift = 0; 2230 /* Characters not in the BASE64 set implicitly unshift the sequence 2231 so no '-' is required, except if the character is itself a '-' */ 2232 if (IS_BASE64(ch) || ch == '-') { 2233 *out++ = '-'; 2234 } 2235 *out++ = (char) ch; 2236 } 2237 else { 2238 goto encode_char; 2239 } 2240 } 2241 else { /* not in a shift sequence */ 2242 if (ch == '+') { 2243 *out++ = '+'; 2244 *out++ = '-'; 2245 } 2246 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2247 *out++ = (char) ch; 2248 } 2249 else { 2250 *out++ = '+'; 2251 inShift = 1; 2252 goto encode_char; 2253 } 2254 } 2255 continue; 2256encode_char: 2257#ifdef Py_UNICODE_WIDE 2258 if (ch >= 0x10000) { 2259 /* code first surrogate */ 2260 base64bits += 16; 2261 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2262 while (base64bits >= 6) { 2263 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2264 base64bits -= 6; 2265 } 2266 /* prepare second surrogate */ 2267 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2268 } 2269#endif 2270 base64bits += 16; 2271 base64buffer = (base64buffer << 16) | ch; 2272 while (base64bits >= 6) { 2273 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2274 base64bits -= 6; 2275 } 2276 } 2277 if (base64bits) 2278 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2279 if (inShift) 2280 *out++ = '-'; 2281 if (_PyBytes_Resize(&v, out - start) < 0) 2282 return NULL; 2283 return v; 2284} 2285 2286#undef IS_BASE64 2287#undef FROM_BASE64 2288#undef TO_BASE64 2289#undef DECODE_DIRECT 2290#undef ENCODE_DIRECT 2291 2292/* --- UTF-8 Codec -------------------------------------------------------- */ 2293 2294static 2295char utf8_code_length[256] = { 2296 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2297 illegal prefix. See RFC 3629 for details */ 2298 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2299 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2300 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2301 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2302 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2303 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2304 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2305 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2306 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2307 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2308 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2309 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2310 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2311 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2312 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2313 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2314}; 2315 2316PyObject *PyUnicode_DecodeUTF8(const char *s, 2317 Py_ssize_t size, 2318 const char *errors) 2319{ 2320 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2321} 2322 2323/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2324#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2325 2326/* Mask to quickly check whether a C 'long' contains a 2327 non-ASCII, UTF8-encoded char. */ 2328#if (SIZEOF_LONG == 8) 2329# define ASCII_CHAR_MASK 0x8080808080808080L 2330#elif (SIZEOF_LONG == 4) 2331# define ASCII_CHAR_MASK 0x80808080L 2332#else 2333# error C 'long' size should be either 4 or 8! 2334#endif 2335 2336PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2337 Py_ssize_t size, 2338 const char *errors, 2339 Py_ssize_t *consumed) 2340{ 2341 const char *starts = s; 2342 int n; 2343 int k; 2344 Py_ssize_t startinpos; 2345 Py_ssize_t endinpos; 2346 Py_ssize_t outpos; 2347 const char *e, *aligned_end; 2348 PyUnicodeObject *unicode; 2349 Py_UNICODE *p; 2350 const char *errmsg = ""; 2351 PyObject *errorHandler = NULL; 2352 PyObject *exc = NULL; 2353 2354 /* Note: size will always be longer than the resulting Unicode 2355 character count */ 2356 unicode = _PyUnicode_New(size); 2357 if (!unicode) 2358 return NULL; 2359 if (size == 0) { 2360 if (consumed) 2361 *consumed = 0; 2362 return (PyObject *)unicode; 2363 } 2364 2365 /* Unpack UTF-8 encoded data */ 2366 p = unicode->str; 2367 e = s + size; 2368 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2369 2370 while (s < e) { 2371 Py_UCS4 ch = (unsigned char)*s; 2372 2373 if (ch < 0x80) { 2374 /* Fast path for runs of ASCII characters. Given that common UTF-8 2375 input will consist of an overwhelming majority of ASCII 2376 characters, we try to optimize for this case by checking 2377 as many characters as a C 'long' can contain. 2378 First, check if we can do an aligned read, as most CPUs have 2379 a penalty for unaligned reads. 2380 */ 2381 if (!((size_t) s & LONG_PTR_MASK)) { 2382 /* Help register allocation */ 2383 register const char *_s = s; 2384 register Py_UNICODE *_p = p; 2385 while (_s < aligned_end) { 2386 /* Read a whole long at a time (either 4 or 8 bytes), 2387 and do a fast unrolled copy if it only contains ASCII 2388 characters. */ 2389 unsigned long data = *(unsigned long *) _s; 2390 if (data & ASCII_CHAR_MASK) 2391 break; 2392 _p[0] = (unsigned char) _s[0]; 2393 _p[1] = (unsigned char) _s[1]; 2394 _p[2] = (unsigned char) _s[2]; 2395 _p[3] = (unsigned char) _s[3]; 2396#if (SIZEOF_LONG == 8) 2397 _p[4] = (unsigned char) _s[4]; 2398 _p[5] = (unsigned char) _s[5]; 2399 _p[6] = (unsigned char) _s[6]; 2400 _p[7] = (unsigned char) _s[7]; 2401#endif 2402 _s += SIZEOF_LONG; 2403 _p += SIZEOF_LONG; 2404 } 2405 s = _s; 2406 p = _p; 2407 if (s == e) 2408 break; 2409 ch = (unsigned char)*s; 2410 } 2411 } 2412 2413 if (ch < 0x80) { 2414 *p++ = (Py_UNICODE)ch; 2415 s++; 2416 continue; 2417 } 2418 2419 n = utf8_code_length[ch]; 2420 2421 if (s + n > e) { 2422 if (consumed) 2423 break; 2424 else { 2425 errmsg = "unexpected end of data"; 2426 startinpos = s-starts; 2427 endinpos = startinpos+1; 2428 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2429 endinpos++; 2430 goto utf8Error; 2431 } 2432 } 2433 2434 switch (n) { 2435 2436 case 0: 2437 errmsg = "invalid start byte"; 2438 startinpos = s-starts; 2439 endinpos = startinpos+1; 2440 goto utf8Error; 2441 2442 case 1: 2443 errmsg = "internal error"; 2444 startinpos = s-starts; 2445 endinpos = startinpos+1; 2446 goto utf8Error; 2447 2448 case 2: 2449 if ((s[1] & 0xc0) != 0x80) { 2450 errmsg = "invalid continuation byte"; 2451 startinpos = s-starts; 2452 endinpos = startinpos + 1; 2453 goto utf8Error; 2454 } 2455 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2456 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2457 *p++ = (Py_UNICODE)ch; 2458 break; 2459 2460 case 3: 2461 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2462 will result in surrogates in range d800-dfff. Surrogates are 2463 not valid UTF-8 so they are rejected. 2464 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2465 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2466 if ((s[1] & 0xc0) != 0x80 || 2467 (s[2] & 0xc0) != 0x80 || 2468 ((unsigned char)s[0] == 0xE0 && 2469 (unsigned char)s[1] < 0xA0) || 2470 ((unsigned char)s[0] == 0xED && 2471 (unsigned char)s[1] > 0x9F)) { 2472 errmsg = "invalid continuation byte"; 2473 startinpos = s-starts; 2474 endinpos = startinpos + 1; 2475 2476 /* if s[1] first two bits are 1 and 0, then the invalid 2477 continuation byte is s[2], so increment endinpos by 1, 2478 if not, s[1] is invalid and endinpos doesn't need to 2479 be incremented. */ 2480 if ((s[1] & 0xC0) == 0x80) 2481 endinpos++; 2482 goto utf8Error; 2483 } 2484 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2485 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2486 *p++ = (Py_UNICODE)ch; 2487 break; 2488 2489 case 4: 2490 if ((s[1] & 0xc0) != 0x80 || 2491 (s[2] & 0xc0) != 0x80 || 2492 (s[3] & 0xc0) != 0x80 || 2493 ((unsigned char)s[0] == 0xF0 && 2494 (unsigned char)s[1] < 0x90) || 2495 ((unsigned char)s[0] == 0xF4 && 2496 (unsigned char)s[1] > 0x8F)) { 2497 errmsg = "invalid continuation byte"; 2498 startinpos = s-starts; 2499 endinpos = startinpos + 1; 2500 if ((s[1] & 0xC0) == 0x80) { 2501 endinpos++; 2502 if ((s[2] & 0xC0) == 0x80) 2503 endinpos++; 2504 } 2505 goto utf8Error; 2506 } 2507 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2508 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2509 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2510 2511#ifdef Py_UNICODE_WIDE 2512 *p++ = (Py_UNICODE)ch; 2513#else 2514 /* compute and append the two surrogates: */ 2515 2516 /* translate from 10000..10FFFF to 0..FFFF */ 2517 ch -= 0x10000; 2518 2519 /* high surrogate = top 10 bits added to D800 */ 2520 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2521 2522 /* low surrogate = bottom 10 bits added to DC00 */ 2523 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2524#endif 2525 break; 2526 } 2527 s += n; 2528 continue; 2529 2530 utf8Error: 2531 outpos = p-PyUnicode_AS_UNICODE(unicode); 2532 if (unicode_decode_call_errorhandler( 2533 errors, &errorHandler, 2534 "utf8", errmsg, 2535 &starts, &e, &startinpos, &endinpos, &exc, &s, 2536 &unicode, &outpos, &p)) 2537 goto onError; 2538 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2539 } 2540 if (consumed) 2541 *consumed = s-starts; 2542 2543 /* Adjust length */ 2544 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2545 goto onError; 2546 2547 Py_XDECREF(errorHandler); 2548 Py_XDECREF(exc); 2549 return (PyObject *)unicode; 2550 2551 onError: 2552 Py_XDECREF(errorHandler); 2553 Py_XDECREF(exc); 2554 Py_DECREF(unicode); 2555 return NULL; 2556} 2557 2558#undef ASCII_CHAR_MASK 2559 2560 2561/* Allocation strategy: if the string is short, convert into a stack buffer 2562 and allocate exactly as much space needed at the end. Else allocate the 2563 maximum possible needed (4 result bytes per Unicode character), and return 2564 the excess memory at the end. 2565*/ 2566PyObject * 2567PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2568 Py_ssize_t size, 2569 const char *errors) 2570{ 2571#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2572 2573 Py_ssize_t i; /* index into s of next input byte */ 2574 PyObject *result; /* result string object */ 2575 char *p; /* next free byte in output buffer */ 2576 Py_ssize_t nallocated; /* number of result bytes allocated */ 2577 Py_ssize_t nneeded; /* number of result bytes needed */ 2578 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2579 PyObject *errorHandler = NULL; 2580 PyObject *exc = NULL; 2581 2582 assert(s != NULL); 2583 assert(size >= 0); 2584 2585 if (size <= MAX_SHORT_UNICHARS) { 2586 /* Write into the stack buffer; nallocated can't overflow. 2587 * At the end, we'll allocate exactly as much heap space as it 2588 * turns out we need. 2589 */ 2590 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2591 result = NULL; /* will allocate after we're done */ 2592 p = stackbuf; 2593 } 2594 else { 2595 /* Overallocate on the heap, and give the excess back at the end. */ 2596 nallocated = size * 4; 2597 if (nallocated / 4 != size) /* overflow! */ 2598 return PyErr_NoMemory(); 2599 result = PyBytes_FromStringAndSize(NULL, nallocated); 2600 if (result == NULL) 2601 return NULL; 2602 p = PyBytes_AS_STRING(result); 2603 } 2604 2605 for (i = 0; i < size;) { 2606 Py_UCS4 ch = s[i++]; 2607 2608 if (ch < 0x80) 2609 /* Encode ASCII */ 2610 *p++ = (char) ch; 2611 2612 else if (ch < 0x0800) { 2613 /* Encode Latin-1 */ 2614 *p++ = (char)(0xc0 | (ch >> 6)); 2615 *p++ = (char)(0x80 | (ch & 0x3f)); 2616 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2617#ifndef Py_UNICODE_WIDE 2618 /* Special case: check for high and low surrogate */ 2619 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2620 Py_UCS4 ch2 = s[i]; 2621 /* Combine the two surrogates to form a UCS4 value */ 2622 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2623 i++; 2624 2625 /* Encode UCS4 Unicode ordinals */ 2626 *p++ = (char)(0xf0 | (ch >> 18)); 2627 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2628 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2629 *p++ = (char)(0x80 | (ch & 0x3f)); 2630 } else { 2631#endif 2632 Py_ssize_t newpos; 2633 PyObject *rep; 2634 Py_ssize_t repsize, k; 2635 rep = unicode_encode_call_errorhandler 2636 (errors, &errorHandler, "utf-8", "surrogates not allowed", 2637 s, size, &exc, i-1, i, &newpos); 2638 if (!rep) 2639 goto error; 2640 2641 if (PyBytes_Check(rep)) 2642 repsize = PyBytes_GET_SIZE(rep); 2643 else 2644 repsize = PyUnicode_GET_SIZE(rep); 2645 2646 if (repsize > 4) { 2647 Py_ssize_t offset; 2648 2649 if (result == NULL) 2650 offset = p - stackbuf; 2651 else 2652 offset = p - PyBytes_AS_STRING(result); 2653 2654 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 2655 /* integer overflow */ 2656 PyErr_NoMemory(); 2657 goto error; 2658 } 2659 nallocated += repsize - 4; 2660 if (result != NULL) { 2661 if (_PyBytes_Resize(&result, nallocated) < 0) 2662 goto error; 2663 } else { 2664 result = PyBytes_FromStringAndSize(NULL, nallocated); 2665 if (result == NULL) 2666 goto error; 2667 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 2668 } 2669 p = PyBytes_AS_STRING(result) + offset; 2670 } 2671 2672 if (PyBytes_Check(rep)) { 2673 char *prep = PyBytes_AS_STRING(rep); 2674 for(k = repsize; k > 0; k--) 2675 *p++ = *prep++; 2676 } else /* rep is unicode */ { 2677 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 2678 Py_UNICODE c; 2679 2680 for(k=0; k<repsize; k++) { 2681 c = prep[k]; 2682 if (0x80 <= c) { 2683 raise_encode_exception(&exc, "utf-8", s, size, 2684 i-1, i, "surrogates not allowed"); 2685 goto error; 2686 } 2687 *p++ = (char)prep[k]; 2688 } 2689 } 2690 Py_DECREF(rep); 2691#ifndef Py_UNICODE_WIDE 2692 } 2693#endif 2694 } else if (ch < 0x10000) { 2695 *p++ = (char)(0xe0 | (ch >> 12)); 2696 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2697 *p++ = (char)(0x80 | (ch & 0x3f)); 2698 } else /* ch >= 0x10000 */ { 2699 /* Encode UCS4 Unicode ordinals */ 2700 *p++ = (char)(0xf0 | (ch >> 18)); 2701 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2702 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2703 *p++ = (char)(0x80 | (ch & 0x3f)); 2704 } 2705 } 2706 2707 if (result == NULL) { 2708 /* This was stack allocated. */ 2709 nneeded = p - stackbuf; 2710 assert(nneeded <= nallocated); 2711 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2712 } 2713 else { 2714 /* Cut back to size actually needed. */ 2715 nneeded = p - PyBytes_AS_STRING(result); 2716 assert(nneeded <= nallocated); 2717 _PyBytes_Resize(&result, nneeded); 2718 } 2719 Py_XDECREF(errorHandler); 2720 Py_XDECREF(exc); 2721 return result; 2722 error: 2723 Py_XDECREF(errorHandler); 2724 Py_XDECREF(exc); 2725 Py_XDECREF(result); 2726 return NULL; 2727 2728#undef MAX_SHORT_UNICHARS 2729} 2730 2731PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2732{ 2733 if (!PyUnicode_Check(unicode)) { 2734 PyErr_BadArgument(); 2735 return NULL; 2736 } 2737 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2738 PyUnicode_GET_SIZE(unicode), 2739 NULL); 2740} 2741 2742/* --- UTF-32 Codec ------------------------------------------------------- */ 2743 2744PyObject * 2745PyUnicode_DecodeUTF32(const char *s, 2746 Py_ssize_t size, 2747 const char *errors, 2748 int *byteorder) 2749{ 2750 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2751} 2752 2753PyObject * 2754PyUnicode_DecodeUTF32Stateful(const char *s, 2755 Py_ssize_t size, 2756 const char *errors, 2757 int *byteorder, 2758 Py_ssize_t *consumed) 2759{ 2760 const char *starts = s; 2761 Py_ssize_t startinpos; 2762 Py_ssize_t endinpos; 2763 Py_ssize_t outpos; 2764 PyUnicodeObject *unicode; 2765 Py_UNICODE *p; 2766#ifndef Py_UNICODE_WIDE 2767 int pairs = 0; 2768 const unsigned char *qq; 2769#else 2770 const int pairs = 0; 2771#endif 2772 const unsigned char *q, *e; 2773 int bo = 0; /* assume native ordering by default */ 2774 const char *errmsg = ""; 2775 /* Offsets from q for retrieving bytes in the right order. */ 2776#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2777 int iorder[] = {0, 1, 2, 3}; 2778#else 2779 int iorder[] = {3, 2, 1, 0}; 2780#endif 2781 PyObject *errorHandler = NULL; 2782 PyObject *exc = NULL; 2783 2784 q = (unsigned char *)s; 2785 e = q + size; 2786 2787 if (byteorder) 2788 bo = *byteorder; 2789 2790 /* Check for BOM marks (U+FEFF) in the input and adjust current 2791 byte order setting accordingly. In native mode, the leading BOM 2792 mark is skipped, in all other modes, it is copied to the output 2793 stream as-is (giving a ZWNBSP character). */ 2794 if (bo == 0) { 2795 if (size >= 4) { 2796 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2797 (q[iorder[1]] << 8) | q[iorder[0]]; 2798#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2799 if (bom == 0x0000FEFF) { 2800 q += 4; 2801 bo = -1; 2802 } 2803 else if (bom == 0xFFFE0000) { 2804 q += 4; 2805 bo = 1; 2806 } 2807#else 2808 if (bom == 0x0000FEFF) { 2809 q += 4; 2810 bo = 1; 2811 } 2812 else if (bom == 0xFFFE0000) { 2813 q += 4; 2814 bo = -1; 2815 } 2816#endif 2817 } 2818 } 2819 2820 if (bo == -1) { 2821 /* force LE */ 2822 iorder[0] = 0; 2823 iorder[1] = 1; 2824 iorder[2] = 2; 2825 iorder[3] = 3; 2826 } 2827 else if (bo == 1) { 2828 /* force BE */ 2829 iorder[0] = 3; 2830 iorder[1] = 2; 2831 iorder[2] = 1; 2832 iorder[3] = 0; 2833 } 2834 2835 /* On narrow builds we split characters outside the BMP into two 2836 codepoints => count how much extra space we need. */ 2837#ifndef Py_UNICODE_WIDE 2838 for (qq = q; qq < e; qq += 4) 2839 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 2840 pairs++; 2841#endif 2842 2843 /* This might be one to much, because of a BOM */ 2844 unicode = _PyUnicode_New((size+3)/4+pairs); 2845 if (!unicode) 2846 return NULL; 2847 if (size == 0) 2848 return (PyObject *)unicode; 2849 2850 /* Unpack UTF-32 encoded data */ 2851 p = unicode->str; 2852 2853 while (q < e) { 2854 Py_UCS4 ch; 2855 /* remaining bytes at the end? (size should be divisible by 4) */ 2856 if (e-q<4) { 2857 if (consumed) 2858 break; 2859 errmsg = "truncated data"; 2860 startinpos = ((const char *)q)-starts; 2861 endinpos = ((const char *)e)-starts; 2862 goto utf32Error; 2863 /* The remaining input chars are ignored if the callback 2864 chooses to skip the input */ 2865 } 2866 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2867 (q[iorder[1]] << 8) | q[iorder[0]]; 2868 2869 if (ch >= 0x110000) 2870 { 2871 errmsg = "codepoint not in range(0x110000)"; 2872 startinpos = ((const char *)q)-starts; 2873 endinpos = startinpos+4; 2874 goto utf32Error; 2875 } 2876#ifndef Py_UNICODE_WIDE 2877 if (ch >= 0x10000) 2878 { 2879 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2880 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2881 } 2882 else 2883#endif 2884 *p++ = ch; 2885 q += 4; 2886 continue; 2887 utf32Error: 2888 outpos = p-PyUnicode_AS_UNICODE(unicode); 2889 if (unicode_decode_call_errorhandler( 2890 errors, &errorHandler, 2891 "utf32", errmsg, 2892 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 2893 &unicode, &outpos, &p)) 2894 goto onError; 2895 } 2896 2897 if (byteorder) 2898 *byteorder = bo; 2899 2900 if (consumed) 2901 *consumed = (const char *)q-starts; 2902 2903 /* Adjust length */ 2904 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2905 goto onError; 2906 2907 Py_XDECREF(errorHandler); 2908 Py_XDECREF(exc); 2909 return (PyObject *)unicode; 2910 2911 onError: 2912 Py_DECREF(unicode); 2913 Py_XDECREF(errorHandler); 2914 Py_XDECREF(exc); 2915 return NULL; 2916} 2917 2918PyObject * 2919PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2920 Py_ssize_t size, 2921 const char *errors, 2922 int byteorder) 2923{ 2924 PyObject *v; 2925 unsigned char *p; 2926 Py_ssize_t nsize, bytesize; 2927#ifndef Py_UNICODE_WIDE 2928 Py_ssize_t i, pairs; 2929#else 2930 const int pairs = 0; 2931#endif 2932 /* Offsets from p for storing byte pairs in the right order. */ 2933#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2934 int iorder[] = {0, 1, 2, 3}; 2935#else 2936 int iorder[] = {3, 2, 1, 0}; 2937#endif 2938 2939#define STORECHAR(CH) \ 2940 do { \ 2941 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2942 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2943 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2944 p[iorder[0]] = (CH) & 0xff; \ 2945 p += 4; \ 2946 } while(0) 2947 2948 /* In narrow builds we can output surrogate pairs as one codepoint, 2949 so we need less space. */ 2950#ifndef Py_UNICODE_WIDE 2951 for (i = pairs = 0; i < size-1; i++) 2952 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2953 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2954 pairs++; 2955#endif 2956 nsize = (size - pairs + (byteorder == 0)); 2957 bytesize = nsize * 4; 2958 if (bytesize / 4 != nsize) 2959 return PyErr_NoMemory(); 2960 v = PyBytes_FromStringAndSize(NULL, bytesize); 2961 if (v == NULL) 2962 return NULL; 2963 2964 p = (unsigned char *)PyBytes_AS_STRING(v); 2965 if (byteorder == 0) 2966 STORECHAR(0xFEFF); 2967 if (size == 0) 2968 goto done; 2969 2970 if (byteorder == -1) { 2971 /* force LE */ 2972 iorder[0] = 0; 2973 iorder[1] = 1; 2974 iorder[2] = 2; 2975 iorder[3] = 3; 2976 } 2977 else if (byteorder == 1) { 2978 /* force BE */ 2979 iorder[0] = 3; 2980 iorder[1] = 2; 2981 iorder[2] = 1; 2982 iorder[3] = 0; 2983 } 2984 2985 while (size-- > 0) { 2986 Py_UCS4 ch = *s++; 2987#ifndef Py_UNICODE_WIDE 2988 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2989 Py_UCS4 ch2 = *s; 2990 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2991 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2992 s++; 2993 size--; 2994 } 2995 } 2996#endif 2997 STORECHAR(ch); 2998 } 2999 3000 done: 3001 return v; 3002#undef STORECHAR 3003} 3004 3005PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 3006{ 3007 if (!PyUnicode_Check(unicode)) { 3008 PyErr_BadArgument(); 3009 return NULL; 3010 } 3011 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3012 PyUnicode_GET_SIZE(unicode), 3013 NULL, 3014 0); 3015} 3016 3017/* --- UTF-16 Codec ------------------------------------------------------- */ 3018 3019PyObject * 3020PyUnicode_DecodeUTF16(const char *s, 3021 Py_ssize_t size, 3022 const char *errors, 3023 int *byteorder) 3024{ 3025 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3026} 3027 3028/* Two masks for fast checking of whether a C 'long' may contain 3029 UTF16-encoded surrogate characters. This is an efficient heuristic, 3030 assuming that non-surrogate characters with a code point >= 0x8000 are 3031 rare in most input. 3032 FAST_CHAR_MASK is used when the input is in native byte ordering, 3033 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3034*/ 3035#if (SIZEOF_LONG == 8) 3036# define FAST_CHAR_MASK 0x8000800080008000L 3037# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3038#elif (SIZEOF_LONG == 4) 3039# define FAST_CHAR_MASK 0x80008000L 3040# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3041#else 3042# error C 'long' size should be either 4 or 8! 3043#endif 3044 3045PyObject * 3046PyUnicode_DecodeUTF16Stateful(const char *s, 3047 Py_ssize_t size, 3048 const char *errors, 3049 int *byteorder, 3050 Py_ssize_t *consumed) 3051{ 3052 const char *starts = s; 3053 Py_ssize_t startinpos; 3054 Py_ssize_t endinpos; 3055 Py_ssize_t outpos; 3056 PyUnicodeObject *unicode; 3057 Py_UNICODE *p; 3058 const unsigned char *q, *e, *aligned_end; 3059 int bo = 0; /* assume native ordering by default */ 3060 int native_ordering = 0; 3061 const char *errmsg = ""; 3062 /* Offsets from q for retrieving byte pairs in the right order. */ 3063#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3064 int ihi = 1, ilo = 0; 3065#else 3066 int ihi = 0, ilo = 1; 3067#endif 3068 PyObject *errorHandler = NULL; 3069 PyObject *exc = NULL; 3070 3071 /* Note: size will always be longer than the resulting Unicode 3072 character count */ 3073 unicode = _PyUnicode_New(size); 3074 if (!unicode) 3075 return NULL; 3076 if (size == 0) 3077 return (PyObject *)unicode; 3078 3079 /* Unpack UTF-16 encoded data */ 3080 p = unicode->str; 3081 q = (unsigned char *)s; 3082 e = q + size - 1; 3083 3084 if (byteorder) 3085 bo = *byteorder; 3086 3087 /* Check for BOM marks (U+FEFF) in the input and adjust current 3088 byte order setting accordingly. In native mode, the leading BOM 3089 mark is skipped, in all other modes, it is copied to the output 3090 stream as-is (giving a ZWNBSP character). */ 3091 if (bo == 0) { 3092 if (size >= 2) { 3093 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3094#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3095 if (bom == 0xFEFF) { 3096 q += 2; 3097 bo = -1; 3098 } 3099 else if (bom == 0xFFFE) { 3100 q += 2; 3101 bo = 1; 3102 } 3103#else 3104 if (bom == 0xFEFF) { 3105 q += 2; 3106 bo = 1; 3107 } 3108 else if (bom == 0xFFFE) { 3109 q += 2; 3110 bo = -1; 3111 } 3112#endif 3113 } 3114 } 3115 3116 if (bo == -1) { 3117 /* force LE */ 3118 ihi = 1; 3119 ilo = 0; 3120 } 3121 else if (bo == 1) { 3122 /* force BE */ 3123 ihi = 0; 3124 ilo = 1; 3125 } 3126#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3127 native_ordering = ilo < ihi; 3128#else 3129 native_ordering = ilo > ihi; 3130#endif 3131 3132 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3133 while (q < e) { 3134 Py_UNICODE ch; 3135 /* First check for possible aligned read of a C 'long'. Unaligned 3136 reads are more expensive, better to defer to another iteration. */ 3137 if (!((size_t) q & LONG_PTR_MASK)) { 3138 /* Fast path for runs of non-surrogate chars. */ 3139 register const unsigned char *_q = q; 3140 Py_UNICODE *_p = p; 3141 if (native_ordering) { 3142 /* Native ordering is simple: as long as the input cannot 3143 possibly contain a surrogate char, do an unrolled copy 3144 of several 16-bit code points to the target object. 3145 The non-surrogate check is done on several input bytes 3146 at a time (as many as a C 'long' can contain). */ 3147 while (_q < aligned_end) { 3148 unsigned long data = * (unsigned long *) _q; 3149 if (data & FAST_CHAR_MASK) 3150 break; 3151 _p[0] = ((unsigned short *) _q)[0]; 3152 _p[1] = ((unsigned short *) _q)[1]; 3153#if (SIZEOF_LONG == 8) 3154 _p[2] = ((unsigned short *) _q)[2]; 3155 _p[3] = ((unsigned short *) _q)[3]; 3156#endif 3157 _q += SIZEOF_LONG; 3158 _p += SIZEOF_LONG / 2; 3159 } 3160 } 3161 else { 3162 /* Byteswapped ordering is similar, but we must decompose 3163 the copy bytewise, and take care of zero'ing out the 3164 upper bytes if the target object is in 32-bit units 3165 (that is, in UCS-4 builds). */ 3166 while (_q < aligned_end) { 3167 unsigned long data = * (unsigned long *) _q; 3168 if (data & SWAPPED_FAST_CHAR_MASK) 3169 break; 3170 /* Zero upper bytes in UCS-4 builds */ 3171#if (Py_UNICODE_SIZE > 2) 3172 _p[0] = 0; 3173 _p[1] = 0; 3174#if (SIZEOF_LONG == 8) 3175 _p[2] = 0; 3176 _p[3] = 0; 3177#endif 3178#endif 3179 /* Issue #4916; UCS-4 builds on big endian machines must 3180 fill the two last bytes of each 4-byte unit. */ 3181#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3182# define OFF 2 3183#else 3184# define OFF 0 3185#endif 3186 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3187 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3188 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3189 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3190#if (SIZEOF_LONG == 8) 3191 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3192 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3193 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3194 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3195#endif 3196#undef OFF 3197 _q += SIZEOF_LONG; 3198 _p += SIZEOF_LONG / 2; 3199 } 3200 } 3201 p = _p; 3202 q = _q; 3203 if (q >= e) 3204 break; 3205 } 3206 ch = (q[ihi] << 8) | q[ilo]; 3207 3208 q += 2; 3209 3210 if (ch < 0xD800 || ch > 0xDFFF) { 3211 *p++ = ch; 3212 continue; 3213 } 3214 3215 /* UTF-16 code pair: */ 3216 if (q > e) { 3217 errmsg = "unexpected end of data"; 3218 startinpos = (((const char *)q) - 2) - starts; 3219 endinpos = ((const char *)e) + 1 - starts; 3220 goto utf16Error; 3221 } 3222 if (0xD800 <= ch && ch <= 0xDBFF) { 3223 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3224 q += 2; 3225 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3226#ifndef Py_UNICODE_WIDE 3227 *p++ = ch; 3228 *p++ = ch2; 3229#else 3230 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3231#endif 3232 continue; 3233 } 3234 else { 3235 errmsg = "illegal UTF-16 surrogate"; 3236 startinpos = (((const char *)q)-4)-starts; 3237 endinpos = startinpos+2; 3238 goto utf16Error; 3239 } 3240 3241 } 3242 errmsg = "illegal encoding"; 3243 startinpos = (((const char *)q)-2)-starts; 3244 endinpos = startinpos+2; 3245 /* Fall through to report the error */ 3246 3247 utf16Error: 3248 outpos = p - PyUnicode_AS_UNICODE(unicode); 3249 if (unicode_decode_call_errorhandler( 3250 errors, 3251 &errorHandler, 3252 "utf16", errmsg, 3253 &starts, 3254 (const char **)&e, 3255 &startinpos, 3256 &endinpos, 3257 &exc, 3258 (const char **)&q, 3259 &unicode, 3260 &outpos, 3261 &p)) 3262 goto onError; 3263 } 3264 /* remaining byte at the end? (size should be even) */ 3265 if (e == q) { 3266 if (!consumed) { 3267 errmsg = "truncated data"; 3268 startinpos = ((const char *)q) - starts; 3269 endinpos = ((const char *)e) + 1 - starts; 3270 outpos = p - PyUnicode_AS_UNICODE(unicode); 3271 if (unicode_decode_call_errorhandler( 3272 errors, 3273 &errorHandler, 3274 "utf16", errmsg, 3275 &starts, 3276 (const char **)&e, 3277 &startinpos, 3278 &endinpos, 3279 &exc, 3280 (const char **)&q, 3281 &unicode, 3282 &outpos, 3283 &p)) 3284 goto onError; 3285 /* The remaining input chars are ignored if the callback 3286 chooses to skip the input */ 3287 } 3288 } 3289 3290 if (byteorder) 3291 *byteorder = bo; 3292 3293 if (consumed) 3294 *consumed = (const char *)q-starts; 3295 3296 /* Adjust length */ 3297 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3298 goto onError; 3299 3300 Py_XDECREF(errorHandler); 3301 Py_XDECREF(exc); 3302 return (PyObject *)unicode; 3303 3304 onError: 3305 Py_DECREF(unicode); 3306 Py_XDECREF(errorHandler); 3307 Py_XDECREF(exc); 3308 return NULL; 3309} 3310 3311#undef FAST_CHAR_MASK 3312#undef SWAPPED_FAST_CHAR_MASK 3313 3314PyObject * 3315PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3316 Py_ssize_t size, 3317 const char *errors, 3318 int byteorder) 3319{ 3320 PyObject *v; 3321 unsigned char *p; 3322 Py_ssize_t nsize, bytesize; 3323#ifdef Py_UNICODE_WIDE 3324 Py_ssize_t i, pairs; 3325#else 3326 const int pairs = 0; 3327#endif 3328 /* Offsets from p for storing byte pairs in the right order. */ 3329#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3330 int ihi = 1, ilo = 0; 3331#else 3332 int ihi = 0, ilo = 1; 3333#endif 3334 3335#define STORECHAR(CH) \ 3336 do { \ 3337 p[ihi] = ((CH) >> 8) & 0xff; \ 3338 p[ilo] = (CH) & 0xff; \ 3339 p += 2; \ 3340 } while(0) 3341 3342#ifdef Py_UNICODE_WIDE 3343 for (i = pairs = 0; i < size; i++) 3344 if (s[i] >= 0x10000) 3345 pairs++; 3346#endif 3347 /* 2 * (size + pairs + (byteorder == 0)) */ 3348 if (size > PY_SSIZE_T_MAX || 3349 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3350 return PyErr_NoMemory(); 3351 nsize = size + pairs + (byteorder == 0); 3352 bytesize = nsize * 2; 3353 if (bytesize / 2 != nsize) 3354 return PyErr_NoMemory(); 3355 v = PyBytes_FromStringAndSize(NULL, bytesize); 3356 if (v == NULL) 3357 return NULL; 3358 3359 p = (unsigned char *)PyBytes_AS_STRING(v); 3360 if (byteorder == 0) 3361 STORECHAR(0xFEFF); 3362 if (size == 0) 3363 goto done; 3364 3365 if (byteorder == -1) { 3366 /* force LE */ 3367 ihi = 1; 3368 ilo = 0; 3369 } 3370 else if (byteorder == 1) { 3371 /* force BE */ 3372 ihi = 0; 3373 ilo = 1; 3374 } 3375 3376 while (size-- > 0) { 3377 Py_UNICODE ch = *s++; 3378 Py_UNICODE ch2 = 0; 3379#ifdef Py_UNICODE_WIDE 3380 if (ch >= 0x10000) { 3381 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3382 ch = 0xD800 | ((ch-0x10000) >> 10); 3383 } 3384#endif 3385 STORECHAR(ch); 3386 if (ch2) 3387 STORECHAR(ch2); 3388 } 3389 3390 done: 3391 return v; 3392#undef STORECHAR 3393} 3394 3395PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3396{ 3397 if (!PyUnicode_Check(unicode)) { 3398 PyErr_BadArgument(); 3399 return NULL; 3400 } 3401 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3402 PyUnicode_GET_SIZE(unicode), 3403 NULL, 3404 0); 3405} 3406 3407/* --- Unicode Escape Codec ----------------------------------------------- */ 3408 3409static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3410 3411PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 3412 Py_ssize_t size, 3413 const char *errors) 3414{ 3415 const char *starts = s; 3416 Py_ssize_t startinpos; 3417 Py_ssize_t endinpos; 3418 Py_ssize_t outpos; 3419 int i; 3420 PyUnicodeObject *v; 3421 Py_UNICODE *p; 3422 const char *end; 3423 char* message; 3424 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3425 PyObject *errorHandler = NULL; 3426 PyObject *exc = NULL; 3427 3428 /* Escaped strings will always be longer than the resulting 3429 Unicode string, so we start with size here and then reduce the 3430 length after conversion to the true value. 3431 (but if the error callback returns a long replacement string 3432 we'll have to allocate more space) */ 3433 v = _PyUnicode_New(size); 3434 if (v == NULL) 3435 goto onError; 3436 if (size == 0) 3437 return (PyObject *)v; 3438 3439 p = PyUnicode_AS_UNICODE(v); 3440 end = s + size; 3441 3442 while (s < end) { 3443 unsigned char c; 3444 Py_UNICODE x; 3445 int digits; 3446 3447 /* Non-escape characters are interpreted as Unicode ordinals */ 3448 if (*s != '\\') { 3449 *p++ = (unsigned char) *s++; 3450 continue; 3451 } 3452 3453 startinpos = s-starts; 3454 /* \ - Escapes */ 3455 s++; 3456 c = *s++; 3457 if (s > end) 3458 c = '\0'; /* Invalid after \ */ 3459 switch (c) { 3460 3461 /* \x escapes */ 3462 case '\n': break; 3463 case '\\': *p++ = '\\'; break; 3464 case '\'': *p++ = '\''; break; 3465 case '\"': *p++ = '\"'; break; 3466 case 'b': *p++ = '\b'; break; 3467 case 'f': *p++ = '\014'; break; /* FF */ 3468 case 't': *p++ = '\t'; break; 3469 case 'n': *p++ = '\n'; break; 3470 case 'r': *p++ = '\r'; break; 3471 case 'v': *p++ = '\013'; break; /* VT */ 3472 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3473 3474 /* \OOO (octal) escapes */ 3475 case '0': case '1': case '2': case '3': 3476 case '4': case '5': case '6': case '7': 3477 x = s[-1] - '0'; 3478 if (s < end && '0' <= *s && *s <= '7') { 3479 x = (x<<3) + *s++ - '0'; 3480 if (s < end && '0' <= *s && *s <= '7') 3481 x = (x<<3) + *s++ - '0'; 3482 } 3483 *p++ = x; 3484 break; 3485 3486 /* hex escapes */ 3487 /* \xXX */ 3488 case 'x': 3489 digits = 2; 3490 message = "truncated \\xXX escape"; 3491 goto hexescape; 3492 3493 /* \uXXXX */ 3494 case 'u': 3495 digits = 4; 3496 message = "truncated \\uXXXX escape"; 3497 goto hexescape; 3498 3499 /* \UXXXXXXXX */ 3500 case 'U': 3501 digits = 8; 3502 message = "truncated \\UXXXXXXXX escape"; 3503 hexescape: 3504 chr = 0; 3505 outpos = p-PyUnicode_AS_UNICODE(v); 3506 if (s+digits>end) { 3507 endinpos = size; 3508 if (unicode_decode_call_errorhandler( 3509 errors, &errorHandler, 3510 "unicodeescape", "end of string in escape sequence", 3511 &starts, &end, &startinpos, &endinpos, &exc, &s, 3512 &v, &outpos, &p)) 3513 goto onError; 3514 goto nextByte; 3515 } 3516 for (i = 0; i < digits; ++i) { 3517 c = (unsigned char) s[i]; 3518 if (!ISXDIGIT(c)) { 3519 endinpos = (s+i+1)-starts; 3520 if (unicode_decode_call_errorhandler( 3521 errors, &errorHandler, 3522 "unicodeescape", message, 3523 &starts, &end, &startinpos, &endinpos, &exc, &s, 3524 &v, &outpos, &p)) 3525 goto onError; 3526 goto nextByte; 3527 } 3528 chr = (chr<<4) & ~0xF; 3529 if (c >= '0' && c <= '9') 3530 chr += c - '0'; 3531 else if (c >= 'a' && c <= 'f') 3532 chr += 10 + c - 'a'; 3533 else 3534 chr += 10 + c - 'A'; 3535 } 3536 s += i; 3537 if (chr == 0xffffffff && PyErr_Occurred()) 3538 /* _decoding_error will have already written into the 3539 target buffer. */ 3540 break; 3541 store: 3542 /* when we get here, chr is a 32-bit unicode character */ 3543 if (chr <= 0xffff) 3544 /* UCS-2 character */ 3545 *p++ = (Py_UNICODE) chr; 3546 else if (chr <= 0x10ffff) { 3547 /* UCS-4 character. Either store directly, or as 3548 surrogate pair. */ 3549#ifdef Py_UNICODE_WIDE 3550 *p++ = chr; 3551#else 3552 chr -= 0x10000L; 3553 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3554 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3555#endif 3556 } else { 3557 endinpos = s-starts; 3558 outpos = p-PyUnicode_AS_UNICODE(v); 3559 if (unicode_decode_call_errorhandler( 3560 errors, &errorHandler, 3561 "unicodeescape", "illegal Unicode character", 3562 &starts, &end, &startinpos, &endinpos, &exc, &s, 3563 &v, &outpos, &p)) 3564 goto onError; 3565 } 3566 break; 3567 3568 /* \N{name} */ 3569 case 'N': 3570 message = "malformed \\N character escape"; 3571 if (ucnhash_CAPI == NULL) { 3572 /* load the unicode data module */ 3573 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3574 if (ucnhash_CAPI == NULL) 3575 goto ucnhashError; 3576 } 3577 if (*s == '{') { 3578 const char *start = s+1; 3579 /* look for the closing brace */ 3580 while (*s != '}' && s < end) 3581 s++; 3582 if (s > start && s < end && *s == '}') { 3583 /* found a name. look it up in the unicode database */ 3584 message = "unknown Unicode character name"; 3585 s++; 3586 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3587 goto store; 3588 } 3589 } 3590 endinpos = s-starts; 3591 outpos = p-PyUnicode_AS_UNICODE(v); 3592 if (unicode_decode_call_errorhandler( 3593 errors, &errorHandler, 3594 "unicodeescape", message, 3595 &starts, &end, &startinpos, &endinpos, &exc, &s, 3596 &v, &outpos, &p)) 3597 goto onError; 3598 break; 3599 3600 default: 3601 if (s > end) { 3602 message = "\\ at end of string"; 3603 s--; 3604 endinpos = s-starts; 3605 outpos = p-PyUnicode_AS_UNICODE(v); 3606 if (unicode_decode_call_errorhandler( 3607 errors, &errorHandler, 3608 "unicodeescape", message, 3609 &starts, &end, &startinpos, &endinpos, &exc, &s, 3610 &v, &outpos, &p)) 3611 goto onError; 3612 } 3613 else { 3614 *p++ = '\\'; 3615 *p++ = (unsigned char)s[-1]; 3616 } 3617 break; 3618 } 3619 nextByte: 3620 ; 3621 } 3622 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3623 goto onError; 3624 Py_XDECREF(errorHandler); 3625 Py_XDECREF(exc); 3626 return (PyObject *)v; 3627 3628 ucnhashError: 3629 PyErr_SetString( 3630 PyExc_UnicodeError, 3631 "\\N escapes not supported (can't load unicodedata module)" 3632 ); 3633 Py_XDECREF(v); 3634 Py_XDECREF(errorHandler); 3635 Py_XDECREF(exc); 3636 return NULL; 3637 3638 onError: 3639 Py_XDECREF(v); 3640 Py_XDECREF(errorHandler); 3641 Py_XDECREF(exc); 3642 return NULL; 3643} 3644 3645/* Return a Unicode-Escape string version of the Unicode object. 3646 3647 If quotes is true, the string is enclosed in u"" or u'' quotes as 3648 appropriate. 3649 3650*/ 3651 3652Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3653 Py_ssize_t size, 3654 Py_UNICODE ch) 3655{ 3656 /* like wcschr, but doesn't stop at NULL characters */ 3657 3658 while (size-- > 0) { 3659 if (*s == ch) 3660 return s; 3661 s++; 3662 } 3663 3664 return NULL; 3665} 3666 3667static const char *hexdigits = "0123456789abcdef"; 3668 3669PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3670 Py_ssize_t size) 3671{ 3672 PyObject *repr; 3673 char *p; 3674 3675#ifdef Py_UNICODE_WIDE 3676 const Py_ssize_t expandsize = 10; 3677#else 3678 const Py_ssize_t expandsize = 6; 3679#endif 3680 3681 /* XXX(nnorwitz): rather than over-allocating, it would be 3682 better to choose a different scheme. Perhaps scan the 3683 first N-chars of the string and allocate based on that size. 3684 */ 3685 /* Initial allocation is based on the longest-possible unichr 3686 escape. 3687 3688 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3689 unichr, so in this case it's the longest unichr escape. In 3690 narrow (UTF-16) builds this is five chars per source unichr 3691 since there are two unichrs in the surrogate pair, so in narrow 3692 (UTF-16) builds it's not the longest unichr escape. 3693 3694 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3695 so in the narrow (UTF-16) build case it's the longest unichr 3696 escape. 3697 */ 3698 3699 if (size == 0) 3700 return PyBytes_FromStringAndSize(NULL, 0); 3701 3702 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3703 return PyErr_NoMemory(); 3704 3705 repr = PyBytes_FromStringAndSize(NULL, 3706 2 3707 + expandsize*size 3708 + 1); 3709 if (repr == NULL) 3710 return NULL; 3711 3712 p = PyBytes_AS_STRING(repr); 3713 3714 while (size-- > 0) { 3715 Py_UNICODE ch = *s++; 3716 3717 /* Escape backslashes */ 3718 if (ch == '\\') { 3719 *p++ = '\\'; 3720 *p++ = (char) ch; 3721 continue; 3722 } 3723 3724#ifdef Py_UNICODE_WIDE 3725 /* Map 21-bit characters to '\U00xxxxxx' */ 3726 else if (ch >= 0x10000) { 3727 *p++ = '\\'; 3728 *p++ = 'U'; 3729 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3730 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3731 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3732 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3733 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3734 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3735 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3736 *p++ = hexdigits[ch & 0x0000000F]; 3737 continue; 3738 } 3739#else 3740 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3741 else if (ch >= 0xD800 && ch < 0xDC00) { 3742 Py_UNICODE ch2; 3743 Py_UCS4 ucs; 3744 3745 ch2 = *s++; 3746 size--; 3747 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3748 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3749 *p++ = '\\'; 3750 *p++ = 'U'; 3751 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3752 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3753 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3754 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3755 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3756 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3757 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3758 *p++ = hexdigits[ucs & 0x0000000F]; 3759 continue; 3760 } 3761 /* Fall through: isolated surrogates are copied as-is */ 3762 s--; 3763 size++; 3764 } 3765#endif 3766 3767 /* Map 16-bit characters to '\uxxxx' */ 3768 if (ch >= 256) { 3769 *p++ = '\\'; 3770 *p++ = 'u'; 3771 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3772 *p++ = hexdigits[(ch >> 8) & 0x000F]; 3773 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3774 *p++ = hexdigits[ch & 0x000F]; 3775 } 3776 3777 /* Map special whitespace to '\t', \n', '\r' */ 3778 else if (ch == '\t') { 3779 *p++ = '\\'; 3780 *p++ = 't'; 3781 } 3782 else if (ch == '\n') { 3783 *p++ = '\\'; 3784 *p++ = 'n'; 3785 } 3786 else if (ch == '\r') { 3787 *p++ = '\\'; 3788 *p++ = 'r'; 3789 } 3790 3791 /* Map non-printable US ASCII to '\xhh' */ 3792 else if (ch < ' ' || ch >= 0x7F) { 3793 *p++ = '\\'; 3794 *p++ = 'x'; 3795 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3796 *p++ = hexdigits[ch & 0x000F]; 3797 } 3798 3799 /* Copy everything else as-is */ 3800 else 3801 *p++ = (char) ch; 3802 } 3803 3804 assert(p - PyBytes_AS_STRING(repr) > 0); 3805 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 3806 return NULL; 3807 return repr; 3808} 3809 3810PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3811{ 3812 PyObject *s; 3813 if (!PyUnicode_Check(unicode)) { 3814 PyErr_BadArgument(); 3815 return NULL; 3816 } 3817 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3818 PyUnicode_GET_SIZE(unicode)); 3819 return s; 3820} 3821 3822/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3823 3824PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3825 Py_ssize_t size, 3826 const char *errors) 3827{ 3828 const char *starts = s; 3829 Py_ssize_t startinpos; 3830 Py_ssize_t endinpos; 3831 Py_ssize_t outpos; 3832 PyUnicodeObject *v; 3833 Py_UNICODE *p; 3834 const char *end; 3835 const char *bs; 3836 PyObject *errorHandler = NULL; 3837 PyObject *exc = NULL; 3838 3839 /* Escaped strings will always be longer than the resulting 3840 Unicode string, so we start with size here and then reduce the 3841 length after conversion to the true value. (But decoding error 3842 handler might have to resize the string) */ 3843 v = _PyUnicode_New(size); 3844 if (v == NULL) 3845 goto onError; 3846 if (size == 0) 3847 return (PyObject *)v; 3848 p = PyUnicode_AS_UNICODE(v); 3849 end = s + size; 3850 while (s < end) { 3851 unsigned char c; 3852 Py_UCS4 x; 3853 int i; 3854 int count; 3855 3856 /* Non-escape characters are interpreted as Unicode ordinals */ 3857 if (*s != '\\') { 3858 *p++ = (unsigned char)*s++; 3859 continue; 3860 } 3861 startinpos = s-starts; 3862 3863 /* \u-escapes are only interpreted iff the number of leading 3864 backslashes if odd */ 3865 bs = s; 3866 for (;s < end;) { 3867 if (*s != '\\') 3868 break; 3869 *p++ = (unsigned char)*s++; 3870 } 3871 if (((s - bs) & 1) == 0 || 3872 s >= end || 3873 (*s != 'u' && *s != 'U')) { 3874 continue; 3875 } 3876 p--; 3877 count = *s=='u' ? 4 : 8; 3878 s++; 3879 3880 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3881 outpos = p-PyUnicode_AS_UNICODE(v); 3882 for (x = 0, i = 0; i < count; ++i, ++s) { 3883 c = (unsigned char)*s; 3884 if (!ISXDIGIT(c)) { 3885 endinpos = s-starts; 3886 if (unicode_decode_call_errorhandler( 3887 errors, &errorHandler, 3888 "rawunicodeescape", "truncated \\uXXXX", 3889 &starts, &end, &startinpos, &endinpos, &exc, &s, 3890 &v, &outpos, &p)) 3891 goto onError; 3892 goto nextByte; 3893 } 3894 x = (x<<4) & ~0xF; 3895 if (c >= '0' && c <= '9') 3896 x += c - '0'; 3897 else if (c >= 'a' && c <= 'f') 3898 x += 10 + c - 'a'; 3899 else 3900 x += 10 + c - 'A'; 3901 } 3902 if (x <= 0xffff) 3903 /* UCS-2 character */ 3904 *p++ = (Py_UNICODE) x; 3905 else if (x <= 0x10ffff) { 3906 /* UCS-4 character. Either store directly, or as 3907 surrogate pair. */ 3908#ifdef Py_UNICODE_WIDE 3909 *p++ = (Py_UNICODE) x; 3910#else 3911 x -= 0x10000L; 3912 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3913 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3914#endif 3915 } else { 3916 endinpos = s-starts; 3917 outpos = p-PyUnicode_AS_UNICODE(v); 3918 if (unicode_decode_call_errorhandler( 3919 errors, &errorHandler, 3920 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3921 &starts, &end, &startinpos, &endinpos, &exc, &s, 3922 &v, &outpos, &p)) 3923 goto onError; 3924 } 3925 nextByte: 3926 ; 3927 } 3928 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3929 goto onError; 3930 Py_XDECREF(errorHandler); 3931 Py_XDECREF(exc); 3932 return (PyObject *)v; 3933 3934 onError: 3935 Py_XDECREF(v); 3936 Py_XDECREF(errorHandler); 3937 Py_XDECREF(exc); 3938 return NULL; 3939} 3940 3941PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3942 Py_ssize_t size) 3943{ 3944 PyObject *repr; 3945 char *p; 3946 char *q; 3947 3948#ifdef Py_UNICODE_WIDE 3949 const Py_ssize_t expandsize = 10; 3950#else 3951 const Py_ssize_t expandsize = 6; 3952#endif 3953 3954 if (size > PY_SSIZE_T_MAX / expandsize) 3955 return PyErr_NoMemory(); 3956 3957 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 3958 if (repr == NULL) 3959 return NULL; 3960 if (size == 0) 3961 return repr; 3962 3963 p = q = PyBytes_AS_STRING(repr); 3964 while (size-- > 0) { 3965 Py_UNICODE ch = *s++; 3966#ifdef Py_UNICODE_WIDE 3967 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3968 if (ch >= 0x10000) { 3969 *p++ = '\\'; 3970 *p++ = 'U'; 3971 *p++ = hexdigits[(ch >> 28) & 0xf]; 3972 *p++ = hexdigits[(ch >> 24) & 0xf]; 3973 *p++ = hexdigits[(ch >> 20) & 0xf]; 3974 *p++ = hexdigits[(ch >> 16) & 0xf]; 3975 *p++ = hexdigits[(ch >> 12) & 0xf]; 3976 *p++ = hexdigits[(ch >> 8) & 0xf]; 3977 *p++ = hexdigits[(ch >> 4) & 0xf]; 3978 *p++ = hexdigits[ch & 15]; 3979 } 3980 else 3981#else 3982 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3983 if (ch >= 0xD800 && ch < 0xDC00) { 3984 Py_UNICODE ch2; 3985 Py_UCS4 ucs; 3986 3987 ch2 = *s++; 3988 size--; 3989 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3990 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3991 *p++ = '\\'; 3992 *p++ = 'U'; 3993 *p++ = hexdigits[(ucs >> 28) & 0xf]; 3994 *p++ = hexdigits[(ucs >> 24) & 0xf]; 3995 *p++ = hexdigits[(ucs >> 20) & 0xf]; 3996 *p++ = hexdigits[(ucs >> 16) & 0xf]; 3997 *p++ = hexdigits[(ucs >> 12) & 0xf]; 3998 *p++ = hexdigits[(ucs >> 8) & 0xf]; 3999 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4000 *p++ = hexdigits[ucs & 0xf]; 4001 continue; 4002 } 4003 /* Fall through: isolated surrogates are copied as-is */ 4004 s--; 4005 size++; 4006 } 4007#endif 4008 /* Map 16-bit characters to '\uxxxx' */ 4009 if (ch >= 256) { 4010 *p++ = '\\'; 4011 *p++ = 'u'; 4012 *p++ = hexdigits[(ch >> 12) & 0xf]; 4013 *p++ = hexdigits[(ch >> 8) & 0xf]; 4014 *p++ = hexdigits[(ch >> 4) & 0xf]; 4015 *p++ = hexdigits[ch & 15]; 4016 } 4017 /* Copy everything else as-is */ 4018 else 4019 *p++ = (char) ch; 4020 } 4021 size = p - q; 4022 4023 assert(size > 0); 4024 if (_PyBytes_Resize(&repr, size) < 0) 4025 return NULL; 4026 return repr; 4027} 4028 4029PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4030{ 4031 PyObject *s; 4032 if (!PyUnicode_Check(unicode)) { 4033 PyErr_BadArgument(); 4034 return NULL; 4035 } 4036 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4037 PyUnicode_GET_SIZE(unicode)); 4038 4039 return s; 4040} 4041 4042/* --- Unicode Internal Codec ------------------------------------------- */ 4043 4044PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 4045 Py_ssize_t size, 4046 const char *errors) 4047{ 4048 const char *starts = s; 4049 Py_ssize_t startinpos; 4050 Py_ssize_t endinpos; 4051 Py_ssize_t outpos; 4052 PyUnicodeObject *v; 4053 Py_UNICODE *p; 4054 const char *end; 4055 const char *reason; 4056 PyObject *errorHandler = NULL; 4057 PyObject *exc = NULL; 4058 4059#ifdef Py_UNICODE_WIDE 4060 Py_UNICODE unimax = PyUnicode_GetMax(); 4061#endif 4062 4063 /* XXX overflow detection missing */ 4064 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4065 if (v == NULL) 4066 goto onError; 4067 if (PyUnicode_GetSize((PyObject *)v) == 0) 4068 return (PyObject *)v; 4069 p = PyUnicode_AS_UNICODE(v); 4070 end = s + size; 4071 4072 while (s < end) { 4073 memcpy(p, s, sizeof(Py_UNICODE)); 4074 /* We have to sanity check the raw data, otherwise doom looms for 4075 some malformed UCS-4 data. */ 4076 if ( 4077#ifdef Py_UNICODE_WIDE 4078 *p > unimax || *p < 0 || 4079#endif 4080 end-s < Py_UNICODE_SIZE 4081 ) 4082 { 4083 startinpos = s - starts; 4084 if (end-s < Py_UNICODE_SIZE) { 4085 endinpos = end-starts; 4086 reason = "truncated input"; 4087 } 4088 else { 4089 endinpos = s - starts + Py_UNICODE_SIZE; 4090 reason = "illegal code point (> 0x10FFFF)"; 4091 } 4092 outpos = p - PyUnicode_AS_UNICODE(v); 4093 if (unicode_decode_call_errorhandler( 4094 errors, &errorHandler, 4095 "unicode_internal", reason, 4096 &starts, &end, &startinpos, &endinpos, &exc, &s, 4097 &v, &outpos, &p)) { 4098 goto onError; 4099 } 4100 } 4101 else { 4102 p++; 4103 s += Py_UNICODE_SIZE; 4104 } 4105 } 4106 4107 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4108 goto onError; 4109 Py_XDECREF(errorHandler); 4110 Py_XDECREF(exc); 4111 return (PyObject *)v; 4112 4113 onError: 4114 Py_XDECREF(v); 4115 Py_XDECREF(errorHandler); 4116 Py_XDECREF(exc); 4117 return NULL; 4118} 4119 4120/* --- Latin-1 Codec ------------------------------------------------------ */ 4121 4122PyObject *PyUnicode_DecodeLatin1(const char *s, 4123 Py_ssize_t size, 4124 const char *errors) 4125{ 4126 PyUnicodeObject *v; 4127 Py_UNICODE *p; 4128 const char *e, *unrolled_end; 4129 4130 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 4131 if (size == 1) { 4132 Py_UNICODE r = *(unsigned char*)s; 4133 return PyUnicode_FromUnicode(&r, 1); 4134 } 4135 4136 v = _PyUnicode_New(size); 4137 if (v == NULL) 4138 goto onError; 4139 if (size == 0) 4140 return (PyObject *)v; 4141 p = PyUnicode_AS_UNICODE(v); 4142 e = s + size; 4143 /* Unrolling the copy makes it much faster by reducing the looping 4144 overhead. This is similar to what many memcpy() implementations do. */ 4145 unrolled_end = e - 4; 4146 while (s < unrolled_end) { 4147 p[0] = (unsigned char) s[0]; 4148 p[1] = (unsigned char) s[1]; 4149 p[2] = (unsigned char) s[2]; 4150 p[3] = (unsigned char) s[3]; 4151 s += 4; 4152 p += 4; 4153 } 4154 while (s < e) 4155 *p++ = (unsigned char) *s++; 4156 return (PyObject *)v; 4157 4158 onError: 4159 Py_XDECREF(v); 4160 return NULL; 4161} 4162 4163/* create or adjust a UnicodeEncodeError */ 4164static void make_encode_exception(PyObject **exceptionObject, 4165 const char *encoding, 4166 const Py_UNICODE *unicode, Py_ssize_t size, 4167 Py_ssize_t startpos, Py_ssize_t endpos, 4168 const char *reason) 4169{ 4170 if (*exceptionObject == NULL) { 4171 *exceptionObject = PyUnicodeEncodeError_Create( 4172 encoding, unicode, size, startpos, endpos, reason); 4173 } 4174 else { 4175 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4176 goto onError; 4177 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4178 goto onError; 4179 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4180 goto onError; 4181 return; 4182 onError: 4183 Py_DECREF(*exceptionObject); 4184 *exceptionObject = NULL; 4185 } 4186} 4187 4188/* raises a UnicodeEncodeError */ 4189static void raise_encode_exception(PyObject **exceptionObject, 4190 const char *encoding, 4191 const Py_UNICODE *unicode, Py_ssize_t size, 4192 Py_ssize_t startpos, Py_ssize_t endpos, 4193 const char *reason) 4194{ 4195 make_encode_exception(exceptionObject, 4196 encoding, unicode, size, startpos, endpos, reason); 4197 if (*exceptionObject != NULL) 4198 PyCodec_StrictErrors(*exceptionObject); 4199} 4200 4201/* error handling callback helper: 4202 build arguments, call the callback and check the arguments, 4203 put the result into newpos and return the replacement string, which 4204 has to be freed by the caller */ 4205static PyObject *unicode_encode_call_errorhandler(const char *errors, 4206 PyObject **errorHandler, 4207 const char *encoding, const char *reason, 4208 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4209 Py_ssize_t startpos, Py_ssize_t endpos, 4210 Py_ssize_t *newpos) 4211{ 4212 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4213 4214 PyObject *restuple; 4215 PyObject *resunicode; 4216 4217 if (*errorHandler == NULL) { 4218 *errorHandler = PyCodec_LookupError(errors); 4219 if (*errorHandler == NULL) 4220 return NULL; 4221 } 4222 4223 make_encode_exception(exceptionObject, 4224 encoding, unicode, size, startpos, endpos, reason); 4225 if (*exceptionObject == NULL) 4226 return NULL; 4227 4228 restuple = PyObject_CallFunctionObjArgs( 4229 *errorHandler, *exceptionObject, NULL); 4230 if (restuple == NULL) 4231 return NULL; 4232 if (!PyTuple_Check(restuple)) { 4233 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4234 Py_DECREF(restuple); 4235 return NULL; 4236 } 4237 if (!PyArg_ParseTuple(restuple, argparse, 4238 &resunicode, newpos)) { 4239 Py_DECREF(restuple); 4240 return NULL; 4241 } 4242 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4243 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4244 Py_DECREF(restuple); 4245 return NULL; 4246 } 4247 if (*newpos<0) 4248 *newpos = size+*newpos; 4249 if (*newpos<0 || *newpos>size) { 4250 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4251 Py_DECREF(restuple); 4252 return NULL; 4253 } 4254 Py_INCREF(resunicode); 4255 Py_DECREF(restuple); 4256 return resunicode; 4257} 4258 4259static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 4260 Py_ssize_t size, 4261 const char *errors, 4262 int limit) 4263{ 4264 /* output object */ 4265 PyObject *res; 4266 /* pointers to the beginning and end+1 of input */ 4267 const Py_UNICODE *startp = p; 4268 const Py_UNICODE *endp = p + size; 4269 /* pointer to the beginning of the unencodable characters */ 4270 /* const Py_UNICODE *badp = NULL; */ 4271 /* pointer into the output */ 4272 char *str; 4273 /* current output position */ 4274 Py_ssize_t ressize; 4275 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4276 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4277 PyObject *errorHandler = NULL; 4278 PyObject *exc = NULL; 4279 /* the following variable is used for caching string comparisons 4280 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4281 int known_errorHandler = -1; 4282 4283 /* allocate enough for a simple encoding without 4284 replacements, if we need more, we'll resize */ 4285 if (size == 0) 4286 return PyBytes_FromStringAndSize(NULL, 0); 4287 res = PyBytes_FromStringAndSize(NULL, size); 4288 if (res == NULL) 4289 return NULL; 4290 str = PyBytes_AS_STRING(res); 4291 ressize = size; 4292 4293 while (p<endp) { 4294 Py_UNICODE c = *p; 4295 4296 /* can we encode this? */ 4297 if (c<limit) { 4298 /* no overflow check, because we know that the space is enough */ 4299 *str++ = (char)c; 4300 ++p; 4301 } 4302 else { 4303 Py_ssize_t unicodepos = p-startp; 4304 Py_ssize_t requiredsize; 4305 PyObject *repunicode; 4306 Py_ssize_t repsize; 4307 Py_ssize_t newpos; 4308 Py_ssize_t respos; 4309 Py_UNICODE *uni2; 4310 /* startpos for collecting unencodable chars */ 4311 const Py_UNICODE *collstart = p; 4312 const Py_UNICODE *collend = p; 4313 /* find all unecodable characters */ 4314 while ((collend < endp) && ((*collend)>=limit)) 4315 ++collend; 4316 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 4317 if (known_errorHandler==-1) { 4318 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4319 known_errorHandler = 1; 4320 else if (!strcmp(errors, "replace")) 4321 known_errorHandler = 2; 4322 else if (!strcmp(errors, "ignore")) 4323 known_errorHandler = 3; 4324 else if (!strcmp(errors, "xmlcharrefreplace")) 4325 known_errorHandler = 4; 4326 else 4327 known_errorHandler = 0; 4328 } 4329 switch (known_errorHandler) { 4330 case 1: /* strict */ 4331 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4332 goto onError; 4333 case 2: /* replace */ 4334 while (collstart++<collend) 4335 *str++ = '?'; /* fall through */ 4336 case 3: /* ignore */ 4337 p = collend; 4338 break; 4339 case 4: /* xmlcharrefreplace */ 4340 respos = str - PyBytes_AS_STRING(res); 4341 /* determine replacement size (temporarily (mis)uses p) */ 4342 for (p = collstart, repsize = 0; p < collend; ++p) { 4343 if (*p<10) 4344 repsize += 2+1+1; 4345 else if (*p<100) 4346 repsize += 2+2+1; 4347 else if (*p<1000) 4348 repsize += 2+3+1; 4349 else if (*p<10000) 4350 repsize += 2+4+1; 4351#ifndef Py_UNICODE_WIDE 4352 else 4353 repsize += 2+5+1; 4354#else 4355 else if (*p<100000) 4356 repsize += 2+5+1; 4357 else if (*p<1000000) 4358 repsize += 2+6+1; 4359 else 4360 repsize += 2+7+1; 4361#endif 4362 } 4363 requiredsize = respos+repsize+(endp-collend); 4364 if (requiredsize > ressize) { 4365 if (requiredsize<2*ressize) 4366 requiredsize = 2*ressize; 4367 if (_PyBytes_Resize(&res, requiredsize)) 4368 goto onError; 4369 str = PyBytes_AS_STRING(res) + respos; 4370 ressize = requiredsize; 4371 } 4372 /* generate replacement (temporarily (mis)uses p) */ 4373 for (p = collstart; p < collend; ++p) { 4374 str += sprintf(str, "&#%d;", (int)*p); 4375 } 4376 p = collend; 4377 break; 4378 default: 4379 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4380 encoding, reason, startp, size, &exc, 4381 collstart-startp, collend-startp, &newpos); 4382 if (repunicode == NULL) 4383 goto onError; 4384 if (PyBytes_Check(repunicode)) { 4385 /* Directly copy bytes result to output. */ 4386 repsize = PyBytes_Size(repunicode); 4387 if (repsize > 1) { 4388 /* Make room for all additional bytes. */ 4389 respos = str - PyBytes_AS_STRING(res); 4390 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 4391 Py_DECREF(repunicode); 4392 goto onError; 4393 } 4394 str = PyBytes_AS_STRING(res) + respos; 4395 ressize += repsize-1; 4396 } 4397 memcpy(str, PyBytes_AsString(repunicode), repsize); 4398 str += repsize; 4399 p = startp + newpos; 4400 Py_DECREF(repunicode); 4401 break; 4402 } 4403 /* need more space? (at least enough for what we 4404 have+the replacement+the rest of the string, so 4405 we won't have to check space for encodable characters) */ 4406 respos = str - PyBytes_AS_STRING(res); 4407 repsize = PyUnicode_GET_SIZE(repunicode); 4408 requiredsize = respos+repsize+(endp-collend); 4409 if (requiredsize > ressize) { 4410 if (requiredsize<2*ressize) 4411 requiredsize = 2*ressize; 4412 if (_PyBytes_Resize(&res, requiredsize)) { 4413 Py_DECREF(repunicode); 4414 goto onError; 4415 } 4416 str = PyBytes_AS_STRING(res) + respos; 4417 ressize = requiredsize; 4418 } 4419 /* check if there is anything unencodable in the replacement 4420 and copy it to the output */ 4421 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4422 c = *uni2; 4423 if (c >= limit) { 4424 raise_encode_exception(&exc, encoding, startp, size, 4425 unicodepos, unicodepos+1, reason); 4426 Py_DECREF(repunicode); 4427 goto onError; 4428 } 4429 *str = (char)c; 4430 } 4431 p = startp + newpos; 4432 Py_DECREF(repunicode); 4433 } 4434 } 4435 } 4436 /* Resize if we allocated to much */ 4437 size = str - PyBytes_AS_STRING(res); 4438 if (size < ressize) { /* If this falls res will be NULL */ 4439 assert(size >= 0); 4440 if (_PyBytes_Resize(&res, size) < 0) 4441 goto onError; 4442 } 4443 4444 Py_XDECREF(errorHandler); 4445 Py_XDECREF(exc); 4446 return res; 4447 4448 onError: 4449 Py_XDECREF(res); 4450 Py_XDECREF(errorHandler); 4451 Py_XDECREF(exc); 4452 return NULL; 4453} 4454 4455PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4456 Py_ssize_t size, 4457 const char *errors) 4458{ 4459 return unicode_encode_ucs1(p, size, errors, 256); 4460} 4461 4462PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 4463{ 4464 if (!PyUnicode_Check(unicode)) { 4465 PyErr_BadArgument(); 4466 return NULL; 4467 } 4468 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4469 PyUnicode_GET_SIZE(unicode), 4470 NULL); 4471} 4472 4473/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4474 4475PyObject *PyUnicode_DecodeASCII(const char *s, 4476 Py_ssize_t size, 4477 const char *errors) 4478{ 4479 const char *starts = s; 4480 PyUnicodeObject *v; 4481 Py_UNICODE *p; 4482 Py_ssize_t startinpos; 4483 Py_ssize_t endinpos; 4484 Py_ssize_t outpos; 4485 const char *e; 4486 PyObject *errorHandler = NULL; 4487 PyObject *exc = NULL; 4488 4489 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 4490 if (size == 1 && *(unsigned char*)s < 128) { 4491 Py_UNICODE r = *(unsigned char*)s; 4492 return PyUnicode_FromUnicode(&r, 1); 4493 } 4494 4495 v = _PyUnicode_New(size); 4496 if (v == NULL) 4497 goto onError; 4498 if (size == 0) 4499 return (PyObject *)v; 4500 p = PyUnicode_AS_UNICODE(v); 4501 e = s + size; 4502 while (s < e) { 4503 register unsigned char c = (unsigned char)*s; 4504 if (c < 128) { 4505 *p++ = c; 4506 ++s; 4507 } 4508 else { 4509 startinpos = s-starts; 4510 endinpos = startinpos + 1; 4511 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4512 if (unicode_decode_call_errorhandler( 4513 errors, &errorHandler, 4514 "ascii", "ordinal not in range(128)", 4515 &starts, &e, &startinpos, &endinpos, &exc, &s, 4516 &v, &outpos, &p)) 4517 goto onError; 4518 } 4519 } 4520 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4521 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4522 goto onError; 4523 Py_XDECREF(errorHandler); 4524 Py_XDECREF(exc); 4525 return (PyObject *)v; 4526 4527 onError: 4528 Py_XDECREF(v); 4529 Py_XDECREF(errorHandler); 4530 Py_XDECREF(exc); 4531 return NULL; 4532} 4533 4534PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 4535 Py_ssize_t size, 4536 const char *errors) 4537{ 4538 return unicode_encode_ucs1(p, size, errors, 128); 4539} 4540 4541PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 4542{ 4543 if (!PyUnicode_Check(unicode)) { 4544 PyErr_BadArgument(); 4545 return NULL; 4546 } 4547 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4548 PyUnicode_GET_SIZE(unicode), 4549 NULL); 4550} 4551 4552#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4553 4554/* --- MBCS codecs for Windows -------------------------------------------- */ 4555 4556#if SIZEOF_INT < SIZEOF_SIZE_T 4557#define NEED_RETRY 4558#endif 4559 4560/* XXX This code is limited to "true" double-byte encodings, as 4561 a) it assumes an incomplete character consists of a single byte, and 4562 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 4563 encodings, see IsDBCSLeadByteEx documentation. */ 4564 4565static int is_dbcs_lead_byte(const char *s, int offset) 4566{ 4567 const char *curr = s + offset; 4568 4569 if (IsDBCSLeadByte(*curr)) { 4570 const char *prev = CharPrev(s, curr); 4571 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4572 } 4573 return 0; 4574} 4575 4576/* 4577 * Decode MBCS string into unicode object. If 'final' is set, converts 4578 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4579 */ 4580static int decode_mbcs(PyUnicodeObject **v, 4581 const char *s, /* MBCS string */ 4582 int size, /* sizeof MBCS string */ 4583 int final, 4584 const char *errors) 4585{ 4586 Py_UNICODE *p; 4587 Py_ssize_t n; 4588 DWORD usize; 4589 DWORD flags; 4590 4591 assert(size >= 0); 4592 4593 /* check and handle 'errors' arg */ 4594 if (errors==NULL || strcmp(errors, "strict")==0) 4595 flags = MB_ERR_INVALID_CHARS; 4596 else if (strcmp(errors, "ignore")==0) 4597 flags = 0; 4598 else { 4599 PyErr_Format(PyExc_ValueError, 4600 "mbcs encoding does not support errors='%s'", 4601 errors); 4602 return -1; 4603 } 4604 4605 /* Skip trailing lead-byte unless 'final' is set */ 4606 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4607 --size; 4608 4609 /* First get the size of the result */ 4610 if (size > 0) { 4611 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 4612 if (usize==0) 4613 goto mbcs_decode_error; 4614 } else 4615 usize = 0; 4616 4617 if (*v == NULL) { 4618 /* Create unicode object */ 4619 *v = _PyUnicode_New(usize); 4620 if (*v == NULL) 4621 return -1; 4622 n = 0; 4623 } 4624 else { 4625 /* Extend unicode object */ 4626 n = PyUnicode_GET_SIZE(*v); 4627 if (_PyUnicode_Resize(v, n + usize) < 0) 4628 return -1; 4629 } 4630 4631 /* Do the conversion */ 4632 if (usize > 0) { 4633 p = PyUnicode_AS_UNICODE(*v) + n; 4634 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 4635 goto mbcs_decode_error; 4636 } 4637 } 4638 return size; 4639 4640mbcs_decode_error: 4641 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 4642 we raise a UnicodeDecodeError - else it is a 'generic' 4643 windows error 4644 */ 4645 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 4646 /* Ideally, we should get reason from FormatMessage - this 4647 is the Windows 2000 English version of the message 4648 */ 4649 PyObject *exc = NULL; 4650 const char *reason = "No mapping for the Unicode character exists " 4651 "in the target multi-byte code page."; 4652 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 4653 if (exc != NULL) { 4654 PyCodec_StrictErrors(exc); 4655 Py_DECREF(exc); 4656 } 4657 } else { 4658 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4659 } 4660 return -1; 4661} 4662 4663PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 4664 Py_ssize_t size, 4665 const char *errors, 4666 Py_ssize_t *consumed) 4667{ 4668 PyUnicodeObject *v = NULL; 4669 int done; 4670 4671 if (consumed) 4672 *consumed = 0; 4673 4674#ifdef NEED_RETRY 4675 retry: 4676 if (size > INT_MAX) 4677 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 4678 else 4679#endif 4680 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 4681 4682 if (done < 0) { 4683 Py_XDECREF(v); 4684 return NULL; 4685 } 4686 4687 if (consumed) 4688 *consumed += done; 4689 4690#ifdef NEED_RETRY 4691 if (size > INT_MAX) { 4692 s += done; 4693 size -= done; 4694 goto retry; 4695 } 4696#endif 4697 4698 return (PyObject *)v; 4699} 4700 4701PyObject *PyUnicode_DecodeMBCS(const char *s, 4702 Py_ssize_t size, 4703 const char *errors) 4704{ 4705 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4706} 4707 4708/* 4709 * Convert unicode into string object (MBCS). 4710 * Returns 0 if succeed, -1 otherwise. 4711 */ 4712static int encode_mbcs(PyObject **repr, 4713 const Py_UNICODE *p, /* unicode */ 4714 int size, /* size of unicode */ 4715 const char* errors) 4716{ 4717 BOOL usedDefaultChar = FALSE; 4718 BOOL *pusedDefaultChar; 4719 int mbcssize; 4720 Py_ssize_t n; 4721 PyObject *exc = NULL; 4722 DWORD flags; 4723 4724 assert(size >= 0); 4725 4726 /* check and handle 'errors' arg */ 4727 if (errors==NULL || strcmp(errors, "strict")==0) { 4728 flags = WC_NO_BEST_FIT_CHARS; 4729 pusedDefaultChar = &usedDefaultChar; 4730 } else if (strcmp(errors, "replace")==0) { 4731 flags = 0; 4732 pusedDefaultChar = NULL; 4733 } else { 4734 PyErr_Format(PyExc_ValueError, 4735 "mbcs encoding does not support errors='%s'", 4736 errors); 4737 return -1; 4738 } 4739 4740 /* First get the size of the result */ 4741 if (size > 0) { 4742 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 4743 NULL, pusedDefaultChar); 4744 if (mbcssize == 0) { 4745 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4746 return -1; 4747 } 4748 /* If we used a default char, then we failed! */ 4749 if (pusedDefaultChar && *pusedDefaultChar) 4750 goto mbcs_encode_error; 4751 } else { 4752 mbcssize = 0; 4753 } 4754 4755 if (*repr == NULL) { 4756 /* Create string object */ 4757 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 4758 if (*repr == NULL) 4759 return -1; 4760 n = 0; 4761 } 4762 else { 4763 /* Extend string object */ 4764 n = PyBytes_Size(*repr); 4765 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 4766 return -1; 4767 } 4768 4769 /* Do the conversion */ 4770 if (size > 0) { 4771 char *s = PyBytes_AS_STRING(*repr) + n; 4772 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 4773 NULL, pusedDefaultChar)) { 4774 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4775 return -1; 4776 } 4777 if (pusedDefaultChar && *pusedDefaultChar) 4778 goto mbcs_encode_error; 4779 } 4780 return 0; 4781 4782mbcs_encode_error: 4783 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 4784 Py_XDECREF(exc); 4785 return -1; 4786} 4787 4788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4789 Py_ssize_t size, 4790 const char *errors) 4791{ 4792 PyObject *repr = NULL; 4793 int ret; 4794 4795#ifdef NEED_RETRY 4796 retry: 4797 if (size > INT_MAX) 4798 ret = encode_mbcs(&repr, p, INT_MAX, errors); 4799 else 4800#endif 4801 ret = encode_mbcs(&repr, p, (int)size, errors); 4802 4803 if (ret < 0) { 4804 Py_XDECREF(repr); 4805 return NULL; 4806 } 4807 4808#ifdef NEED_RETRY 4809 if (size > INT_MAX) { 4810 p += INT_MAX; 4811 size -= INT_MAX; 4812 goto retry; 4813 } 4814#endif 4815 4816 return repr; 4817} 4818 4819PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4820{ 4821 if (!PyUnicode_Check(unicode)) { 4822 PyErr_BadArgument(); 4823 return NULL; 4824 } 4825 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4826 PyUnicode_GET_SIZE(unicode), 4827 NULL); 4828} 4829 4830#undef NEED_RETRY 4831 4832#endif /* MS_WINDOWS */ 4833 4834/* --- Character Mapping Codec -------------------------------------------- */ 4835 4836PyObject *PyUnicode_DecodeCharmap(const char *s, 4837 Py_ssize_t size, 4838 PyObject *mapping, 4839 const char *errors) 4840{ 4841 const char *starts = s; 4842 Py_ssize_t startinpos; 4843 Py_ssize_t endinpos; 4844 Py_ssize_t outpos; 4845 const char *e; 4846 PyUnicodeObject *v; 4847 Py_UNICODE *p; 4848 Py_ssize_t extrachars = 0; 4849 PyObject *errorHandler = NULL; 4850 PyObject *exc = NULL; 4851 Py_UNICODE *mapstring = NULL; 4852 Py_ssize_t maplen = 0; 4853 4854 /* Default to Latin-1 */ 4855 if (mapping == NULL) 4856 return PyUnicode_DecodeLatin1(s, size, errors); 4857 4858 v = _PyUnicode_New(size); 4859 if (v == NULL) 4860 goto onError; 4861 if (size == 0) 4862 return (PyObject *)v; 4863 p = PyUnicode_AS_UNICODE(v); 4864 e = s + size; 4865 if (PyUnicode_CheckExact(mapping)) { 4866 mapstring = PyUnicode_AS_UNICODE(mapping); 4867 maplen = PyUnicode_GET_SIZE(mapping); 4868 while (s < e) { 4869 unsigned char ch = *s; 4870 Py_UNICODE x = 0xfffe; /* illegal value */ 4871 4872 if (ch < maplen) 4873 x = mapstring[ch]; 4874 4875 if (x == 0xfffe) { 4876 /* undefined mapping */ 4877 outpos = p-PyUnicode_AS_UNICODE(v); 4878 startinpos = s-starts; 4879 endinpos = startinpos+1; 4880 if (unicode_decode_call_errorhandler( 4881 errors, &errorHandler, 4882 "charmap", "character maps to <undefined>", 4883 &starts, &e, &startinpos, &endinpos, &exc, &s, 4884 &v, &outpos, &p)) { 4885 goto onError; 4886 } 4887 continue; 4888 } 4889 *p++ = x; 4890 ++s; 4891 } 4892 } 4893 else { 4894 while (s < e) { 4895 unsigned char ch = *s; 4896 PyObject *w, *x; 4897 4898 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 4899 w = PyLong_FromLong((long)ch); 4900 if (w == NULL) 4901 goto onError; 4902 x = PyObject_GetItem(mapping, w); 4903 Py_DECREF(w); 4904 if (x == NULL) { 4905 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4906 /* No mapping found means: mapping is undefined. */ 4907 PyErr_Clear(); 4908 x = Py_None; 4909 Py_INCREF(x); 4910 } else 4911 goto onError; 4912 } 4913 4914 /* Apply mapping */ 4915 if (PyLong_Check(x)) { 4916 long value = PyLong_AS_LONG(x); 4917 if (value < 0 || value > 65535) { 4918 PyErr_SetString(PyExc_TypeError, 4919 "character mapping must be in range(65536)"); 4920 Py_DECREF(x); 4921 goto onError; 4922 } 4923 *p++ = (Py_UNICODE)value; 4924 } 4925 else if (x == Py_None) { 4926 /* undefined mapping */ 4927 outpos = p-PyUnicode_AS_UNICODE(v); 4928 startinpos = s-starts; 4929 endinpos = startinpos+1; 4930 if (unicode_decode_call_errorhandler( 4931 errors, &errorHandler, 4932 "charmap", "character maps to <undefined>", 4933 &starts, &e, &startinpos, &endinpos, &exc, &s, 4934 &v, &outpos, &p)) { 4935 Py_DECREF(x); 4936 goto onError; 4937 } 4938 Py_DECREF(x); 4939 continue; 4940 } 4941 else if (PyUnicode_Check(x)) { 4942 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4943 4944 if (targetsize == 1) 4945 /* 1-1 mapping */ 4946 *p++ = *PyUnicode_AS_UNICODE(x); 4947 4948 else if (targetsize > 1) { 4949 /* 1-n mapping */ 4950 if (targetsize > extrachars) { 4951 /* resize first */ 4952 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4953 Py_ssize_t needed = (targetsize - extrachars) + \ 4954 (targetsize << 2); 4955 extrachars += needed; 4956 /* XXX overflow detection missing */ 4957 if (_PyUnicode_Resize(&v, 4958 PyUnicode_GET_SIZE(v) + needed) < 0) { 4959 Py_DECREF(x); 4960 goto onError; 4961 } 4962 p = PyUnicode_AS_UNICODE(v) + oldpos; 4963 } 4964 Py_UNICODE_COPY(p, 4965 PyUnicode_AS_UNICODE(x), 4966 targetsize); 4967 p += targetsize; 4968 extrachars -= targetsize; 4969 } 4970 /* 1-0 mapping: skip the character */ 4971 } 4972 else { 4973 /* wrong return value */ 4974 PyErr_SetString(PyExc_TypeError, 4975 "character mapping must return integer, None or str"); 4976 Py_DECREF(x); 4977 goto onError; 4978 } 4979 Py_DECREF(x); 4980 ++s; 4981 } 4982 } 4983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4985 goto onError; 4986 Py_XDECREF(errorHandler); 4987 Py_XDECREF(exc); 4988 return (PyObject *)v; 4989 4990 onError: 4991 Py_XDECREF(errorHandler); 4992 Py_XDECREF(exc); 4993 Py_XDECREF(v); 4994 return NULL; 4995} 4996 4997/* Charmap encoding: the lookup table */ 4998 4999struct encoding_map{ 5000 PyObject_HEAD 5001 unsigned char level1[32]; 5002 int count2, count3; 5003 unsigned char level23[1]; 5004}; 5005 5006static PyObject* 5007encoding_map_size(PyObject *obj, PyObject* args) 5008{ 5009 struct encoding_map *map = (struct encoding_map*)obj; 5010 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5011 128*map->count3); 5012} 5013 5014static PyMethodDef encoding_map_methods[] = { 5015 {"size", encoding_map_size, METH_NOARGS, 5016 PyDoc_STR("Return the size (in bytes) of this object") }, 5017 { 0 } 5018}; 5019 5020static void 5021encoding_map_dealloc(PyObject* o) 5022{ 5023 PyObject_FREE(o); 5024} 5025 5026static PyTypeObject EncodingMapType = { 5027 PyVarObject_HEAD_INIT(NULL, 0) 5028 "EncodingMap", /*tp_name*/ 5029 sizeof(struct encoding_map), /*tp_basicsize*/ 5030 0, /*tp_itemsize*/ 5031 /* methods */ 5032 encoding_map_dealloc, /*tp_dealloc*/ 5033 0, /*tp_print*/ 5034 0, /*tp_getattr*/ 5035 0, /*tp_setattr*/ 5036 0, /*tp_reserved*/ 5037 0, /*tp_repr*/ 5038 0, /*tp_as_number*/ 5039 0, /*tp_as_sequence*/ 5040 0, /*tp_as_mapping*/ 5041 0, /*tp_hash*/ 5042 0, /*tp_call*/ 5043 0, /*tp_str*/ 5044 0, /*tp_getattro*/ 5045 0, /*tp_setattro*/ 5046 0, /*tp_as_buffer*/ 5047 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5048 0, /*tp_doc*/ 5049 0, /*tp_traverse*/ 5050 0, /*tp_clear*/ 5051 0, /*tp_richcompare*/ 5052 0, /*tp_weaklistoffset*/ 5053 0, /*tp_iter*/ 5054 0, /*tp_iternext*/ 5055 encoding_map_methods, /*tp_methods*/ 5056 0, /*tp_members*/ 5057 0, /*tp_getset*/ 5058 0, /*tp_base*/ 5059 0, /*tp_dict*/ 5060 0, /*tp_descr_get*/ 5061 0, /*tp_descr_set*/ 5062 0, /*tp_dictoffset*/ 5063 0, /*tp_init*/ 5064 0, /*tp_alloc*/ 5065 0, /*tp_new*/ 5066 0, /*tp_free*/ 5067 0, /*tp_is_gc*/ 5068}; 5069 5070PyObject* 5071PyUnicode_BuildEncodingMap(PyObject* string) 5072{ 5073 Py_UNICODE *decode; 5074 PyObject *result; 5075 struct encoding_map *mresult; 5076 int i; 5077 int need_dict = 0; 5078 unsigned char level1[32]; 5079 unsigned char level2[512]; 5080 unsigned char *mlevel1, *mlevel2, *mlevel3; 5081 int count2 = 0, count3 = 0; 5082 5083 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5084 PyErr_BadArgument(); 5085 return NULL; 5086 } 5087 decode = PyUnicode_AS_UNICODE(string); 5088 memset(level1, 0xFF, sizeof level1); 5089 memset(level2, 0xFF, sizeof level2); 5090 5091 /* If there isn't a one-to-one mapping of NULL to \0, 5092 or if there are non-BMP characters, we need to use 5093 a mapping dictionary. */ 5094 if (decode[0] != 0) 5095 need_dict = 1; 5096 for (i = 1; i < 256; i++) { 5097 int l1, l2; 5098 if (decode[i] == 0 5099#ifdef Py_UNICODE_WIDE 5100 || decode[i] > 0xFFFF 5101#endif 5102 ) { 5103 need_dict = 1; 5104 break; 5105 } 5106 if (decode[i] == 0xFFFE) 5107 /* unmapped character */ 5108 continue; 5109 l1 = decode[i] >> 11; 5110 l2 = decode[i] >> 7; 5111 if (level1[l1] == 0xFF) 5112 level1[l1] = count2++; 5113 if (level2[l2] == 0xFF) 5114 level2[l2] = count3++; 5115 } 5116 5117 if (count2 >= 0xFF || count3 >= 0xFF) 5118 need_dict = 1; 5119 5120 if (need_dict) { 5121 PyObject *result = PyDict_New(); 5122 PyObject *key, *value; 5123 if (!result) 5124 return NULL; 5125 for (i = 0; i < 256; i++) { 5126 key = value = NULL; 5127 key = PyLong_FromLong(decode[i]); 5128 value = PyLong_FromLong(i); 5129 if (!key || !value) 5130 goto failed1; 5131 if (PyDict_SetItem(result, key, value) == -1) 5132 goto failed1; 5133 Py_DECREF(key); 5134 Py_DECREF(value); 5135 } 5136 return result; 5137 failed1: 5138 Py_XDECREF(key); 5139 Py_XDECREF(value); 5140 Py_DECREF(result); 5141 return NULL; 5142 } 5143 5144 /* Create a three-level trie */ 5145 result = PyObject_MALLOC(sizeof(struct encoding_map) + 5146 16*count2 + 128*count3 - 1); 5147 if (!result) 5148 return PyErr_NoMemory(); 5149 PyObject_Init(result, &EncodingMapType); 5150 mresult = (struct encoding_map*)result; 5151 mresult->count2 = count2; 5152 mresult->count3 = count3; 5153 mlevel1 = mresult->level1; 5154 mlevel2 = mresult->level23; 5155 mlevel3 = mresult->level23 + 16*count2; 5156 memcpy(mlevel1, level1, 32); 5157 memset(mlevel2, 0xFF, 16*count2); 5158 memset(mlevel3, 0, 128*count3); 5159 count3 = 0; 5160 for (i = 1; i < 256; i++) { 5161 int o1, o2, o3, i2, i3; 5162 if (decode[i] == 0xFFFE) 5163 /* unmapped character */ 5164 continue; 5165 o1 = decode[i]>>11; 5166 o2 = (decode[i]>>7) & 0xF; 5167 i2 = 16*mlevel1[o1] + o2; 5168 if (mlevel2[i2] == 0xFF) 5169 mlevel2[i2] = count3++; 5170 o3 = decode[i] & 0x7F; 5171 i3 = 128*mlevel2[i2] + o3; 5172 mlevel3[i3] = i; 5173 } 5174 return result; 5175} 5176 5177static int 5178encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5179{ 5180 struct encoding_map *map = (struct encoding_map*)mapping; 5181 int l1 = c>>11; 5182 int l2 = (c>>7) & 0xF; 5183 int l3 = c & 0x7F; 5184 int i; 5185 5186#ifdef Py_UNICODE_WIDE 5187 if (c > 0xFFFF) { 5188 return -1; 5189 } 5190#endif 5191 if (c == 0) 5192 return 0; 5193 /* level 1*/ 5194 i = map->level1[l1]; 5195 if (i == 0xFF) { 5196 return -1; 5197 } 5198 /* level 2*/ 5199 i = map->level23[16*i+l2]; 5200 if (i == 0xFF) { 5201 return -1; 5202 } 5203 /* level 3 */ 5204 i = map->level23[16*map->count2 + 128*i + l3]; 5205 if (i == 0) { 5206 return -1; 5207 } 5208 return i; 5209} 5210 5211/* Lookup the character ch in the mapping. If the character 5212 can't be found, Py_None is returned (or NULL, if another 5213 error occurred). */ 5214static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5215{ 5216 PyObject *w = PyLong_FromLong((long)c); 5217 PyObject *x; 5218 5219 if (w == NULL) 5220 return NULL; 5221 x = PyObject_GetItem(mapping, w); 5222 Py_DECREF(w); 5223 if (x == NULL) { 5224 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5225 /* No mapping found means: mapping is undefined. */ 5226 PyErr_Clear(); 5227 x = Py_None; 5228 Py_INCREF(x); 5229 return x; 5230 } else 5231 return NULL; 5232 } 5233 else if (x == Py_None) 5234 return x; 5235 else if (PyLong_Check(x)) { 5236 long value = PyLong_AS_LONG(x); 5237 if (value < 0 || value > 255) { 5238 PyErr_SetString(PyExc_TypeError, 5239 "character mapping must be in range(256)"); 5240 Py_DECREF(x); 5241 return NULL; 5242 } 5243 return x; 5244 } 5245 else if (PyBytes_Check(x)) 5246 return x; 5247 else { 5248 /* wrong return value */ 5249 PyErr_Format(PyExc_TypeError, 5250 "character mapping must return integer, bytes or None, not %.400s", 5251 x->ob_type->tp_name); 5252 Py_DECREF(x); 5253 return NULL; 5254 } 5255} 5256 5257static int 5258charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5259{ 5260 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5261 /* exponentially overallocate to minimize reallocations */ 5262 if (requiredsize < 2*outsize) 5263 requiredsize = 2*outsize; 5264 if (_PyBytes_Resize(outobj, requiredsize)) 5265 return -1; 5266 return 0; 5267} 5268 5269typedef enum charmapencode_result { 5270 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5271}charmapencode_result; 5272/* lookup the character, put the result in the output string and adjust 5273 various state variables. Resize the output bytes object if not enough 5274 space is available. Return a new reference to the object that 5275 was put in the output buffer, or Py_None, if the mapping was undefined 5276 (in which case no character was written) or NULL, if a 5277 reallocation error occurred. The caller must decref the result */ 5278static 5279charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 5280 PyObject **outobj, Py_ssize_t *outpos) 5281{ 5282 PyObject *rep; 5283 char *outstart; 5284 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5285 5286 if (Py_TYPE(mapping) == &EncodingMapType) { 5287 int res = encoding_map_lookup(c, mapping); 5288 Py_ssize_t requiredsize = *outpos+1; 5289 if (res == -1) 5290 return enc_FAILED; 5291 if (outsize<requiredsize) 5292 if (charmapencode_resize(outobj, outpos, requiredsize)) 5293 return enc_EXCEPTION; 5294 outstart = PyBytes_AS_STRING(*outobj); 5295 outstart[(*outpos)++] = (char)res; 5296 return enc_SUCCESS; 5297 } 5298 5299 rep = charmapencode_lookup(c, mapping); 5300 if (rep==NULL) 5301 return enc_EXCEPTION; 5302 else if (rep==Py_None) { 5303 Py_DECREF(rep); 5304 return enc_FAILED; 5305 } else { 5306 if (PyLong_Check(rep)) { 5307 Py_ssize_t requiredsize = *outpos+1; 5308 if (outsize<requiredsize) 5309 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5310 Py_DECREF(rep); 5311 return enc_EXCEPTION; 5312 } 5313 outstart = PyBytes_AS_STRING(*outobj); 5314 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5315 } 5316 else { 5317 const char *repchars = PyBytes_AS_STRING(rep); 5318 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5319 Py_ssize_t requiredsize = *outpos+repsize; 5320 if (outsize<requiredsize) 5321 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5322 Py_DECREF(rep); 5323 return enc_EXCEPTION; 5324 } 5325 outstart = PyBytes_AS_STRING(*outobj); 5326 memcpy(outstart + *outpos, repchars, repsize); 5327 *outpos += repsize; 5328 } 5329 } 5330 Py_DECREF(rep); 5331 return enc_SUCCESS; 5332} 5333 5334/* handle an error in PyUnicode_EncodeCharmap 5335 Return 0 on success, -1 on error */ 5336static 5337int charmap_encoding_error( 5338 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5339 PyObject **exceptionObject, 5340 int *known_errorHandler, PyObject **errorHandler, const char *errors, 5341 PyObject **res, Py_ssize_t *respos) 5342{ 5343 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5344 Py_ssize_t repsize; 5345 Py_ssize_t newpos; 5346 Py_UNICODE *uni2; 5347 /* startpos for collecting unencodable chars */ 5348 Py_ssize_t collstartpos = *inpos; 5349 Py_ssize_t collendpos = *inpos+1; 5350 Py_ssize_t collpos; 5351 char *encoding = "charmap"; 5352 char *reason = "character maps to <undefined>"; 5353 charmapencode_result x; 5354 5355 /* find all unencodable characters */ 5356 while (collendpos < size) { 5357 PyObject *rep; 5358 if (Py_TYPE(mapping) == &EncodingMapType) { 5359 int res = encoding_map_lookup(p[collendpos], mapping); 5360 if (res != -1) 5361 break; 5362 ++collendpos; 5363 continue; 5364 } 5365 5366 rep = charmapencode_lookup(p[collendpos], mapping); 5367 if (rep==NULL) 5368 return -1; 5369 else if (rep!=Py_None) { 5370 Py_DECREF(rep); 5371 break; 5372 } 5373 Py_DECREF(rep); 5374 ++collendpos; 5375 } 5376 /* cache callback name lookup 5377 * (if not done yet, i.e. it's the first error) */ 5378 if (*known_errorHandler==-1) { 5379 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5380 *known_errorHandler = 1; 5381 else if (!strcmp(errors, "replace")) 5382 *known_errorHandler = 2; 5383 else if (!strcmp(errors, "ignore")) 5384 *known_errorHandler = 3; 5385 else if (!strcmp(errors, "xmlcharrefreplace")) 5386 *known_errorHandler = 4; 5387 else 5388 *known_errorHandler = 0; 5389 } 5390 switch (*known_errorHandler) { 5391 case 1: /* strict */ 5392 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5393 return -1; 5394 case 2: /* replace */ 5395 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5396 x = charmapencode_output('?', mapping, res, respos); 5397 if (x==enc_EXCEPTION) { 5398 return -1; 5399 } 5400 else if (x==enc_FAILED) { 5401 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5402 return -1; 5403 } 5404 } 5405 /* fall through */ 5406 case 3: /* ignore */ 5407 *inpos = collendpos; 5408 break; 5409 case 4: /* xmlcharrefreplace */ 5410 /* generate replacement (temporarily (mis)uses p) */ 5411 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5412 char buffer[2+29+1+1]; 5413 char *cp; 5414 sprintf(buffer, "&#%d;", (int)p[collpos]); 5415 for (cp = buffer; *cp; ++cp) { 5416 x = charmapencode_output(*cp, mapping, res, respos); 5417 if (x==enc_EXCEPTION) 5418 return -1; 5419 else if (x==enc_FAILED) { 5420 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5421 return -1; 5422 } 5423 } 5424 } 5425 *inpos = collendpos; 5426 break; 5427 default: 5428 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5429 encoding, reason, p, size, exceptionObject, 5430 collstartpos, collendpos, &newpos); 5431 if (repunicode == NULL) 5432 return -1; 5433 if (PyBytes_Check(repunicode)) { 5434 /* Directly copy bytes result to output. */ 5435 Py_ssize_t outsize = PyBytes_Size(*res); 5436 Py_ssize_t requiredsize; 5437 repsize = PyBytes_Size(repunicode); 5438 requiredsize = *respos + repsize; 5439 if (requiredsize > outsize) 5440 /* Make room for all additional bytes. */ 5441 if (charmapencode_resize(res, respos, requiredsize)) { 5442 Py_DECREF(repunicode); 5443 return -1; 5444 } 5445 memcpy(PyBytes_AsString(*res) + *respos, 5446 PyBytes_AsString(repunicode), repsize); 5447 *respos += repsize; 5448 *inpos = newpos; 5449 Py_DECREF(repunicode); 5450 break; 5451 } 5452 /* generate replacement */ 5453 repsize = PyUnicode_GET_SIZE(repunicode); 5454 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5455 x = charmapencode_output(*uni2, mapping, res, respos); 5456 if (x==enc_EXCEPTION) { 5457 return -1; 5458 } 5459 else if (x==enc_FAILED) { 5460 Py_DECREF(repunicode); 5461 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5462 return -1; 5463 } 5464 } 5465 *inpos = newpos; 5466 Py_DECREF(repunicode); 5467 } 5468 return 0; 5469} 5470 5471PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5472 Py_ssize_t size, 5473 PyObject *mapping, 5474 const char *errors) 5475{ 5476 /* output object */ 5477 PyObject *res = NULL; 5478 /* current input position */ 5479 Py_ssize_t inpos = 0; 5480 /* current output position */ 5481 Py_ssize_t respos = 0; 5482 PyObject *errorHandler = NULL; 5483 PyObject *exc = NULL; 5484 /* the following variable is used for caching string comparisons 5485 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5486 * 3=ignore, 4=xmlcharrefreplace */ 5487 int known_errorHandler = -1; 5488 5489 /* Default to Latin-1 */ 5490 if (mapping == NULL) 5491 return PyUnicode_EncodeLatin1(p, size, errors); 5492 5493 /* allocate enough for a simple encoding without 5494 replacements, if we need more, we'll resize */ 5495 res = PyBytes_FromStringAndSize(NULL, size); 5496 if (res == NULL) 5497 goto onError; 5498 if (size == 0) 5499 return res; 5500 5501 while (inpos<size) { 5502 /* try to encode it */ 5503 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5504 if (x==enc_EXCEPTION) /* error */ 5505 goto onError; 5506 if (x==enc_FAILED) { /* unencodable character */ 5507 if (charmap_encoding_error(p, size, &inpos, mapping, 5508 &exc, 5509 &known_errorHandler, &errorHandler, errors, 5510 &res, &respos)) { 5511 goto onError; 5512 } 5513 } 5514 else 5515 /* done with this character => adjust input position */ 5516 ++inpos; 5517 } 5518 5519 /* Resize if we allocated to much */ 5520 if (respos<PyBytes_GET_SIZE(res)) 5521 if (_PyBytes_Resize(&res, respos) < 0) 5522 goto onError; 5523 5524 Py_XDECREF(exc); 5525 Py_XDECREF(errorHandler); 5526 return res; 5527 5528 onError: 5529 Py_XDECREF(res); 5530 Py_XDECREF(exc); 5531 Py_XDECREF(errorHandler); 5532 return NULL; 5533} 5534 5535PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 5536 PyObject *mapping) 5537{ 5538 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5539 PyErr_BadArgument(); 5540 return NULL; 5541 } 5542 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5543 PyUnicode_GET_SIZE(unicode), 5544 mapping, 5545 NULL); 5546} 5547 5548/* create or adjust a UnicodeTranslateError */ 5549static void make_translate_exception(PyObject **exceptionObject, 5550 const Py_UNICODE *unicode, Py_ssize_t size, 5551 Py_ssize_t startpos, Py_ssize_t endpos, 5552 const char *reason) 5553{ 5554 if (*exceptionObject == NULL) { 5555 *exceptionObject = PyUnicodeTranslateError_Create( 5556 unicode, size, startpos, endpos, reason); 5557 } 5558 else { 5559 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5560 goto onError; 5561 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5562 goto onError; 5563 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5564 goto onError; 5565 return; 5566 onError: 5567 Py_DECREF(*exceptionObject); 5568 *exceptionObject = NULL; 5569 } 5570} 5571 5572/* raises a UnicodeTranslateError */ 5573static void raise_translate_exception(PyObject **exceptionObject, 5574 const Py_UNICODE *unicode, Py_ssize_t size, 5575 Py_ssize_t startpos, Py_ssize_t endpos, 5576 const char *reason) 5577{ 5578 make_translate_exception(exceptionObject, 5579 unicode, size, startpos, endpos, reason); 5580 if (*exceptionObject != NULL) 5581 PyCodec_StrictErrors(*exceptionObject); 5582} 5583 5584/* error handling callback helper: 5585 build arguments, call the callback and check the arguments, 5586 put the result into newpos and return the replacement string, which 5587 has to be freed by the caller */ 5588static PyObject *unicode_translate_call_errorhandler(const char *errors, 5589 PyObject **errorHandler, 5590 const char *reason, 5591 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5592 Py_ssize_t startpos, Py_ssize_t endpos, 5593 Py_ssize_t *newpos) 5594{ 5595 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5596 5597 Py_ssize_t i_newpos; 5598 PyObject *restuple; 5599 PyObject *resunicode; 5600 5601 if (*errorHandler == NULL) { 5602 *errorHandler = PyCodec_LookupError(errors); 5603 if (*errorHandler == NULL) 5604 return NULL; 5605 } 5606 5607 make_translate_exception(exceptionObject, 5608 unicode, size, startpos, endpos, reason); 5609 if (*exceptionObject == NULL) 5610 return NULL; 5611 5612 restuple = PyObject_CallFunctionObjArgs( 5613 *errorHandler, *exceptionObject, NULL); 5614 if (restuple == NULL) 5615 return NULL; 5616 if (!PyTuple_Check(restuple)) { 5617 PyErr_SetString(PyExc_TypeError, &argparse[4]); 5618 Py_DECREF(restuple); 5619 return NULL; 5620 } 5621 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 5622 &resunicode, &i_newpos)) { 5623 Py_DECREF(restuple); 5624 return NULL; 5625 } 5626 if (i_newpos<0) 5627 *newpos = size+i_newpos; 5628 else 5629 *newpos = i_newpos; 5630 if (*newpos<0 || *newpos>size) { 5631 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 5632 Py_DECREF(restuple); 5633 return NULL; 5634 } 5635 Py_INCREF(resunicode); 5636 Py_DECREF(restuple); 5637 return resunicode; 5638} 5639 5640/* Lookup the character ch in the mapping and put the result in result, 5641 which must be decrefed by the caller. 5642 Return 0 on success, -1 on error */ 5643static 5644int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5645{ 5646 PyObject *w = PyLong_FromLong((long)c); 5647 PyObject *x; 5648 5649 if (w == NULL) 5650 return -1; 5651 x = PyObject_GetItem(mapping, w); 5652 Py_DECREF(w); 5653 if (x == NULL) { 5654 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5655 /* No mapping found means: use 1:1 mapping. */ 5656 PyErr_Clear(); 5657 *result = NULL; 5658 return 0; 5659 } else 5660 return -1; 5661 } 5662 else if (x == Py_None) { 5663 *result = x; 5664 return 0; 5665 } 5666 else if (PyLong_Check(x)) { 5667 long value = PyLong_AS_LONG(x); 5668 long max = PyUnicode_GetMax(); 5669 if (value < 0 || value > max) { 5670 PyErr_Format(PyExc_TypeError, 5671 "character mapping must be in range(0x%x)", max+1); 5672 Py_DECREF(x); 5673 return -1; 5674 } 5675 *result = x; 5676 return 0; 5677 } 5678 else if (PyUnicode_Check(x)) { 5679 *result = x; 5680 return 0; 5681 } 5682 else { 5683 /* wrong return value */ 5684 PyErr_SetString(PyExc_TypeError, 5685 "character mapping must return integer, None or str"); 5686 Py_DECREF(x); 5687 return -1; 5688 } 5689} 5690/* ensure that *outobj is at least requiredsize characters long, 5691 if not reallocate and adjust various state variables. 5692 Return 0 on success, -1 on error */ 5693static 5694int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 5695 Py_ssize_t requiredsize) 5696{ 5697 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 5698 if (requiredsize > oldsize) { 5699 /* remember old output position */ 5700 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 5701 /* exponentially overallocate to minimize reallocations */ 5702 if (requiredsize < 2 * oldsize) 5703 requiredsize = 2 * oldsize; 5704 if (PyUnicode_Resize(outobj, requiredsize) < 0) 5705 return -1; 5706 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 5707 } 5708 return 0; 5709} 5710/* lookup the character, put the result in the output string and adjust 5711 various state variables. Return a new reference to the object that 5712 was put in the output buffer in *result, or Py_None, if the mapping was 5713 undefined (in which case no character was written). 5714 The called must decref result. 5715 Return 0 on success, -1 on error. */ 5716static 5717int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5718 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5719 PyObject **res) 5720{ 5721 if (charmaptranslate_lookup(*curinp, mapping, res)) 5722 return -1; 5723 if (*res==NULL) { 5724 /* not found => default to 1:1 mapping */ 5725 *(*outp)++ = *curinp; 5726 } 5727 else if (*res==Py_None) 5728 ; 5729 else if (PyLong_Check(*res)) { 5730 /* no overflow check, because we know that the space is enough */ 5731 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 5732 } 5733 else if (PyUnicode_Check(*res)) { 5734 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5735 if (repsize==1) { 5736 /* no overflow check, because we know that the space is enough */ 5737 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5738 } 5739 else if (repsize!=0) { 5740 /* more than one character */ 5741 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5742 (insize - (curinp-startinp)) + 5743 repsize - 1; 5744 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5745 return -1; 5746 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5747 *outp += repsize; 5748 } 5749 } 5750 else 5751 return -1; 5752 return 0; 5753} 5754 5755PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5756 Py_ssize_t size, 5757 PyObject *mapping, 5758 const char *errors) 5759{ 5760 /* output object */ 5761 PyObject *res = NULL; 5762 /* pointers to the beginning and end+1 of input */ 5763 const Py_UNICODE *startp = p; 5764 const Py_UNICODE *endp = p + size; 5765 /* pointer into the output */ 5766 Py_UNICODE *str; 5767 /* current output position */ 5768 Py_ssize_t respos = 0; 5769 char *reason = "character maps to <undefined>"; 5770 PyObject *errorHandler = NULL; 5771 PyObject *exc = NULL; 5772 /* the following variable is used for caching string comparisons 5773 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5774 * 3=ignore, 4=xmlcharrefreplace */ 5775 int known_errorHandler = -1; 5776 5777 if (mapping == NULL) { 5778 PyErr_BadArgument(); 5779 return NULL; 5780 } 5781 5782 /* allocate enough for a simple 1:1 translation without 5783 replacements, if we need more, we'll resize */ 5784 res = PyUnicode_FromUnicode(NULL, size); 5785 if (res == NULL) 5786 goto onError; 5787 if (size == 0) 5788 return res; 5789 str = PyUnicode_AS_UNICODE(res); 5790 5791 while (p<endp) { 5792 /* try to encode it */ 5793 PyObject *x = NULL; 5794 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5795 Py_XDECREF(x); 5796 goto onError; 5797 } 5798 Py_XDECREF(x); 5799 if (x!=Py_None) /* it worked => adjust input pointer */ 5800 ++p; 5801 else { /* untranslatable character */ 5802 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5803 Py_ssize_t repsize; 5804 Py_ssize_t newpos; 5805 Py_UNICODE *uni2; 5806 /* startpos for collecting untranslatable chars */ 5807 const Py_UNICODE *collstart = p; 5808 const Py_UNICODE *collend = p+1; 5809 const Py_UNICODE *coll; 5810 5811 /* find all untranslatable characters */ 5812 while (collend < endp) { 5813 if (charmaptranslate_lookup(*collend, mapping, &x)) 5814 goto onError; 5815 Py_XDECREF(x); 5816 if (x!=Py_None) 5817 break; 5818 ++collend; 5819 } 5820 /* cache callback name lookup 5821 * (if not done yet, i.e. it's the first error) */ 5822 if (known_errorHandler==-1) { 5823 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5824 known_errorHandler = 1; 5825 else if (!strcmp(errors, "replace")) 5826 known_errorHandler = 2; 5827 else if (!strcmp(errors, "ignore")) 5828 known_errorHandler = 3; 5829 else if (!strcmp(errors, "xmlcharrefreplace")) 5830 known_errorHandler = 4; 5831 else 5832 known_errorHandler = 0; 5833 } 5834 switch (known_errorHandler) { 5835 case 1: /* strict */ 5836 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5837 goto onError; 5838 case 2: /* replace */ 5839 /* No need to check for space, this is a 1:1 replacement */ 5840 for (coll = collstart; coll<collend; ++coll) 5841 *str++ = '?'; 5842 /* fall through */ 5843 case 3: /* ignore */ 5844 p = collend; 5845 break; 5846 case 4: /* xmlcharrefreplace */ 5847 /* generate replacement (temporarily (mis)uses p) */ 5848 for (p = collstart; p < collend; ++p) { 5849 char buffer[2+29+1+1]; 5850 char *cp; 5851 sprintf(buffer, "&#%d;", (int)*p); 5852 if (charmaptranslate_makespace(&res, &str, 5853 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 5854 goto onError; 5855 for (cp = buffer; *cp; ++cp) 5856 *str++ = *cp; 5857 } 5858 p = collend; 5859 break; 5860 default: 5861 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 5862 reason, startp, size, &exc, 5863 collstart-startp, collend-startp, &newpos); 5864 if (repunicode == NULL) 5865 goto onError; 5866 /* generate replacement */ 5867 repsize = PyUnicode_GET_SIZE(repunicode); 5868 if (charmaptranslate_makespace(&res, &str, 5869 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 5870 Py_DECREF(repunicode); 5871 goto onError; 5872 } 5873 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 5874 *str++ = *uni2; 5875 p = startp + newpos; 5876 Py_DECREF(repunicode); 5877 } 5878 } 5879 } 5880 /* Resize if we allocated to much */ 5881 respos = str-PyUnicode_AS_UNICODE(res); 5882 if (respos<PyUnicode_GET_SIZE(res)) { 5883 if (PyUnicode_Resize(&res, respos) < 0) 5884 goto onError; 5885 } 5886 Py_XDECREF(exc); 5887 Py_XDECREF(errorHandler); 5888 return res; 5889 5890 onError: 5891 Py_XDECREF(res); 5892 Py_XDECREF(exc); 5893 Py_XDECREF(errorHandler); 5894 return NULL; 5895} 5896 5897PyObject *PyUnicode_Translate(PyObject *str, 5898 PyObject *mapping, 5899 const char *errors) 5900{ 5901 PyObject *result; 5902 5903 str = PyUnicode_FromObject(str); 5904 if (str == NULL) 5905 goto onError; 5906 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 5907 PyUnicode_GET_SIZE(str), 5908 mapping, 5909 errors); 5910 Py_DECREF(str); 5911 return result; 5912 5913 onError: 5914 Py_XDECREF(str); 5915 return NULL; 5916} 5917 5918/* --- Decimal Encoder ---------------------------------------------------- */ 5919 5920int PyUnicode_EncodeDecimal(Py_UNICODE *s, 5921 Py_ssize_t length, 5922 char *output, 5923 const char *errors) 5924{ 5925 Py_UNICODE *p, *end; 5926 PyObject *errorHandler = NULL; 5927 PyObject *exc = NULL; 5928 const char *encoding = "decimal"; 5929 const char *reason = "invalid decimal Unicode string"; 5930 /* the following variable is used for caching string comparisons 5931 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 5932 int known_errorHandler = -1; 5933 5934 if (output == NULL) { 5935 PyErr_BadArgument(); 5936 return -1; 5937 } 5938 5939 p = s; 5940 end = s + length; 5941 while (p < end) { 5942 register Py_UNICODE ch = *p; 5943 int decimal; 5944 PyObject *repunicode; 5945 Py_ssize_t repsize; 5946 Py_ssize_t newpos; 5947 Py_UNICODE *uni2; 5948 Py_UNICODE *collstart; 5949 Py_UNICODE *collend; 5950 5951 if (Py_UNICODE_ISSPACE(ch)) { 5952 *output++ = ' '; 5953 ++p; 5954 continue; 5955 } 5956 decimal = Py_UNICODE_TODECIMAL(ch); 5957 if (decimal >= 0) { 5958 *output++ = '0' + decimal; 5959 ++p; 5960 continue; 5961 } 5962 if (0 < ch && ch < 256) { 5963 *output++ = (char)ch; 5964 ++p; 5965 continue; 5966 } 5967 /* All other characters are considered unencodable */ 5968 collstart = p; 5969 collend = p+1; 5970 while (collend < end) { 5971 if ((0 < *collend && *collend < 256) || 5972 !Py_UNICODE_ISSPACE(*collend) || 5973 Py_UNICODE_TODECIMAL(*collend)) 5974 break; 5975 } 5976 /* cache callback name lookup 5977 * (if not done yet, i.e. it's the first error) */ 5978 if (known_errorHandler==-1) { 5979 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5980 known_errorHandler = 1; 5981 else if (!strcmp(errors, "replace")) 5982 known_errorHandler = 2; 5983 else if (!strcmp(errors, "ignore")) 5984 known_errorHandler = 3; 5985 else if (!strcmp(errors, "xmlcharrefreplace")) 5986 known_errorHandler = 4; 5987 else 5988 known_errorHandler = 0; 5989 } 5990 switch (known_errorHandler) { 5991 case 1: /* strict */ 5992 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 5993 goto onError; 5994 case 2: /* replace */ 5995 for (p = collstart; p < collend; ++p) 5996 *output++ = '?'; 5997 /* fall through */ 5998 case 3: /* ignore */ 5999 p = collend; 6000 break; 6001 case 4: /* xmlcharrefreplace */ 6002 /* generate replacement (temporarily (mis)uses p) */ 6003 for (p = collstart; p < collend; ++p) 6004 output += sprintf(output, "&#%d;", (int)*p); 6005 p = collend; 6006 break; 6007 default: 6008 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6009 encoding, reason, s, length, &exc, 6010 collstart-s, collend-s, &newpos); 6011 if (repunicode == NULL) 6012 goto onError; 6013 if (!PyUnicode_Check(repunicode)) { 6014 /* Byte results not supported, since they have no decimal property. */ 6015 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 6016 Py_DECREF(repunicode); 6017 goto onError; 6018 } 6019 /* generate replacement */ 6020 repsize = PyUnicode_GET_SIZE(repunicode); 6021 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 6022 Py_UNICODE ch = *uni2; 6023 if (Py_UNICODE_ISSPACE(ch)) 6024 *output++ = ' '; 6025 else { 6026 decimal = Py_UNICODE_TODECIMAL(ch); 6027 if (decimal >= 0) 6028 *output++ = '0' + decimal; 6029 else if (0 < ch && ch < 256) 6030 *output++ = (char)ch; 6031 else { 6032 Py_DECREF(repunicode); 6033 raise_encode_exception(&exc, encoding, 6034 s, length, collstart-s, collend-s, reason); 6035 goto onError; 6036 } 6037 } 6038 } 6039 p = s + newpos; 6040 Py_DECREF(repunicode); 6041 } 6042 } 6043 /* 0-terminate the output string */ 6044 *output++ = '\0'; 6045 Py_XDECREF(exc); 6046 Py_XDECREF(errorHandler); 6047 return 0; 6048 6049 onError: 6050 Py_XDECREF(exc); 6051 Py_XDECREF(errorHandler); 6052 return -1; 6053} 6054 6055/* --- Helpers ------------------------------------------------------------ */ 6056 6057#include "stringlib/unicodedefs.h" 6058#include "stringlib/fastsearch.h" 6059 6060#include "stringlib/count.h" 6061#include "stringlib/find.h" 6062#include "stringlib/partition.h" 6063#include "stringlib/split.h" 6064 6065#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 6066#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale 6067#include "stringlib/localeutil.h" 6068 6069/* helper macro to fixup start/end slice values */ 6070#define ADJUST_INDICES(start, end, len) \ 6071 if (end > len) \ 6072 end = len; \ 6073 else if (end < 0) { \ 6074 end += len; \ 6075 if (end < 0) \ 6076 end = 0; \ 6077 } \ 6078 if (start < 0) { \ 6079 start += len; \ 6080 if (start < 0) \ 6081 start = 0; \ 6082 } 6083 6084Py_ssize_t PyUnicode_Count(PyObject *str, 6085 PyObject *substr, 6086 Py_ssize_t start, 6087 Py_ssize_t end) 6088{ 6089 Py_ssize_t result; 6090 PyUnicodeObject* str_obj; 6091 PyUnicodeObject* sub_obj; 6092 6093 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6094 if (!str_obj) 6095 return -1; 6096 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6097 if (!sub_obj) { 6098 Py_DECREF(str_obj); 6099 return -1; 6100 } 6101 6102 ADJUST_INDICES(start, end, str_obj->length); 6103 result = stringlib_count( 6104 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6105 PY_SSIZE_T_MAX 6106 ); 6107 6108 Py_DECREF(sub_obj); 6109 Py_DECREF(str_obj); 6110 6111 return result; 6112} 6113 6114Py_ssize_t PyUnicode_Find(PyObject *str, 6115 PyObject *sub, 6116 Py_ssize_t start, 6117 Py_ssize_t end, 6118 int direction) 6119{ 6120 Py_ssize_t result; 6121 6122 str = PyUnicode_FromObject(str); 6123 if (!str) 6124 return -2; 6125 sub = PyUnicode_FromObject(sub); 6126 if (!sub) { 6127 Py_DECREF(str); 6128 return -2; 6129 } 6130 6131 if (direction > 0) 6132 result = stringlib_find_slice( 6133 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6134 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6135 start, end 6136 ); 6137 else 6138 result = stringlib_rfind_slice( 6139 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6140 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6141 start, end 6142 ); 6143 6144 Py_DECREF(str); 6145 Py_DECREF(sub); 6146 6147 return result; 6148} 6149 6150static 6151int tailmatch(PyUnicodeObject *self, 6152 PyUnicodeObject *substring, 6153 Py_ssize_t start, 6154 Py_ssize_t end, 6155 int direction) 6156{ 6157 if (substring->length == 0) 6158 return 1; 6159 6160 ADJUST_INDICES(start, end, self->length); 6161 end -= substring->length; 6162 if (end < start) 6163 return 0; 6164 6165 if (direction > 0) { 6166 if (Py_UNICODE_MATCH(self, end, substring)) 6167 return 1; 6168 } else { 6169 if (Py_UNICODE_MATCH(self, start, substring)) 6170 return 1; 6171 } 6172 6173 return 0; 6174} 6175 6176Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 6177 PyObject *substr, 6178 Py_ssize_t start, 6179 Py_ssize_t end, 6180 int direction) 6181{ 6182 Py_ssize_t result; 6183 6184 str = PyUnicode_FromObject(str); 6185 if (str == NULL) 6186 return -1; 6187 substr = PyUnicode_FromObject(substr); 6188 if (substr == NULL) { 6189 Py_DECREF(str); 6190 return -1; 6191 } 6192 6193 result = tailmatch((PyUnicodeObject *)str, 6194 (PyUnicodeObject *)substr, 6195 start, end, direction); 6196 Py_DECREF(str); 6197 Py_DECREF(substr); 6198 return result; 6199} 6200 6201/* Apply fixfct filter to the Unicode object self and return a 6202 reference to the modified object */ 6203 6204static 6205PyObject *fixup(PyUnicodeObject *self, 6206 int (*fixfct)(PyUnicodeObject *s)) 6207{ 6208 6209 PyUnicodeObject *u; 6210 6211 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6212 if (u == NULL) 6213 return NULL; 6214 6215 Py_UNICODE_COPY(u->str, self->str, self->length); 6216 6217 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6218 /* fixfct should return TRUE if it modified the buffer. If 6219 FALSE, return a reference to the original buffer instead 6220 (to save space, not time) */ 6221 Py_INCREF(self); 6222 Py_DECREF(u); 6223 return (PyObject*) self; 6224 } 6225 return (PyObject*) u; 6226} 6227 6228static 6229int fixupper(PyUnicodeObject *self) 6230{ 6231 Py_ssize_t len = self->length; 6232 Py_UNICODE *s = self->str; 6233 int status = 0; 6234 6235 while (len-- > 0) { 6236 register Py_UNICODE ch; 6237 6238 ch = Py_UNICODE_TOUPPER(*s); 6239 if (ch != *s) { 6240 status = 1; 6241 *s = ch; 6242 } 6243 s++; 6244 } 6245 6246 return status; 6247} 6248 6249static 6250int fixlower(PyUnicodeObject *self) 6251{ 6252 Py_ssize_t len = self->length; 6253 Py_UNICODE *s = self->str; 6254 int status = 0; 6255 6256 while (len-- > 0) { 6257 register Py_UNICODE ch; 6258 6259 ch = Py_UNICODE_TOLOWER(*s); 6260 if (ch != *s) { 6261 status = 1; 6262 *s = ch; 6263 } 6264 s++; 6265 } 6266 6267 return status; 6268} 6269 6270static 6271int fixswapcase(PyUnicodeObject *self) 6272{ 6273 Py_ssize_t len = self->length; 6274 Py_UNICODE *s = self->str; 6275 int status = 0; 6276 6277 while (len-- > 0) { 6278 if (Py_UNICODE_ISUPPER(*s)) { 6279 *s = Py_UNICODE_TOLOWER(*s); 6280 status = 1; 6281 } else if (Py_UNICODE_ISLOWER(*s)) { 6282 *s = Py_UNICODE_TOUPPER(*s); 6283 status = 1; 6284 } 6285 s++; 6286 } 6287 6288 return status; 6289} 6290 6291static 6292int fixcapitalize(PyUnicodeObject *self) 6293{ 6294 Py_ssize_t len = self->length; 6295 Py_UNICODE *s = self->str; 6296 int status = 0; 6297 6298 if (len == 0) 6299 return 0; 6300 if (Py_UNICODE_ISLOWER(*s)) { 6301 *s = Py_UNICODE_TOUPPER(*s); 6302 status = 1; 6303 } 6304 s++; 6305 while (--len > 0) { 6306 if (Py_UNICODE_ISUPPER(*s)) { 6307 *s = Py_UNICODE_TOLOWER(*s); 6308 status = 1; 6309 } 6310 s++; 6311 } 6312 return status; 6313} 6314 6315static 6316int fixtitle(PyUnicodeObject *self) 6317{ 6318 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6319 register Py_UNICODE *e; 6320 int previous_is_cased; 6321 6322 /* Shortcut for single character strings */ 6323 if (PyUnicode_GET_SIZE(self) == 1) { 6324 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6325 if (*p != ch) { 6326 *p = ch; 6327 return 1; 6328 } 6329 else 6330 return 0; 6331 } 6332 6333 e = p + PyUnicode_GET_SIZE(self); 6334 previous_is_cased = 0; 6335 for (; p < e; p++) { 6336 register const Py_UNICODE ch = *p; 6337 6338 if (previous_is_cased) 6339 *p = Py_UNICODE_TOLOWER(ch); 6340 else 6341 *p = Py_UNICODE_TOTITLE(ch); 6342 6343 if (Py_UNICODE_ISLOWER(ch) || 6344 Py_UNICODE_ISUPPER(ch) || 6345 Py_UNICODE_ISTITLE(ch)) 6346 previous_is_cased = 1; 6347 else 6348 previous_is_cased = 0; 6349 } 6350 return 1; 6351} 6352 6353PyObject * 6354PyUnicode_Join(PyObject *separator, PyObject *seq) 6355{ 6356 const Py_UNICODE blank = ' '; 6357 const Py_UNICODE *sep = ␣ 6358 Py_ssize_t seplen = 1; 6359 PyUnicodeObject *res = NULL; /* the result */ 6360 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6361 PyObject *fseq; /* PySequence_Fast(seq) */ 6362 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6363 PyObject **items; 6364 PyObject *item; 6365 Py_ssize_t sz, i; 6366 6367 fseq = PySequence_Fast(seq, ""); 6368 if (fseq == NULL) { 6369 return NULL; 6370 } 6371 6372 /* NOTE: the following code can't call back into Python code, 6373 * so we are sure that fseq won't be mutated. 6374 */ 6375 6376 seqlen = PySequence_Fast_GET_SIZE(fseq); 6377 /* If empty sequence, return u"". */ 6378 if (seqlen == 0) { 6379 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6380 goto Done; 6381 } 6382 items = PySequence_Fast_ITEMS(fseq); 6383 /* If singleton sequence with an exact Unicode, return that. */ 6384 if (seqlen == 1) { 6385 item = items[0]; 6386 if (PyUnicode_CheckExact(item)) { 6387 Py_INCREF(item); 6388 res = (PyUnicodeObject *)item; 6389 goto Done; 6390 } 6391 } 6392 else { 6393 /* Set up sep and seplen */ 6394 if (separator == NULL) { 6395 sep = ␣ 6396 seplen = 1; 6397 } 6398 else { 6399 if (!PyUnicode_Check(separator)) { 6400 PyErr_Format(PyExc_TypeError, 6401 "separator: expected str instance," 6402 " %.80s found", 6403 Py_TYPE(separator)->tp_name); 6404 goto onError; 6405 } 6406 sep = PyUnicode_AS_UNICODE(separator); 6407 seplen = PyUnicode_GET_SIZE(separator); 6408 } 6409 } 6410 6411 /* There are at least two things to join, or else we have a subclass 6412 * of str in the sequence. 6413 * Do a pre-pass to figure out the total amount of space we'll 6414 * need (sz), and see whether all argument are strings. 6415 */ 6416 sz = 0; 6417 for (i = 0; i < seqlen; i++) { 6418 const Py_ssize_t old_sz = sz; 6419 item = items[i]; 6420 if (!PyUnicode_Check(item)) { 6421 PyErr_Format(PyExc_TypeError, 6422 "sequence item %zd: expected str instance," 6423 " %.80s found", 6424 i, Py_TYPE(item)->tp_name); 6425 goto onError; 6426 } 6427 sz += PyUnicode_GET_SIZE(item); 6428 if (i != 0) 6429 sz += seplen; 6430 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6431 PyErr_SetString(PyExc_OverflowError, 6432 "join() result is too long for a Python string"); 6433 goto onError; 6434 } 6435 } 6436 6437 res = _PyUnicode_New(sz); 6438 if (res == NULL) 6439 goto onError; 6440 6441 /* Catenate everything. */ 6442 res_p = PyUnicode_AS_UNICODE(res); 6443 for (i = 0; i < seqlen; ++i) { 6444 Py_ssize_t itemlen; 6445 item = items[i]; 6446 itemlen = PyUnicode_GET_SIZE(item); 6447 /* Copy item, and maybe the separator. */ 6448 if (i) { 6449 Py_UNICODE_COPY(res_p, sep, seplen); 6450 res_p += seplen; 6451 } 6452 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6453 res_p += itemlen; 6454 } 6455 6456 Done: 6457 Py_DECREF(fseq); 6458 return (PyObject *)res; 6459 6460 onError: 6461 Py_DECREF(fseq); 6462 Py_XDECREF(res); 6463 return NULL; 6464} 6465 6466static 6467PyUnicodeObject *pad(PyUnicodeObject *self, 6468 Py_ssize_t left, 6469 Py_ssize_t right, 6470 Py_UNICODE fill) 6471{ 6472 PyUnicodeObject *u; 6473 6474 if (left < 0) 6475 left = 0; 6476 if (right < 0) 6477 right = 0; 6478 6479 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6480 Py_INCREF(self); 6481 return self; 6482 } 6483 6484 if (left > PY_SSIZE_T_MAX - self->length || 6485 right > PY_SSIZE_T_MAX - (left + self->length)) { 6486 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6487 return NULL; 6488 } 6489 u = _PyUnicode_New(left + self->length + right); 6490 if (u) { 6491 if (left) 6492 Py_UNICODE_FILL(u->str, fill, left); 6493 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6494 if (right) 6495 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6496 } 6497 6498 return u; 6499} 6500 6501PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 6502{ 6503 PyObject *list; 6504 6505 string = PyUnicode_FromObject(string); 6506 if (string == NULL) 6507 return NULL; 6508 6509 list = stringlib_splitlines( 6510 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6511 PyUnicode_GET_SIZE(string), keepends); 6512 6513 Py_DECREF(string); 6514 return list; 6515} 6516 6517static 6518PyObject *split(PyUnicodeObject *self, 6519 PyUnicodeObject *substring, 6520 Py_ssize_t maxcount) 6521{ 6522 if (maxcount < 0) 6523 maxcount = PY_SSIZE_T_MAX; 6524 6525 if (substring == NULL) 6526 return stringlib_split_whitespace( 6527 (PyObject*) self, self->str, self->length, maxcount 6528 ); 6529 6530 return stringlib_split( 6531 (PyObject*) self, self->str, self->length, 6532 substring->str, substring->length, 6533 maxcount 6534 ); 6535} 6536 6537static 6538PyObject *rsplit(PyUnicodeObject *self, 6539 PyUnicodeObject *substring, 6540 Py_ssize_t maxcount) 6541{ 6542 if (maxcount < 0) 6543 maxcount = PY_SSIZE_T_MAX; 6544 6545 if (substring == NULL) 6546 return stringlib_rsplit_whitespace( 6547 (PyObject*) self, self->str, self->length, maxcount 6548 ); 6549 6550 return stringlib_rsplit( 6551 (PyObject*) self, self->str, self->length, 6552 substring->str, substring->length, 6553 maxcount 6554 ); 6555} 6556 6557static 6558PyObject *replace(PyUnicodeObject *self, 6559 PyUnicodeObject *str1, 6560 PyUnicodeObject *str2, 6561 Py_ssize_t maxcount) 6562{ 6563 PyUnicodeObject *u; 6564 6565 if (maxcount < 0) 6566 maxcount = PY_SSIZE_T_MAX; 6567 else if (maxcount == 0 || self->length == 0) 6568 goto nothing; 6569 6570 if (str1->length == str2->length) { 6571 Py_ssize_t i; 6572 /* same length */ 6573 if (str1->length == 0) 6574 goto nothing; 6575 if (str1->length == 1) { 6576 /* replace characters */ 6577 Py_UNICODE u1, u2; 6578 if (!findchar(self->str, self->length, str1->str[0])) 6579 goto nothing; 6580 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6581 if (!u) 6582 return NULL; 6583 Py_UNICODE_COPY(u->str, self->str, self->length); 6584 u1 = str1->str[0]; 6585 u2 = str2->str[0]; 6586 for (i = 0; i < u->length; i++) 6587 if (u->str[i] == u1) { 6588 if (--maxcount < 0) 6589 break; 6590 u->str[i] = u2; 6591 } 6592 } else { 6593 i = stringlib_find( 6594 self->str, self->length, str1->str, str1->length, 0 6595 ); 6596 if (i < 0) 6597 goto nothing; 6598 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6599 if (!u) 6600 return NULL; 6601 Py_UNICODE_COPY(u->str, self->str, self->length); 6602 6603 /* change everything in-place, starting with this one */ 6604 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6605 i += str1->length; 6606 6607 while ( --maxcount > 0) { 6608 i = stringlib_find(self->str+i, self->length-i, 6609 str1->str, str1->length, 6610 i); 6611 if (i == -1) 6612 break; 6613 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6614 i += str1->length; 6615 } 6616 } 6617 } else { 6618 6619 Py_ssize_t n, i, j, e; 6620 Py_ssize_t product, new_size, delta; 6621 Py_UNICODE *p; 6622 6623 /* replace strings */ 6624 n = stringlib_count(self->str, self->length, str1->str, str1->length, 6625 maxcount); 6626 if (n == 0) 6627 goto nothing; 6628 /* new_size = self->length + n * (str2->length - str1->length)); */ 6629 delta = (str2->length - str1->length); 6630 if (delta == 0) { 6631 new_size = self->length; 6632 } else { 6633 product = n * (str2->length - str1->length); 6634 if ((product / (str2->length - str1->length)) != n) { 6635 PyErr_SetString(PyExc_OverflowError, 6636 "replace string is too long"); 6637 return NULL; 6638 } 6639 new_size = self->length + product; 6640 if (new_size < 0) { 6641 PyErr_SetString(PyExc_OverflowError, 6642 "replace string is too long"); 6643 return NULL; 6644 } 6645 } 6646 u = _PyUnicode_New(new_size); 6647 if (!u) 6648 return NULL; 6649 i = 0; 6650 p = u->str; 6651 e = self->length - str1->length; 6652 if (str1->length > 0) { 6653 while (n-- > 0) { 6654 /* look for next match */ 6655 j = stringlib_find(self->str+i, self->length-i, 6656 str1->str, str1->length, 6657 i); 6658 if (j == -1) 6659 break; 6660 else if (j > i) { 6661 /* copy unchanged part [i:j] */ 6662 Py_UNICODE_COPY(p, self->str+i, j-i); 6663 p += j - i; 6664 } 6665 /* copy substitution string */ 6666 if (str2->length > 0) { 6667 Py_UNICODE_COPY(p, str2->str, str2->length); 6668 p += str2->length; 6669 } 6670 i = j + str1->length; 6671 } 6672 if (i < self->length) 6673 /* copy tail [i:] */ 6674 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6675 } else { 6676 /* interleave */ 6677 while (n > 0) { 6678 Py_UNICODE_COPY(p, str2->str, str2->length); 6679 p += str2->length; 6680 if (--n <= 0) 6681 break; 6682 *p++ = self->str[i++]; 6683 } 6684 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6685 } 6686 } 6687 return (PyObject *) u; 6688 6689 nothing: 6690 /* nothing to replace; return original string (when possible) */ 6691 if (PyUnicode_CheckExact(self)) { 6692 Py_INCREF(self); 6693 return (PyObject *) self; 6694 } 6695 return PyUnicode_FromUnicode(self->str, self->length); 6696} 6697 6698/* --- Unicode Object Methods --------------------------------------------- */ 6699 6700PyDoc_STRVAR(title__doc__, 6701 "S.title() -> str\n\ 6702\n\ 6703Return a titlecased version of S, i.e. words start with title case\n\ 6704characters, all remaining cased characters have lower case."); 6705 6706static PyObject* 6707unicode_title(PyUnicodeObject *self) 6708{ 6709 return fixup(self, fixtitle); 6710} 6711 6712PyDoc_STRVAR(capitalize__doc__, 6713 "S.capitalize() -> str\n\ 6714\n\ 6715Return a capitalized version of S, i.e. make the first character\n\ 6716have upper case and the rest lower case."); 6717 6718static PyObject* 6719unicode_capitalize(PyUnicodeObject *self) 6720{ 6721 return fixup(self, fixcapitalize); 6722} 6723 6724#if 0 6725PyDoc_STRVAR(capwords__doc__, 6726 "S.capwords() -> str\n\ 6727\n\ 6728Apply .capitalize() to all words in S and return the result with\n\ 6729normalized whitespace (all whitespace strings are replaced by ' ')."); 6730 6731static PyObject* 6732unicode_capwords(PyUnicodeObject *self) 6733{ 6734 PyObject *list; 6735 PyObject *item; 6736 Py_ssize_t i; 6737 6738 /* Split into words */ 6739 list = split(self, NULL, -1); 6740 if (!list) 6741 return NULL; 6742 6743 /* Capitalize each word */ 6744 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6745 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6746 fixcapitalize); 6747 if (item == NULL) 6748 goto onError; 6749 Py_DECREF(PyList_GET_ITEM(list, i)); 6750 PyList_SET_ITEM(list, i, item); 6751 } 6752 6753 /* Join the words to form a new string */ 6754 item = PyUnicode_Join(NULL, list); 6755 6756 onError: 6757 Py_DECREF(list); 6758 return (PyObject *)item; 6759} 6760#endif 6761 6762/* Argument converter. Coerces to a single unicode character */ 6763 6764static int 6765convert_uc(PyObject *obj, void *addr) 6766{ 6767 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6768 PyObject *uniobj; 6769 Py_UNICODE *unistr; 6770 6771 uniobj = PyUnicode_FromObject(obj); 6772 if (uniobj == NULL) { 6773 PyErr_SetString(PyExc_TypeError, 6774 "The fill character cannot be converted to Unicode"); 6775 return 0; 6776 } 6777 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6778 PyErr_SetString(PyExc_TypeError, 6779 "The fill character must be exactly one character long"); 6780 Py_DECREF(uniobj); 6781 return 0; 6782 } 6783 unistr = PyUnicode_AS_UNICODE(uniobj); 6784 *fillcharloc = unistr[0]; 6785 Py_DECREF(uniobj); 6786 return 1; 6787} 6788 6789PyDoc_STRVAR(center__doc__, 6790 "S.center(width[, fillchar]) -> str\n\ 6791\n\ 6792Return S centered in a string of length width. Padding is\n\ 6793done using the specified fill character (default is a space)"); 6794 6795static PyObject * 6796unicode_center(PyUnicodeObject *self, PyObject *args) 6797{ 6798 Py_ssize_t marg, left; 6799 Py_ssize_t width; 6800 Py_UNICODE fillchar = ' '; 6801 6802 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6803 return NULL; 6804 6805 if (self->length >= width && PyUnicode_CheckExact(self)) { 6806 Py_INCREF(self); 6807 return (PyObject*) self; 6808 } 6809 6810 marg = width - self->length; 6811 left = marg / 2 + (marg & width & 1); 6812 6813 return (PyObject*) pad(self, left, marg - left, fillchar); 6814} 6815 6816#if 0 6817 6818/* This code should go into some future Unicode collation support 6819 module. The basic comparison should compare ordinals on a naive 6820 basis (this is what Java does and thus Jython too). */ 6821 6822/* speedy UTF-16 code point order comparison */ 6823/* gleaned from: */ 6824/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 6825 6826static short utf16Fixup[32] = 6827{ 6828 0, 0, 0, 0, 0, 0, 0, 0, 6829 0, 0, 0, 0, 0, 0, 0, 0, 6830 0, 0, 0, 0, 0, 0, 0, 0, 6831 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 6832}; 6833 6834static int 6835unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6836{ 6837 Py_ssize_t len1, len2; 6838 6839 Py_UNICODE *s1 = str1->str; 6840 Py_UNICODE *s2 = str2->str; 6841 6842 len1 = str1->length; 6843 len2 = str2->length; 6844 6845 while (len1 > 0 && len2 > 0) { 6846 Py_UNICODE c1, c2; 6847 6848 c1 = *s1++; 6849 c2 = *s2++; 6850 6851 if (c1 > (1<<11) * 26) 6852 c1 += utf16Fixup[c1>>11]; 6853 if (c2 > (1<<11) * 26) 6854 c2 += utf16Fixup[c2>>11]; 6855 /* now c1 and c2 are in UTF-32-compatible order */ 6856 6857 if (c1 != c2) 6858 return (c1 < c2) ? -1 : 1; 6859 6860 len1--; len2--; 6861 } 6862 6863 return (len1 < len2) ? -1 : (len1 != len2); 6864} 6865 6866#else 6867 6868static int 6869unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 6870{ 6871 register Py_ssize_t len1, len2; 6872 6873 Py_UNICODE *s1 = str1->str; 6874 Py_UNICODE *s2 = str2->str; 6875 6876 len1 = str1->length; 6877 len2 = str2->length; 6878 6879 while (len1 > 0 && len2 > 0) { 6880 Py_UNICODE c1, c2; 6881 6882 c1 = *s1++; 6883 c2 = *s2++; 6884 6885 if (c1 != c2) 6886 return (c1 < c2) ? -1 : 1; 6887 6888 len1--; len2--; 6889 } 6890 6891 return (len1 < len2) ? -1 : (len1 != len2); 6892} 6893 6894#endif 6895 6896int PyUnicode_Compare(PyObject *left, 6897 PyObject *right) 6898{ 6899 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 6900 return unicode_compare((PyUnicodeObject *)left, 6901 (PyUnicodeObject *)right); 6902 PyErr_Format(PyExc_TypeError, 6903 "Can't compare %.100s and %.100s", 6904 left->ob_type->tp_name, 6905 right->ob_type->tp_name); 6906 return -1; 6907} 6908 6909int 6910PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 6911{ 6912 int i; 6913 Py_UNICODE *id; 6914 assert(PyUnicode_Check(uni)); 6915 id = PyUnicode_AS_UNICODE(uni); 6916 /* Compare Unicode string and source character set string */ 6917 for (i = 0; id[i] && str[i]; i++) 6918 if (id[i] != str[i]) 6919 return ((int)id[i] < (int)str[i]) ? -1 : 1; 6920 /* This check keeps Python strings that end in '\0' from comparing equal 6921 to C strings identical up to that point. */ 6922 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 6923 return 1; /* uni is longer */ 6924 if (str[i]) 6925 return -1; /* str is longer */ 6926 return 0; 6927} 6928 6929 6930#define TEST_COND(cond) \ 6931 ((cond) ? Py_True : Py_False) 6932 6933PyObject *PyUnicode_RichCompare(PyObject *left, 6934 PyObject *right, 6935 int op) 6936{ 6937 int result; 6938 6939 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 6940 PyObject *v; 6941 if (((PyUnicodeObject *) left)->length != 6942 ((PyUnicodeObject *) right)->length) { 6943 if (op == Py_EQ) { 6944 Py_INCREF(Py_False); 6945 return Py_False; 6946 } 6947 if (op == Py_NE) { 6948 Py_INCREF(Py_True); 6949 return Py_True; 6950 } 6951 } 6952 if (left == right) 6953 result = 0; 6954 else 6955 result = unicode_compare((PyUnicodeObject *)left, 6956 (PyUnicodeObject *)right); 6957 6958 /* Convert the return value to a Boolean */ 6959 switch (op) { 6960 case Py_EQ: 6961 v = TEST_COND(result == 0); 6962 break; 6963 case Py_NE: 6964 v = TEST_COND(result != 0); 6965 break; 6966 case Py_LE: 6967 v = TEST_COND(result <= 0); 6968 break; 6969 case Py_GE: 6970 v = TEST_COND(result >= 0); 6971 break; 6972 case Py_LT: 6973 v = TEST_COND(result == -1); 6974 break; 6975 case Py_GT: 6976 v = TEST_COND(result == 1); 6977 break; 6978 default: 6979 PyErr_BadArgument(); 6980 return NULL; 6981 } 6982 Py_INCREF(v); 6983 return v; 6984 } 6985 6986 Py_INCREF(Py_NotImplemented); 6987 return Py_NotImplemented; 6988} 6989 6990int PyUnicode_Contains(PyObject *container, 6991 PyObject *element) 6992{ 6993 PyObject *str, *sub; 6994 int result; 6995 6996 /* Coerce the two arguments */ 6997 sub = PyUnicode_FromObject(element); 6998 if (!sub) { 6999 PyErr_Format(PyExc_TypeError, 7000 "'in <string>' requires string as left operand, not %s", 7001 element->ob_type->tp_name); 7002 return -1; 7003 } 7004 7005 str = PyUnicode_FromObject(container); 7006 if (!str) { 7007 Py_DECREF(sub); 7008 return -1; 7009 } 7010 7011 result = stringlib_contains_obj(str, sub); 7012 7013 Py_DECREF(str); 7014 Py_DECREF(sub); 7015 7016 return result; 7017} 7018 7019/* Concat to string or Unicode object giving a new Unicode object. */ 7020 7021PyObject *PyUnicode_Concat(PyObject *left, 7022 PyObject *right) 7023{ 7024 PyUnicodeObject *u = NULL, *v = NULL, *w; 7025 7026 /* Coerce the two arguments */ 7027 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7028 if (u == NULL) 7029 goto onError; 7030 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7031 if (v == NULL) 7032 goto onError; 7033 7034 /* Shortcuts */ 7035 if (v == unicode_empty) { 7036 Py_DECREF(v); 7037 return (PyObject *)u; 7038 } 7039 if (u == unicode_empty) { 7040 Py_DECREF(u); 7041 return (PyObject *)v; 7042 } 7043 7044 /* Concat the two Unicode strings */ 7045 w = _PyUnicode_New(u->length + v->length); 7046 if (w == NULL) 7047 goto onError; 7048 Py_UNICODE_COPY(w->str, u->str, u->length); 7049 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7050 7051 Py_DECREF(u); 7052 Py_DECREF(v); 7053 return (PyObject *)w; 7054 7055 onError: 7056 Py_XDECREF(u); 7057 Py_XDECREF(v); 7058 return NULL; 7059} 7060 7061void 7062PyUnicode_Append(PyObject **pleft, PyObject *right) 7063{ 7064 PyObject *new; 7065 if (*pleft == NULL) 7066 return; 7067 if (right == NULL || !PyUnicode_Check(*pleft)) { 7068 Py_DECREF(*pleft); 7069 *pleft = NULL; 7070 return; 7071 } 7072 new = PyUnicode_Concat(*pleft, right); 7073 Py_DECREF(*pleft); 7074 *pleft = new; 7075} 7076 7077void 7078PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7079{ 7080 PyUnicode_Append(pleft, right); 7081 Py_XDECREF(right); 7082} 7083 7084PyDoc_STRVAR(count__doc__, 7085 "S.count(sub[, start[, end]]) -> int\n\ 7086\n\ 7087Return the number of non-overlapping occurrences of substring sub in\n\ 7088string S[start:end]. Optional arguments start and end are\n\ 7089interpreted as in slice notation."); 7090 7091static PyObject * 7092unicode_count(PyUnicodeObject *self, PyObject *args) 7093{ 7094 PyUnicodeObject *substring; 7095 Py_ssize_t start = 0; 7096 Py_ssize_t end = PY_SSIZE_T_MAX; 7097 PyObject *result; 7098 7099 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 7100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7101 return NULL; 7102 7103 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7104 (PyObject *)substring); 7105 if (substring == NULL) 7106 return NULL; 7107 7108 ADJUST_INDICES(start, end, self->length); 7109 result = PyLong_FromSsize_t( 7110 stringlib_count(self->str + start, end - start, 7111 substring->str, substring->length, 7112 PY_SSIZE_T_MAX) 7113 ); 7114 7115 Py_DECREF(substring); 7116 7117 return result; 7118} 7119 7120PyDoc_STRVAR(encode__doc__, 7121 "S.encode([encoding[, errors]]) -> bytes\n\ 7122\n\ 7123Encode S using the codec registered for encoding. encoding defaults\n\ 7124to the default encoding. errors may be given to set a different error\n\ 7125handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 7127'xmlcharrefreplace' as well as any other name registered with\n\ 7128codecs.register_error that can handle UnicodeEncodeErrors."); 7129 7130static PyObject * 7131unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7132{ 7133 static char *kwlist[] = {"encoding", "errors", 0}; 7134 char *encoding = NULL; 7135 char *errors = NULL; 7136 PyObject *v; 7137 7138 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7139 kwlist, &encoding, &errors)) 7140 return NULL; 7141 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7142 if (v == NULL) 7143 goto onError; 7144 if (!PyBytes_Check(v)) { 7145 PyErr_Format(PyExc_TypeError, 7146 "encoder did not return a bytes object " 7147 "(type=%.400s)", 7148 Py_TYPE(v)->tp_name); 7149 Py_DECREF(v); 7150 return NULL; 7151 } 7152 return v; 7153 7154 onError: 7155 return NULL; 7156} 7157 7158PyDoc_STRVAR(expandtabs__doc__, 7159 "S.expandtabs([tabsize]) -> str\n\ 7160\n\ 7161Return a copy of S where all tab characters are expanded using spaces.\n\ 7162If tabsize is not given, a tab size of 8 characters is assumed."); 7163 7164static PyObject* 7165unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7166{ 7167 Py_UNICODE *e; 7168 Py_UNICODE *p; 7169 Py_UNICODE *q; 7170 Py_UNICODE *qe; 7171 Py_ssize_t i, j, incr; 7172 PyUnicodeObject *u; 7173 int tabsize = 8; 7174 7175 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7176 return NULL; 7177 7178 /* First pass: determine size of output string */ 7179 i = 0; /* chars up to and including most recent \n or \r */ 7180 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7181 e = self->str + self->length; /* end of input */ 7182 for (p = self->str; p < e; p++) 7183 if (*p == '\t') { 7184 if (tabsize > 0) { 7185 incr = tabsize - (j % tabsize); /* cannot overflow */ 7186 if (j > PY_SSIZE_T_MAX - incr) 7187 goto overflow1; 7188 j += incr; 7189 } 7190 } 7191 else { 7192 if (j > PY_SSIZE_T_MAX - 1) 7193 goto overflow1; 7194 j++; 7195 if (*p == '\n' || *p == '\r') { 7196 if (i > PY_SSIZE_T_MAX - j) 7197 goto overflow1; 7198 i += j; 7199 j = 0; 7200 } 7201 } 7202 7203 if (i > PY_SSIZE_T_MAX - j) 7204 goto overflow1; 7205 7206 /* Second pass: create output string and fill it */ 7207 u = _PyUnicode_New(i + j); 7208 if (!u) 7209 return NULL; 7210 7211 j = 0; /* same as in first pass */ 7212 q = u->str; /* next output char */ 7213 qe = u->str + u->length; /* end of output */ 7214 7215 for (p = self->str; p < e; p++) 7216 if (*p == '\t') { 7217 if (tabsize > 0) { 7218 i = tabsize - (j % tabsize); 7219 j += i; 7220 while (i--) { 7221 if (q >= qe) 7222 goto overflow2; 7223 *q++ = ' '; 7224 } 7225 } 7226 } 7227 else { 7228 if (q >= qe) 7229 goto overflow2; 7230 *q++ = *p; 7231 j++; 7232 if (*p == '\n' || *p == '\r') 7233 j = 0; 7234 } 7235 7236 return (PyObject*) u; 7237 7238 overflow2: 7239 Py_DECREF(u); 7240 overflow1: 7241 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7242 return NULL; 7243} 7244 7245PyDoc_STRVAR(find__doc__, 7246 "S.find(sub[, start[, end]]) -> int\n\ 7247\n\ 7248Return the lowest index in S where substring sub is found,\n\ 7249such that sub is contained within s[start:end]. Optional\n\ 7250arguments start and end are interpreted as in slice notation.\n\ 7251\n\ 7252Return -1 on failure."); 7253 7254static PyObject * 7255unicode_find(PyUnicodeObject *self, PyObject *args) 7256{ 7257 PyObject *substring; 7258 Py_ssize_t start; 7259 Py_ssize_t end; 7260 Py_ssize_t result; 7261 7262 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7263 return NULL; 7264 7265 result = stringlib_find_slice( 7266 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7267 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7268 start, end 7269 ); 7270 7271 Py_DECREF(substring); 7272 7273 return PyLong_FromSsize_t(result); 7274} 7275 7276static PyObject * 7277unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7278{ 7279 if (index < 0 || index >= self->length) { 7280 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7281 return NULL; 7282 } 7283 7284 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7285} 7286 7287/* Believe it or not, this produces the same value for ASCII strings 7288 as string_hash(). */ 7289static long 7290unicode_hash(PyUnicodeObject *self) 7291{ 7292 Py_ssize_t len; 7293 Py_UNICODE *p; 7294 long x; 7295 7296 if (self->hash != -1) 7297 return self->hash; 7298 len = Py_SIZE(self); 7299 p = self->str; 7300 x = *p << 7; 7301 while (--len >= 0) 7302 x = (1000003*x) ^ *p++; 7303 x ^= Py_SIZE(self); 7304 if (x == -1) 7305 x = -2; 7306 self->hash = x; 7307 return x; 7308} 7309 7310PyDoc_STRVAR(index__doc__, 7311 "S.index(sub[, start[, end]]) -> int\n\ 7312\n\ 7313Like S.find() but raise ValueError when the substring is not found."); 7314 7315static PyObject * 7316unicode_index(PyUnicodeObject *self, PyObject *args) 7317{ 7318 Py_ssize_t result; 7319 PyObject *substring; 7320 Py_ssize_t start; 7321 Py_ssize_t end; 7322 7323 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7324 return NULL; 7325 7326 result = stringlib_find_slice( 7327 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7328 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7329 start, end 7330 ); 7331 7332 Py_DECREF(substring); 7333 7334 if (result < 0) { 7335 PyErr_SetString(PyExc_ValueError, "substring not found"); 7336 return NULL; 7337 } 7338 7339 return PyLong_FromSsize_t(result); 7340} 7341 7342PyDoc_STRVAR(islower__doc__, 7343 "S.islower() -> bool\n\ 7344\n\ 7345Return True if all cased characters in S are lowercase and there is\n\ 7346at least one cased character in S, False otherwise."); 7347 7348static PyObject* 7349unicode_islower(PyUnicodeObject *self) 7350{ 7351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7352 register const Py_UNICODE *e; 7353 int cased; 7354 7355 /* Shortcut for single character strings */ 7356 if (PyUnicode_GET_SIZE(self) == 1) 7357 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7358 7359 /* Special case for empty strings */ 7360 if (PyUnicode_GET_SIZE(self) == 0) 7361 return PyBool_FromLong(0); 7362 7363 e = p + PyUnicode_GET_SIZE(self); 7364 cased = 0; 7365 for (; p < e; p++) { 7366 register const Py_UNICODE ch = *p; 7367 7368 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7369 return PyBool_FromLong(0); 7370 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7371 cased = 1; 7372 } 7373 return PyBool_FromLong(cased); 7374} 7375 7376PyDoc_STRVAR(isupper__doc__, 7377 "S.isupper() -> bool\n\ 7378\n\ 7379Return True if all cased characters in S are uppercase and there is\n\ 7380at least one cased character in S, False otherwise."); 7381 7382static PyObject* 7383unicode_isupper(PyUnicodeObject *self) 7384{ 7385 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7386 register const Py_UNICODE *e; 7387 int cased; 7388 7389 /* Shortcut for single character strings */ 7390 if (PyUnicode_GET_SIZE(self) == 1) 7391 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7392 7393 /* Special case for empty strings */ 7394 if (PyUnicode_GET_SIZE(self) == 0) 7395 return PyBool_FromLong(0); 7396 7397 e = p + PyUnicode_GET_SIZE(self); 7398 cased = 0; 7399 for (; p < e; p++) { 7400 register const Py_UNICODE ch = *p; 7401 7402 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7403 return PyBool_FromLong(0); 7404 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7405 cased = 1; 7406 } 7407 return PyBool_FromLong(cased); 7408} 7409 7410PyDoc_STRVAR(istitle__doc__, 7411 "S.istitle() -> bool\n\ 7412\n\ 7413Return True if S is a titlecased string and there is at least one\n\ 7414character in S, i.e. upper- and titlecase characters may only\n\ 7415follow uncased characters and lowercase characters only cased ones.\n\ 7416Return False otherwise."); 7417 7418static PyObject* 7419unicode_istitle(PyUnicodeObject *self) 7420{ 7421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7422 register const Py_UNICODE *e; 7423 int cased, previous_is_cased; 7424 7425 /* Shortcut for single character strings */ 7426 if (PyUnicode_GET_SIZE(self) == 1) 7427 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7428 (Py_UNICODE_ISUPPER(*p) != 0)); 7429 7430 /* Special case for empty strings */ 7431 if (PyUnicode_GET_SIZE(self) == 0) 7432 return PyBool_FromLong(0); 7433 7434 e = p + PyUnicode_GET_SIZE(self); 7435 cased = 0; 7436 previous_is_cased = 0; 7437 for (; p < e; p++) { 7438 register const Py_UNICODE ch = *p; 7439 7440 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7441 if (previous_is_cased) 7442 return PyBool_FromLong(0); 7443 previous_is_cased = 1; 7444 cased = 1; 7445 } 7446 else if (Py_UNICODE_ISLOWER(ch)) { 7447 if (!previous_is_cased) 7448 return PyBool_FromLong(0); 7449 previous_is_cased = 1; 7450 cased = 1; 7451 } 7452 else 7453 previous_is_cased = 0; 7454 } 7455 return PyBool_FromLong(cased); 7456} 7457 7458PyDoc_STRVAR(isspace__doc__, 7459 "S.isspace() -> bool\n\ 7460\n\ 7461Return True if all characters in S are whitespace\n\ 7462and there is at least one character in S, False otherwise."); 7463 7464static PyObject* 7465unicode_isspace(PyUnicodeObject *self) 7466{ 7467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7468 register const Py_UNICODE *e; 7469 7470 /* Shortcut for single character strings */ 7471 if (PyUnicode_GET_SIZE(self) == 1 && 7472 Py_UNICODE_ISSPACE(*p)) 7473 return PyBool_FromLong(1); 7474 7475 /* Special case for empty strings */ 7476 if (PyUnicode_GET_SIZE(self) == 0) 7477 return PyBool_FromLong(0); 7478 7479 e = p + PyUnicode_GET_SIZE(self); 7480 for (; p < e; p++) { 7481 if (!Py_UNICODE_ISSPACE(*p)) 7482 return PyBool_FromLong(0); 7483 } 7484 return PyBool_FromLong(1); 7485} 7486 7487PyDoc_STRVAR(isalpha__doc__, 7488 "S.isalpha() -> bool\n\ 7489\n\ 7490Return True if all characters in S are alphabetic\n\ 7491and there is at least one character in S, False otherwise."); 7492 7493static PyObject* 7494unicode_isalpha(PyUnicodeObject *self) 7495{ 7496 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7497 register const Py_UNICODE *e; 7498 7499 /* Shortcut for single character strings */ 7500 if (PyUnicode_GET_SIZE(self) == 1 && 7501 Py_UNICODE_ISALPHA(*p)) 7502 return PyBool_FromLong(1); 7503 7504 /* Special case for empty strings */ 7505 if (PyUnicode_GET_SIZE(self) == 0) 7506 return PyBool_FromLong(0); 7507 7508 e = p + PyUnicode_GET_SIZE(self); 7509 for (; p < e; p++) { 7510 if (!Py_UNICODE_ISALPHA(*p)) 7511 return PyBool_FromLong(0); 7512 } 7513 return PyBool_FromLong(1); 7514} 7515 7516PyDoc_STRVAR(isalnum__doc__, 7517 "S.isalnum() -> bool\n\ 7518\n\ 7519Return True if all characters in S are alphanumeric\n\ 7520and there is at least one character in S, False otherwise."); 7521 7522static PyObject* 7523unicode_isalnum(PyUnicodeObject *self) 7524{ 7525 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7526 register const Py_UNICODE *e; 7527 7528 /* Shortcut for single character strings */ 7529 if (PyUnicode_GET_SIZE(self) == 1 && 7530 Py_UNICODE_ISALNUM(*p)) 7531 return PyBool_FromLong(1); 7532 7533 /* Special case for empty strings */ 7534 if (PyUnicode_GET_SIZE(self) == 0) 7535 return PyBool_FromLong(0); 7536 7537 e = p + PyUnicode_GET_SIZE(self); 7538 for (; p < e; p++) { 7539 if (!Py_UNICODE_ISALNUM(*p)) 7540 return PyBool_FromLong(0); 7541 } 7542 return PyBool_FromLong(1); 7543} 7544 7545PyDoc_STRVAR(isdecimal__doc__, 7546 "S.isdecimal() -> bool\n\ 7547\n\ 7548Return True if there are only decimal characters in S,\n\ 7549False otherwise."); 7550 7551static PyObject* 7552unicode_isdecimal(PyUnicodeObject *self) 7553{ 7554 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7555 register const Py_UNICODE *e; 7556 7557 /* Shortcut for single character strings */ 7558 if (PyUnicode_GET_SIZE(self) == 1 && 7559 Py_UNICODE_ISDECIMAL(*p)) 7560 return PyBool_FromLong(1); 7561 7562 /* Special case for empty strings */ 7563 if (PyUnicode_GET_SIZE(self) == 0) 7564 return PyBool_FromLong(0); 7565 7566 e = p + PyUnicode_GET_SIZE(self); 7567 for (; p < e; p++) { 7568 if (!Py_UNICODE_ISDECIMAL(*p)) 7569 return PyBool_FromLong(0); 7570 } 7571 return PyBool_FromLong(1); 7572} 7573 7574PyDoc_STRVAR(isdigit__doc__, 7575 "S.isdigit() -> bool\n\ 7576\n\ 7577Return True if all characters in S are digits\n\ 7578and there is at least one character in S, False otherwise."); 7579 7580static PyObject* 7581unicode_isdigit(PyUnicodeObject *self) 7582{ 7583 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7584 register const Py_UNICODE *e; 7585 7586 /* Shortcut for single character strings */ 7587 if (PyUnicode_GET_SIZE(self) == 1 && 7588 Py_UNICODE_ISDIGIT(*p)) 7589 return PyBool_FromLong(1); 7590 7591 /* Special case for empty strings */ 7592 if (PyUnicode_GET_SIZE(self) == 0) 7593 return PyBool_FromLong(0); 7594 7595 e = p + PyUnicode_GET_SIZE(self); 7596 for (; p < e; p++) { 7597 if (!Py_UNICODE_ISDIGIT(*p)) 7598 return PyBool_FromLong(0); 7599 } 7600 return PyBool_FromLong(1); 7601} 7602 7603PyDoc_STRVAR(isnumeric__doc__, 7604 "S.isnumeric() -> bool\n\ 7605\n\ 7606Return True if there are only numeric characters in S,\n\ 7607False otherwise."); 7608 7609static PyObject* 7610unicode_isnumeric(PyUnicodeObject *self) 7611{ 7612 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7613 register const Py_UNICODE *e; 7614 7615 /* Shortcut for single character strings */ 7616 if (PyUnicode_GET_SIZE(self) == 1 && 7617 Py_UNICODE_ISNUMERIC(*p)) 7618 return PyBool_FromLong(1); 7619 7620 /* Special case for empty strings */ 7621 if (PyUnicode_GET_SIZE(self) == 0) 7622 return PyBool_FromLong(0); 7623 7624 e = p + PyUnicode_GET_SIZE(self); 7625 for (; p < e; p++) { 7626 if (!Py_UNICODE_ISNUMERIC(*p)) 7627 return PyBool_FromLong(0); 7628 } 7629 return PyBool_FromLong(1); 7630} 7631 7632int 7633PyUnicode_IsIdentifier(PyObject *self) 7634{ 7635 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7636 register const Py_UNICODE *e; 7637 7638 /* Special case for empty strings */ 7639 if (PyUnicode_GET_SIZE(self) == 0) 7640 return 0; 7641 7642 /* PEP 3131 says that the first character must be in 7643 XID_Start and subsequent characters in XID_Continue, 7644 and for the ASCII range, the 2.x rules apply (i.e 7645 start with letters and underscore, continue with 7646 letters, digits, underscore). However, given the current 7647 definition of XID_Start and XID_Continue, it is sufficient 7648 to check just for these, except that _ must be allowed 7649 as starting an identifier. */ 7650 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7651 return 0; 7652 7653 e = p + PyUnicode_GET_SIZE(self); 7654 for (p++; p < e; p++) { 7655 if (!_PyUnicode_IsXidContinue(*p)) 7656 return 0; 7657 } 7658 return 1; 7659} 7660 7661PyDoc_STRVAR(isidentifier__doc__, 7662 "S.isidentifier() -> bool\n\ 7663\n\ 7664Return True if S is a valid identifier according\n\ 7665to the language definition."); 7666 7667static PyObject* 7668unicode_isidentifier(PyObject *self) 7669{ 7670 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7671} 7672 7673PyDoc_STRVAR(isprintable__doc__, 7674 "S.isprintable() -> bool\n\ 7675\n\ 7676Return True if all characters in S are considered\n\ 7677printable in repr() or S is empty, False otherwise."); 7678 7679static PyObject* 7680unicode_isprintable(PyObject *self) 7681{ 7682 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7683 register const Py_UNICODE *e; 7684 7685 /* Shortcut for single character strings */ 7686 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 7687 Py_RETURN_TRUE; 7688 } 7689 7690 e = p + PyUnicode_GET_SIZE(self); 7691 for (; p < e; p++) { 7692 if (!Py_UNICODE_ISPRINTABLE(*p)) { 7693 Py_RETURN_FALSE; 7694 } 7695 } 7696 Py_RETURN_TRUE; 7697} 7698 7699PyDoc_STRVAR(join__doc__, 7700 "S.join(iterable) -> str\n\ 7701\n\ 7702Return a string which is the concatenation of the strings in the\n\ 7703iterable. The separator between elements is S."); 7704 7705static PyObject* 7706unicode_join(PyObject *self, PyObject *data) 7707{ 7708 return PyUnicode_Join(self, data); 7709} 7710 7711static Py_ssize_t 7712unicode_length(PyUnicodeObject *self) 7713{ 7714 return self->length; 7715} 7716 7717PyDoc_STRVAR(ljust__doc__, 7718 "S.ljust(width[, fillchar]) -> str\n\ 7719\n\ 7720Return S left-justified in a Unicode string of length width. Padding is\n\ 7721done using the specified fill character (default is a space)."); 7722 7723static PyObject * 7724unicode_ljust(PyUnicodeObject *self, PyObject *args) 7725{ 7726 Py_ssize_t width; 7727 Py_UNICODE fillchar = ' '; 7728 7729 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7730 return NULL; 7731 7732 if (self->length >= width && PyUnicode_CheckExact(self)) { 7733 Py_INCREF(self); 7734 return (PyObject*) self; 7735 } 7736 7737 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7738} 7739 7740PyDoc_STRVAR(lower__doc__, 7741 "S.lower() -> str\n\ 7742\n\ 7743Return a copy of the string S converted to lowercase."); 7744 7745static PyObject* 7746unicode_lower(PyUnicodeObject *self) 7747{ 7748 return fixup(self, fixlower); 7749} 7750 7751#define LEFTSTRIP 0 7752#define RIGHTSTRIP 1 7753#define BOTHSTRIP 2 7754 7755/* Arrays indexed by above */ 7756static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7757 7758#define STRIPNAME(i) (stripformat[i]+3) 7759 7760/* externally visible for str.strip(unicode) */ 7761PyObject * 7762_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7763{ 7764 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7765 Py_ssize_t len = PyUnicode_GET_SIZE(self); 7766 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7767 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7768 Py_ssize_t i, j; 7769 7770 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7771 7772 i = 0; 7773 if (striptype != RIGHTSTRIP) { 7774 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7775 i++; 7776 } 7777 } 7778 7779 j = len; 7780 if (striptype != LEFTSTRIP) { 7781 do { 7782 j--; 7783 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7784 j++; 7785 } 7786 7787 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7788 Py_INCREF(self); 7789 return (PyObject*)self; 7790 } 7791 else 7792 return PyUnicode_FromUnicode(s+i, j-i); 7793} 7794 7795 7796static PyObject * 7797do_strip(PyUnicodeObject *self, int striptype) 7798{ 7799 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7800 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7801 7802 i = 0; 7803 if (striptype != RIGHTSTRIP) { 7804 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7805 i++; 7806 } 7807 } 7808 7809 j = len; 7810 if (striptype != LEFTSTRIP) { 7811 do { 7812 j--; 7813 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7814 j++; 7815 } 7816 7817 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7818 Py_INCREF(self); 7819 return (PyObject*)self; 7820 } 7821 else 7822 return PyUnicode_FromUnicode(s+i, j-i); 7823} 7824 7825 7826static PyObject * 7827do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7828{ 7829 PyObject *sep = NULL; 7830 7831 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7832 return NULL; 7833 7834 if (sep != NULL && sep != Py_None) { 7835 if (PyUnicode_Check(sep)) 7836 return _PyUnicode_XStrip(self, striptype, sep); 7837 else { 7838 PyErr_Format(PyExc_TypeError, 7839 "%s arg must be None or str", 7840 STRIPNAME(striptype)); 7841 return NULL; 7842 } 7843 } 7844 7845 return do_strip(self, striptype); 7846} 7847 7848 7849PyDoc_STRVAR(strip__doc__, 7850 "S.strip([chars]) -> str\n\ 7851\n\ 7852Return a copy of the string S with leading and trailing\n\ 7853whitespace removed.\n\ 7854If chars is given and not None, remove characters in chars instead."); 7855 7856static PyObject * 7857unicode_strip(PyUnicodeObject *self, PyObject *args) 7858{ 7859 if (PyTuple_GET_SIZE(args) == 0) 7860 return do_strip(self, BOTHSTRIP); /* Common case */ 7861 else 7862 return do_argstrip(self, BOTHSTRIP, args); 7863} 7864 7865 7866PyDoc_STRVAR(lstrip__doc__, 7867 "S.lstrip([chars]) -> str\n\ 7868\n\ 7869Return a copy of the string S with leading whitespace removed.\n\ 7870If chars is given and not None, remove characters in chars instead."); 7871 7872static PyObject * 7873unicode_lstrip(PyUnicodeObject *self, PyObject *args) 7874{ 7875 if (PyTuple_GET_SIZE(args) == 0) 7876 return do_strip(self, LEFTSTRIP); /* Common case */ 7877 else 7878 return do_argstrip(self, LEFTSTRIP, args); 7879} 7880 7881 7882PyDoc_STRVAR(rstrip__doc__, 7883 "S.rstrip([chars]) -> str\n\ 7884\n\ 7885Return a copy of the string S with trailing whitespace removed.\n\ 7886If chars is given and not None, remove characters in chars instead."); 7887 7888static PyObject * 7889unicode_rstrip(PyUnicodeObject *self, PyObject *args) 7890{ 7891 if (PyTuple_GET_SIZE(args) == 0) 7892 return do_strip(self, RIGHTSTRIP); /* Common case */ 7893 else 7894 return do_argstrip(self, RIGHTSTRIP, args); 7895} 7896 7897 7898static PyObject* 7899unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 7900{ 7901 PyUnicodeObject *u; 7902 Py_UNICODE *p; 7903 Py_ssize_t nchars; 7904 size_t nbytes; 7905 7906 if (len < 1) { 7907 Py_INCREF(unicode_empty); 7908 return (PyObject *)unicode_empty; 7909 } 7910 7911 if (len == 1 && PyUnicode_CheckExact(str)) { 7912 /* no repeat, return original string */ 7913 Py_INCREF(str); 7914 return (PyObject*) str; 7915 } 7916 7917 /* ensure # of chars needed doesn't overflow int and # of bytes 7918 * needed doesn't overflow size_t 7919 */ 7920 nchars = len * str->length; 7921 if (nchars / len != str->length) { 7922 PyErr_SetString(PyExc_OverflowError, 7923 "repeated string is too long"); 7924 return NULL; 7925 } 7926 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 7927 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 7928 PyErr_SetString(PyExc_OverflowError, 7929 "repeated string is too long"); 7930 return NULL; 7931 } 7932 u = _PyUnicode_New(nchars); 7933 if (!u) 7934 return NULL; 7935 7936 p = u->str; 7937 7938 if (str->length == 1) { 7939 Py_UNICODE_FILL(p, str->str[0], len); 7940 } else { 7941 Py_ssize_t done = str->length; /* number of characters copied this far */ 7942 Py_UNICODE_COPY(p, str->str, str->length); 7943 while (done < nchars) { 7944 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 7945 Py_UNICODE_COPY(p+done, p, n); 7946 done += n; 7947 } 7948 } 7949 7950 return (PyObject*) u; 7951} 7952 7953PyObject *PyUnicode_Replace(PyObject *obj, 7954 PyObject *subobj, 7955 PyObject *replobj, 7956 Py_ssize_t maxcount) 7957{ 7958 PyObject *self; 7959 PyObject *str1; 7960 PyObject *str2; 7961 PyObject *result; 7962 7963 self = PyUnicode_FromObject(obj); 7964 if (self == NULL) 7965 return NULL; 7966 str1 = PyUnicode_FromObject(subobj); 7967 if (str1 == NULL) { 7968 Py_DECREF(self); 7969 return NULL; 7970 } 7971 str2 = PyUnicode_FromObject(replobj); 7972 if (str2 == NULL) { 7973 Py_DECREF(self); 7974 Py_DECREF(str1); 7975 return NULL; 7976 } 7977 result = replace((PyUnicodeObject *)self, 7978 (PyUnicodeObject *)str1, 7979 (PyUnicodeObject *)str2, 7980 maxcount); 7981 Py_DECREF(self); 7982 Py_DECREF(str1); 7983 Py_DECREF(str2); 7984 return result; 7985} 7986 7987PyDoc_STRVAR(replace__doc__, 7988 "S.replace(old, new[, count]) -> str\n\ 7989\n\ 7990Return a copy of S with all occurrences of substring\n\ 7991old replaced by new. If the optional argument count is\n\ 7992given, only the first count occurrences are replaced."); 7993 7994static PyObject* 7995unicode_replace(PyUnicodeObject *self, PyObject *args) 7996{ 7997 PyUnicodeObject *str1; 7998 PyUnicodeObject *str2; 7999 Py_ssize_t maxcount = -1; 8000 PyObject *result; 8001 8002 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8003 return NULL; 8004 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8005 if (str1 == NULL) 8006 return NULL; 8007 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8008 if (str2 == NULL) { 8009 Py_DECREF(str1); 8010 return NULL; 8011 } 8012 8013 result = replace(self, str1, str2, maxcount); 8014 8015 Py_DECREF(str1); 8016 Py_DECREF(str2); 8017 return result; 8018} 8019 8020static 8021PyObject *unicode_repr(PyObject *unicode) 8022{ 8023 PyObject *repr; 8024 Py_UNICODE *p; 8025 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8026 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8027 8028 /* XXX(nnorwitz): rather than over-allocating, it would be 8029 better to choose a different scheme. Perhaps scan the 8030 first N-chars of the string and allocate based on that size. 8031 */ 8032 /* Initial allocation is based on the longest-possible unichr 8033 escape. 8034 8035 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8036 unichr, so in this case it's the longest unichr escape. In 8037 narrow (UTF-16) builds this is five chars per source unichr 8038 since there are two unichrs in the surrogate pair, so in narrow 8039 (UTF-16) builds it's not the longest unichr escape. 8040 8041 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8042 so in the narrow (UTF-16) build case it's the longest unichr 8043 escape. 8044 */ 8045 8046 repr = PyUnicode_FromUnicode(NULL, 8047 2 /* quotes */ 8048#ifdef Py_UNICODE_WIDE 8049 + 10*size 8050#else 8051 + 6*size 8052#endif 8053 + 1); 8054 if (repr == NULL) 8055 return NULL; 8056 8057 p = PyUnicode_AS_UNICODE(repr); 8058 8059 /* Add quote */ 8060 *p++ = (findchar(s, size, '\'') && 8061 !findchar(s, size, '"')) ? '"' : '\''; 8062 while (size-- > 0) { 8063 Py_UNICODE ch = *s++; 8064 8065 /* Escape quotes and backslashes */ 8066 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8067 *p++ = '\\'; 8068 *p++ = ch; 8069 continue; 8070 } 8071 8072 /* Map special whitespace to '\t', \n', '\r' */ 8073 if (ch == '\t') { 8074 *p++ = '\\'; 8075 *p++ = 't'; 8076 } 8077 else if (ch == '\n') { 8078 *p++ = '\\'; 8079 *p++ = 'n'; 8080 } 8081 else if (ch == '\r') { 8082 *p++ = '\\'; 8083 *p++ = 'r'; 8084 } 8085 8086 /* Map non-printable US ASCII to '\xhh' */ 8087 else if (ch < ' ' || ch == 0x7F) { 8088 *p++ = '\\'; 8089 *p++ = 'x'; 8090 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8091 *p++ = hexdigits[ch & 0x000F]; 8092 } 8093 8094 /* Copy ASCII characters as-is */ 8095 else if (ch < 0x7F) { 8096 *p++ = ch; 8097 } 8098 8099 /* Non-ASCII characters */ 8100 else { 8101 Py_UCS4 ucs = ch; 8102 8103#ifndef Py_UNICODE_WIDE 8104 Py_UNICODE ch2 = 0; 8105 /* Get code point from surrogate pair */ 8106 if (size > 0) { 8107 ch2 = *s; 8108 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8109 && ch2 <= 0xDFFF) { 8110 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8111 + 0x00010000; 8112 s++; 8113 size--; 8114 } 8115 } 8116#endif 8117 /* Map Unicode whitespace and control characters 8118 (categories Z* and C* except ASCII space) 8119 */ 8120 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8121 /* Map 8-bit characters to '\xhh' */ 8122 if (ucs <= 0xff) { 8123 *p++ = '\\'; 8124 *p++ = 'x'; 8125 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8126 *p++ = hexdigits[ch & 0x000F]; 8127 } 8128 /* Map 21-bit characters to '\U00xxxxxx' */ 8129 else if (ucs >= 0x10000) { 8130 *p++ = '\\'; 8131 *p++ = 'U'; 8132 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8133 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8134 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8135 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8136 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8137 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8138 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8139 *p++ = hexdigits[ucs & 0x0000000F]; 8140 } 8141 /* Map 16-bit characters to '\uxxxx' */ 8142 else { 8143 *p++ = '\\'; 8144 *p++ = 'u'; 8145 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8146 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8147 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8148 *p++ = hexdigits[ucs & 0x000F]; 8149 } 8150 } 8151 /* Copy characters as-is */ 8152 else { 8153 *p++ = ch; 8154#ifndef Py_UNICODE_WIDE 8155 if (ucs >= 0x10000) 8156 *p++ = ch2; 8157#endif 8158 } 8159 } 8160 } 8161 /* Add quote */ 8162 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8163 8164 *p = '\0'; 8165 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8166 return repr; 8167} 8168 8169PyDoc_STRVAR(rfind__doc__, 8170 "S.rfind(sub[, start[, end]]) -> int\n\ 8171\n\ 8172Return the highest index in S where substring sub is found,\n\ 8173such that sub is contained within s[start:end]. Optional\n\ 8174arguments start and end are interpreted as in slice notation.\n\ 8175\n\ 8176Return -1 on failure."); 8177 8178static PyObject * 8179unicode_rfind(PyUnicodeObject *self, PyObject *args) 8180{ 8181 PyObject *substring; 8182 Py_ssize_t start; 8183 Py_ssize_t end; 8184 Py_ssize_t result; 8185 8186 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8187 return NULL; 8188 8189 result = stringlib_rfind_slice( 8190 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8191 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8192 start, end 8193 ); 8194 8195 Py_DECREF(substring); 8196 8197 return PyLong_FromSsize_t(result); 8198} 8199 8200PyDoc_STRVAR(rindex__doc__, 8201 "S.rindex(sub[, start[, end]]) -> int\n\ 8202\n\ 8203Like S.rfind() but raise ValueError when the substring is not found."); 8204 8205static PyObject * 8206unicode_rindex(PyUnicodeObject *self, PyObject *args) 8207{ 8208 PyObject *substring; 8209 Py_ssize_t start; 8210 Py_ssize_t end; 8211 Py_ssize_t result; 8212 8213 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8214 return NULL; 8215 8216 result = stringlib_rfind_slice( 8217 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8218 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8219 start, end 8220 ); 8221 8222 Py_DECREF(substring); 8223 8224 if (result < 0) { 8225 PyErr_SetString(PyExc_ValueError, "substring not found"); 8226 return NULL; 8227 } 8228 return PyLong_FromSsize_t(result); 8229} 8230 8231PyDoc_STRVAR(rjust__doc__, 8232 "S.rjust(width[, fillchar]) -> str\n\ 8233\n\ 8234Return S right-justified in a string of length width. Padding is\n\ 8235done using the specified fill character (default is a space)."); 8236 8237static PyObject * 8238unicode_rjust(PyUnicodeObject *self, PyObject *args) 8239{ 8240 Py_ssize_t width; 8241 Py_UNICODE fillchar = ' '; 8242 8243 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8244 return NULL; 8245 8246 if (self->length >= width && PyUnicode_CheckExact(self)) { 8247 Py_INCREF(self); 8248 return (PyObject*) self; 8249 } 8250 8251 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8252} 8253 8254PyObject *PyUnicode_Split(PyObject *s, 8255 PyObject *sep, 8256 Py_ssize_t maxsplit) 8257{ 8258 PyObject *result; 8259 8260 s = PyUnicode_FromObject(s); 8261 if (s == NULL) 8262 return NULL; 8263 if (sep != NULL) { 8264 sep = PyUnicode_FromObject(sep); 8265 if (sep == NULL) { 8266 Py_DECREF(s); 8267 return NULL; 8268 } 8269 } 8270 8271 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8272 8273 Py_DECREF(s); 8274 Py_XDECREF(sep); 8275 return result; 8276} 8277 8278PyDoc_STRVAR(split__doc__, 8279 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8280\n\ 8281Return a list of the words in S, using sep as the\n\ 8282delimiter string. If maxsplit is given, at most maxsplit\n\ 8283splits are done. If sep is not specified or is None, any\n\ 8284whitespace string is a separator and empty strings are\n\ 8285removed from the result."); 8286 8287static PyObject* 8288unicode_split(PyUnicodeObject *self, PyObject *args) 8289{ 8290 PyObject *substring = Py_None; 8291 Py_ssize_t maxcount = -1; 8292 8293 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8294 return NULL; 8295 8296 if (substring == Py_None) 8297 return split(self, NULL, maxcount); 8298 else if (PyUnicode_Check(substring)) 8299 return split(self, (PyUnicodeObject *)substring, maxcount); 8300 else 8301 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8302} 8303 8304PyObject * 8305PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8306{ 8307 PyObject* str_obj; 8308 PyObject* sep_obj; 8309 PyObject* out; 8310 8311 str_obj = PyUnicode_FromObject(str_in); 8312 if (!str_obj) 8313 return NULL; 8314 sep_obj = PyUnicode_FromObject(sep_in); 8315 if (!sep_obj) { 8316 Py_DECREF(str_obj); 8317 return NULL; 8318 } 8319 8320 out = stringlib_partition( 8321 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8322 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8323 ); 8324 8325 Py_DECREF(sep_obj); 8326 Py_DECREF(str_obj); 8327 8328 return out; 8329} 8330 8331 8332PyObject * 8333PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8334{ 8335 PyObject* str_obj; 8336 PyObject* sep_obj; 8337 PyObject* out; 8338 8339 str_obj = PyUnicode_FromObject(str_in); 8340 if (!str_obj) 8341 return NULL; 8342 sep_obj = PyUnicode_FromObject(sep_in); 8343 if (!sep_obj) { 8344 Py_DECREF(str_obj); 8345 return NULL; 8346 } 8347 8348 out = stringlib_rpartition( 8349 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8350 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8351 ); 8352 8353 Py_DECREF(sep_obj); 8354 Py_DECREF(str_obj); 8355 8356 return out; 8357} 8358 8359PyDoc_STRVAR(partition__doc__, 8360 "S.partition(sep) -> (head, sep, tail)\n\ 8361\n\ 8362Search for the separator sep in S, and return the part before it,\n\ 8363the separator itself, and the part after it. If the separator is not\n\ 8364found, return S and two empty strings."); 8365 8366static PyObject* 8367unicode_partition(PyUnicodeObject *self, PyObject *separator) 8368{ 8369 return PyUnicode_Partition((PyObject *)self, separator); 8370} 8371 8372PyDoc_STRVAR(rpartition__doc__, 8373 "S.rpartition(sep) -> (head, sep, tail)\n\ 8374\n\ 8375Search for the separator sep in S, starting at the end of S, and return\n\ 8376the part before it, the separator itself, and the part after it. If the\n\ 8377separator is not found, return two empty strings and S."); 8378 8379static PyObject* 8380unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8381{ 8382 return PyUnicode_RPartition((PyObject *)self, separator); 8383} 8384 8385PyObject *PyUnicode_RSplit(PyObject *s, 8386 PyObject *sep, 8387 Py_ssize_t maxsplit) 8388{ 8389 PyObject *result; 8390 8391 s = PyUnicode_FromObject(s); 8392 if (s == NULL) 8393 return NULL; 8394 if (sep != NULL) { 8395 sep = PyUnicode_FromObject(sep); 8396 if (sep == NULL) { 8397 Py_DECREF(s); 8398 return NULL; 8399 } 8400 } 8401 8402 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8403 8404 Py_DECREF(s); 8405 Py_XDECREF(sep); 8406 return result; 8407} 8408 8409PyDoc_STRVAR(rsplit__doc__, 8410 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8411\n\ 8412Return a list of the words in S, using sep as the\n\ 8413delimiter string, starting at the end of the string and\n\ 8414working to the front. If maxsplit is given, at most maxsplit\n\ 8415splits are done. If sep is not specified, any whitespace string\n\ 8416is a separator."); 8417 8418static PyObject* 8419unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8420{ 8421 PyObject *substring = Py_None; 8422 Py_ssize_t maxcount = -1; 8423 8424 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8425 return NULL; 8426 8427 if (substring == Py_None) 8428 return rsplit(self, NULL, maxcount); 8429 else if (PyUnicode_Check(substring)) 8430 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8431 else 8432 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8433} 8434 8435PyDoc_STRVAR(splitlines__doc__, 8436 "S.splitlines([keepends]) -> list of strings\n\ 8437\n\ 8438Return a list of the lines in S, breaking at line boundaries.\n\ 8439Line breaks are not included in the resulting list unless keepends\n\ 8440is given and true."); 8441 8442static PyObject* 8443unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8444{ 8445 int keepends = 0; 8446 8447 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8448 return NULL; 8449 8450 return PyUnicode_Splitlines((PyObject *)self, keepends); 8451} 8452 8453static 8454PyObject *unicode_str(PyObject *self) 8455{ 8456 if (PyUnicode_CheckExact(self)) { 8457 Py_INCREF(self); 8458 return self; 8459 } else 8460 /* Subtype -- return genuine unicode string with the same value. */ 8461 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8462 PyUnicode_GET_SIZE(self)); 8463} 8464 8465PyDoc_STRVAR(swapcase__doc__, 8466 "S.swapcase() -> str\n\ 8467\n\ 8468Return a copy of S with uppercase characters converted to lowercase\n\ 8469and vice versa."); 8470 8471static PyObject* 8472unicode_swapcase(PyUnicodeObject *self) 8473{ 8474 return fixup(self, fixswapcase); 8475} 8476 8477PyDoc_STRVAR(maketrans__doc__, 8478 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8479\n\ 8480Return a translation table usable for str.translate().\n\ 8481If there is only one argument, it must be a dictionary mapping Unicode\n\ 8482ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8483Character keys will be then converted to ordinals.\n\ 8484If there are two arguments, they must be strings of equal length, and\n\ 8485in the resulting dictionary, each character in x will be mapped to the\n\ 8486character at the same position in y. If there is a third argument, it\n\ 8487must be a string, whose characters will be mapped to None in the result."); 8488 8489static PyObject* 8490unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8491{ 8492 PyObject *x, *y = NULL, *z = NULL; 8493 PyObject *new = NULL, *key, *value; 8494 Py_ssize_t i = 0; 8495 int res; 8496 8497 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8498 return NULL; 8499 new = PyDict_New(); 8500 if (!new) 8501 return NULL; 8502 if (y != NULL) { 8503 /* x must be a string too, of equal length */ 8504 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8505 if (!PyUnicode_Check(x)) { 8506 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8507 "be a string if there is a second argument"); 8508 goto err; 8509 } 8510 if (PyUnicode_GET_SIZE(x) != ylen) { 8511 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8512 "arguments must have equal length"); 8513 goto err; 8514 } 8515 /* create entries for translating chars in x to those in y */ 8516 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) { 8517 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8518 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8519 if (!key || !value) 8520 goto err; 8521 res = PyDict_SetItem(new, key, value); 8522 Py_DECREF(key); 8523 Py_DECREF(value); 8524 if (res < 0) 8525 goto err; 8526 } 8527 /* create entries for deleting chars in z */ 8528 if (z != NULL) { 8529 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8530 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8531 if (!key) 8532 goto err; 8533 res = PyDict_SetItem(new, key, Py_None); 8534 Py_DECREF(key); 8535 if (res < 0) 8536 goto err; 8537 } 8538 } 8539 } else { 8540 /* x must be a dict */ 8541 if (!PyDict_CheckExact(x)) { 8542 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8543 "to maketrans it must be a dict"); 8544 goto err; 8545 } 8546 /* copy entries into the new dict, converting string keys to int keys */ 8547 while (PyDict_Next(x, &i, &key, &value)) { 8548 if (PyUnicode_Check(key)) { 8549 /* convert string keys to integer keys */ 8550 PyObject *newkey; 8551 if (PyUnicode_GET_SIZE(key) != 1) { 8552 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8553 "table must be of length 1"); 8554 goto err; 8555 } 8556 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8557 if (!newkey) 8558 goto err; 8559 res = PyDict_SetItem(new, newkey, value); 8560 Py_DECREF(newkey); 8561 if (res < 0) 8562 goto err; 8563 } else if (PyLong_Check(key)) { 8564 /* just keep integer keys */ 8565 if (PyDict_SetItem(new, key, value) < 0) 8566 goto err; 8567 } else { 8568 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8569 "be strings or integers"); 8570 goto err; 8571 } 8572 } 8573 } 8574 return new; 8575 err: 8576 Py_DECREF(new); 8577 return NULL; 8578} 8579 8580PyDoc_STRVAR(translate__doc__, 8581 "S.translate(table) -> str\n\ 8582\n\ 8583Return a copy of the string S, where all characters have been mapped\n\ 8584through the given translation table, which must be a mapping of\n\ 8585Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8586Unmapped characters are left untouched. Characters mapped to None\n\ 8587are deleted."); 8588 8589static PyObject* 8590unicode_translate(PyUnicodeObject *self, PyObject *table) 8591{ 8592 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8593} 8594 8595PyDoc_STRVAR(upper__doc__, 8596 "S.upper() -> str\n\ 8597\n\ 8598Return a copy of S converted to uppercase."); 8599 8600static PyObject* 8601unicode_upper(PyUnicodeObject *self) 8602{ 8603 return fixup(self, fixupper); 8604} 8605 8606PyDoc_STRVAR(zfill__doc__, 8607 "S.zfill(width) -> str\n\ 8608\n\ 8609Pad a numeric string S with zeros on the left, to fill a field\n\ 8610of the specified width. The string S is never truncated."); 8611 8612static PyObject * 8613unicode_zfill(PyUnicodeObject *self, PyObject *args) 8614{ 8615 Py_ssize_t fill; 8616 PyUnicodeObject *u; 8617 8618 Py_ssize_t width; 8619 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8620 return NULL; 8621 8622 if (self->length >= width) { 8623 if (PyUnicode_CheckExact(self)) { 8624 Py_INCREF(self); 8625 return (PyObject*) self; 8626 } 8627 else 8628 return PyUnicode_FromUnicode( 8629 PyUnicode_AS_UNICODE(self), 8630 PyUnicode_GET_SIZE(self) 8631 ); 8632 } 8633 8634 fill = width - self->length; 8635 8636 u = pad(self, fill, 0, '0'); 8637 8638 if (u == NULL) 8639 return NULL; 8640 8641 if (u->str[fill] == '+' || u->str[fill] == '-') { 8642 /* move sign to beginning of string */ 8643 u->str[0] = u->str[fill]; 8644 u->str[fill] = '0'; 8645 } 8646 8647 return (PyObject*) u; 8648} 8649 8650#if 0 8651static PyObject* 8652unicode_freelistsize(PyUnicodeObject *self) 8653{ 8654 return PyLong_FromLong(numfree); 8655} 8656#endif 8657 8658PyDoc_STRVAR(startswith__doc__, 8659 "S.startswith(prefix[, start[, end]]) -> bool\n\ 8660\n\ 8661Return True if S starts with the specified prefix, False otherwise.\n\ 8662With optional start, test S beginning at that position.\n\ 8663With optional end, stop comparing S at that position.\n\ 8664prefix can also be a tuple of strings to try."); 8665 8666static PyObject * 8667unicode_startswith(PyUnicodeObject *self, 8668 PyObject *args) 8669{ 8670 PyObject *subobj; 8671 PyUnicodeObject *substring; 8672 Py_ssize_t start = 0; 8673 Py_ssize_t end = PY_SSIZE_T_MAX; 8674 int result; 8675 8676 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8677 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8678 return NULL; 8679 if (PyTuple_Check(subobj)) { 8680 Py_ssize_t i; 8681 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8682 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8683 PyTuple_GET_ITEM(subobj, i)); 8684 if (substring == NULL) 8685 return NULL; 8686 result = tailmatch(self, substring, start, end, -1); 8687 Py_DECREF(substring); 8688 if (result) { 8689 Py_RETURN_TRUE; 8690 } 8691 } 8692 /* nothing matched */ 8693 Py_RETURN_FALSE; 8694 } 8695 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8696 if (substring == NULL) 8697 return NULL; 8698 result = tailmatch(self, substring, start, end, -1); 8699 Py_DECREF(substring); 8700 return PyBool_FromLong(result); 8701} 8702 8703 8704PyDoc_STRVAR(endswith__doc__, 8705 "S.endswith(suffix[, start[, end]]) -> bool\n\ 8706\n\ 8707Return True if S ends with the specified suffix, False otherwise.\n\ 8708With optional start, test S beginning at that position.\n\ 8709With optional end, stop comparing S at that position.\n\ 8710suffix can also be a tuple of strings to try."); 8711 8712static PyObject * 8713unicode_endswith(PyUnicodeObject *self, 8714 PyObject *args) 8715{ 8716 PyObject *subobj; 8717 PyUnicodeObject *substring; 8718 Py_ssize_t start = 0; 8719 Py_ssize_t end = PY_SSIZE_T_MAX; 8720 int result; 8721 8722 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8723 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8724 return NULL; 8725 if (PyTuple_Check(subobj)) { 8726 Py_ssize_t i; 8727 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8728 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8729 PyTuple_GET_ITEM(subobj, i)); 8730 if (substring == NULL) 8731 return NULL; 8732 result = tailmatch(self, substring, start, end, +1); 8733 Py_DECREF(substring); 8734 if (result) { 8735 Py_RETURN_TRUE; 8736 } 8737 } 8738 Py_RETURN_FALSE; 8739 } 8740 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8741 if (substring == NULL) 8742 return NULL; 8743 8744 result = tailmatch(self, substring, start, end, +1); 8745 Py_DECREF(substring); 8746 return PyBool_FromLong(result); 8747} 8748 8749#include "stringlib/string_format.h" 8750 8751PyDoc_STRVAR(format__doc__, 8752 "S.format(*args, **kwargs) -> str\n\ 8753\n\ 8754"); 8755 8756static PyObject * 8757unicode__format__(PyObject* self, PyObject* args) 8758{ 8759 PyObject *format_spec; 8760 8761 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 8762 return NULL; 8763 8764 return _PyUnicode_FormatAdvanced(self, 8765 PyUnicode_AS_UNICODE(format_spec), 8766 PyUnicode_GET_SIZE(format_spec)); 8767} 8768 8769PyDoc_STRVAR(p_format__doc__, 8770 "S.__format__(format_spec) -> str\n\ 8771\n\ 8772"); 8773 8774static PyObject * 8775unicode__sizeof__(PyUnicodeObject *v) 8776{ 8777 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 8778 sizeof(Py_UNICODE) * (v->length + 1)); 8779} 8780 8781PyDoc_STRVAR(sizeof__doc__, 8782 "S.__sizeof__() -> size of S in memory, in bytes"); 8783 8784static PyObject * 8785unicode_getnewargs(PyUnicodeObject *v) 8786{ 8787 return Py_BuildValue("(u#)", v->str, v->length); 8788} 8789 8790 8791static PyMethodDef unicode_methods[] = { 8792 8793 /* Order is according to common usage: often used methods should 8794 appear first, since lookup is done sequentially. */ 8795 8796 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 8797 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8798 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8799 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8800 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8801 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8802 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8803 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8804 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8805 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8806 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8807 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8808 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8809 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8810 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8811 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8812 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8813 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8814 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8815 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8816 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8817 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8818 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8819 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8820 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8821 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 8822 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8823 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8824 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8825 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8826 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8827 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8828 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8829 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8830 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8831 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8832 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8833 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8834 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 8835 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8836 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8837 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 8838 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, 8839 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, 8840 {"maketrans", (PyCFunction) unicode_maketrans, 8841 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8842 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 8843#if 0 8844 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 8845#endif 8846 8847#if 0 8848 /* This one is just used for debugging the implementation. */ 8849 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 8850#endif 8851 8852 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 8853 {NULL, NULL} 8854}; 8855 8856static PyObject * 8857unicode_mod(PyObject *v, PyObject *w) 8858{ 8859 if (!PyUnicode_Check(v)) { 8860 Py_INCREF(Py_NotImplemented); 8861 return Py_NotImplemented; 8862 } 8863 return PyUnicode_Format(v, w); 8864} 8865 8866static PyNumberMethods unicode_as_number = { 8867 0, /*nb_add*/ 8868 0, /*nb_subtract*/ 8869 0, /*nb_multiply*/ 8870 unicode_mod, /*nb_remainder*/ 8871}; 8872 8873static PySequenceMethods unicode_as_sequence = { 8874 (lenfunc) unicode_length, /* sq_length */ 8875 PyUnicode_Concat, /* sq_concat */ 8876 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 8877 (ssizeargfunc) unicode_getitem, /* sq_item */ 8878 0, /* sq_slice */ 8879 0, /* sq_ass_item */ 8880 0, /* sq_ass_slice */ 8881 PyUnicode_Contains, /* sq_contains */ 8882}; 8883 8884static PyObject* 8885unicode_subscript(PyUnicodeObject* self, PyObject* item) 8886{ 8887 if (PyIndex_Check(item)) { 8888 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 8889 if (i == -1 && PyErr_Occurred()) 8890 return NULL; 8891 if (i < 0) 8892 i += PyUnicode_GET_SIZE(self); 8893 return unicode_getitem(self, i); 8894 } else if (PySlice_Check(item)) { 8895 Py_ssize_t start, stop, step, slicelength, cur, i; 8896 Py_UNICODE* source_buf; 8897 Py_UNICODE* result_buf; 8898 PyObject* result; 8899 8900 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 8901 &start, &stop, &step, &slicelength) < 0) { 8902 return NULL; 8903 } 8904 8905 if (slicelength <= 0) { 8906 return PyUnicode_FromUnicode(NULL, 0); 8907 } else if (start == 0 && step == 1 && slicelength == self->length && 8908 PyUnicode_CheckExact(self)) { 8909 Py_INCREF(self); 8910 return (PyObject *)self; 8911 } else if (step == 1) { 8912 return PyUnicode_FromUnicode(self->str + start, slicelength); 8913 } else { 8914 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 8915 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 8916 sizeof(Py_UNICODE)); 8917 8918 if (result_buf == NULL) 8919 return PyErr_NoMemory(); 8920 8921 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 8922 result_buf[i] = source_buf[cur]; 8923 } 8924 8925 result = PyUnicode_FromUnicode(result_buf, slicelength); 8926 PyObject_FREE(result_buf); 8927 return result; 8928 } 8929 } else { 8930 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 8931 return NULL; 8932 } 8933} 8934 8935static PyMappingMethods unicode_as_mapping = { 8936 (lenfunc)unicode_length, /* mp_length */ 8937 (binaryfunc)unicode_subscript, /* mp_subscript */ 8938 (objobjargproc)0, /* mp_ass_subscript */ 8939}; 8940 8941 8942/* Helpers for PyUnicode_Format() */ 8943 8944static PyObject * 8945getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 8946{ 8947 Py_ssize_t argidx = *p_argidx; 8948 if (argidx < arglen) { 8949 (*p_argidx)++; 8950 if (arglen < 0) 8951 return args; 8952 else 8953 return PyTuple_GetItem(args, argidx); 8954 } 8955 PyErr_SetString(PyExc_TypeError, 8956 "not enough arguments for format string"); 8957 return NULL; 8958} 8959 8960/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 8961 8962static PyObject * 8963formatfloat(PyObject *v, int flags, int prec, int type) 8964{ 8965 char *p; 8966 PyObject *result; 8967 double x; 8968 8969 x = PyFloat_AsDouble(v); 8970 if (x == -1.0 && PyErr_Occurred()) 8971 return NULL; 8972 8973 if (prec < 0) 8974 prec = 6; 8975 8976 p = PyOS_double_to_string(x, type, prec, 8977 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 8978 if (p == NULL) 8979 return NULL; 8980 result = PyUnicode_FromStringAndSize(p, strlen(p)); 8981 PyMem_Free(p); 8982 return result; 8983} 8984 8985static PyObject* 8986formatlong(PyObject *val, int flags, int prec, int type) 8987{ 8988 char *buf; 8989 int len; 8990 PyObject *str; /* temporary string object. */ 8991 PyObject *result; 8992 8993 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 8994 if (!str) 8995 return NULL; 8996 result = PyUnicode_FromStringAndSize(buf, len); 8997 Py_DECREF(str); 8998 return result; 8999} 9000 9001static int 9002formatchar(Py_UNICODE *buf, 9003 size_t buflen, 9004 PyObject *v) 9005{ 9006 /* presume that the buffer is at least 3 characters long */ 9007 if (PyUnicode_Check(v)) { 9008 if (PyUnicode_GET_SIZE(v) == 1) { 9009 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9010 buf[1] = '\0'; 9011 return 1; 9012 } 9013#ifndef Py_UNICODE_WIDE 9014 if (PyUnicode_GET_SIZE(v) == 2) { 9015 /* Decode a valid surrogate pair */ 9016 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9017 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9018 if (0xD800 <= c0 && c0 <= 0xDBFF && 9019 0xDC00 <= c1 && c1 <= 0xDFFF) { 9020 buf[0] = c0; 9021 buf[1] = c1; 9022 buf[2] = '\0'; 9023 return 2; 9024 } 9025 } 9026#endif 9027 goto onError; 9028 } 9029 else { 9030 /* Integer input truncated to a character */ 9031 long x; 9032 x = PyLong_AsLong(v); 9033 if (x == -1 && PyErr_Occurred()) 9034 goto onError; 9035 9036 if (x < 0 || x > 0x10ffff) { 9037 PyErr_SetString(PyExc_OverflowError, 9038 "%c arg not in range(0x110000)"); 9039 return -1; 9040 } 9041 9042#ifndef Py_UNICODE_WIDE 9043 if (x > 0xffff) { 9044 x -= 0x10000; 9045 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9046 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9047 return 2; 9048 } 9049#endif 9050 buf[0] = (Py_UNICODE) x; 9051 buf[1] = '\0'; 9052 return 1; 9053 } 9054 9055 onError: 9056 PyErr_SetString(PyExc_TypeError, 9057 "%c requires int or char"); 9058 return -1; 9059} 9060 9061/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 9062 FORMATBUFLEN is the length of the buffer in which chars are formatted. 9063*/ 9064#define FORMATBUFLEN (size_t)10 9065 9066PyObject *PyUnicode_Format(PyObject *format, 9067 PyObject *args) 9068{ 9069 Py_UNICODE *fmt, *res; 9070 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9071 int args_owned = 0; 9072 PyUnicodeObject *result = NULL; 9073 PyObject *dict = NULL; 9074 PyObject *uformat; 9075 9076 if (format == NULL || args == NULL) { 9077 PyErr_BadInternalCall(); 9078 return NULL; 9079 } 9080 uformat = PyUnicode_FromObject(format); 9081 if (uformat == NULL) 9082 return NULL; 9083 fmt = PyUnicode_AS_UNICODE(uformat); 9084 fmtcnt = PyUnicode_GET_SIZE(uformat); 9085 9086 reslen = rescnt = fmtcnt + 100; 9087 result = _PyUnicode_New(reslen); 9088 if (result == NULL) 9089 goto onError; 9090 res = PyUnicode_AS_UNICODE(result); 9091 9092 if (PyTuple_Check(args)) { 9093 arglen = PyTuple_Size(args); 9094 argidx = 0; 9095 } 9096 else { 9097 arglen = -1; 9098 argidx = -2; 9099 } 9100 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9101 !PyUnicode_Check(args)) 9102 dict = args; 9103 9104 while (--fmtcnt >= 0) { 9105 if (*fmt != '%') { 9106 if (--rescnt < 0) { 9107 rescnt = fmtcnt + 100; 9108 reslen += rescnt; 9109 if (_PyUnicode_Resize(&result, reslen) < 0) 9110 goto onError; 9111 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9112 --rescnt; 9113 } 9114 *res++ = *fmt++; 9115 } 9116 else { 9117 /* Got a format specifier */ 9118 int flags = 0; 9119 Py_ssize_t width = -1; 9120 int prec = -1; 9121 Py_UNICODE c = '\0'; 9122 Py_UNICODE fill; 9123 int isnumok; 9124 PyObject *v = NULL; 9125 PyObject *temp = NULL; 9126 Py_UNICODE *pbuf; 9127 Py_UNICODE sign; 9128 Py_ssize_t len; 9129 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9130 9131 fmt++; 9132 if (*fmt == '(') { 9133 Py_UNICODE *keystart; 9134 Py_ssize_t keylen; 9135 PyObject *key; 9136 int pcount = 1; 9137 9138 if (dict == NULL) { 9139 PyErr_SetString(PyExc_TypeError, 9140 "format requires a mapping"); 9141 goto onError; 9142 } 9143 ++fmt; 9144 --fmtcnt; 9145 keystart = fmt; 9146 /* Skip over balanced parentheses */ 9147 while (pcount > 0 && --fmtcnt >= 0) { 9148 if (*fmt == ')') 9149 --pcount; 9150 else if (*fmt == '(') 9151 ++pcount; 9152 fmt++; 9153 } 9154 keylen = fmt - keystart - 1; 9155 if (fmtcnt < 0 || pcount > 0) { 9156 PyErr_SetString(PyExc_ValueError, 9157 "incomplete format key"); 9158 goto onError; 9159 } 9160#if 0 9161 /* keys are converted to strings using UTF-8 and 9162 then looked up since Python uses strings to hold 9163 variables names etc. in its namespaces and we 9164 wouldn't want to break common idioms. */ 9165 key = PyUnicode_EncodeUTF8(keystart, 9166 keylen, 9167 NULL); 9168#else 9169 key = PyUnicode_FromUnicode(keystart, keylen); 9170#endif 9171 if (key == NULL) 9172 goto onError; 9173 if (args_owned) { 9174 Py_DECREF(args); 9175 args_owned = 0; 9176 } 9177 args = PyObject_GetItem(dict, key); 9178 Py_DECREF(key); 9179 if (args == NULL) { 9180 goto onError; 9181 } 9182 args_owned = 1; 9183 arglen = -1; 9184 argidx = -2; 9185 } 9186 while (--fmtcnt >= 0) { 9187 switch (c = *fmt++) { 9188 case '-': flags |= F_LJUST; continue; 9189 case '+': flags |= F_SIGN; continue; 9190 case ' ': flags |= F_BLANK; continue; 9191 case '#': flags |= F_ALT; continue; 9192 case '0': flags |= F_ZERO; continue; 9193 } 9194 break; 9195 } 9196 if (c == '*') { 9197 v = getnextarg(args, arglen, &argidx); 9198 if (v == NULL) 9199 goto onError; 9200 if (!PyLong_Check(v)) { 9201 PyErr_SetString(PyExc_TypeError, 9202 "* wants int"); 9203 goto onError; 9204 } 9205 width = PyLong_AsLong(v); 9206 if (width == -1 && PyErr_Occurred()) 9207 goto onError; 9208 if (width < 0) { 9209 flags |= F_LJUST; 9210 width = -width; 9211 } 9212 if (--fmtcnt >= 0) 9213 c = *fmt++; 9214 } 9215 else if (c >= '0' && c <= '9') { 9216 width = c - '0'; 9217 while (--fmtcnt >= 0) { 9218 c = *fmt++; 9219 if (c < '0' || c > '9') 9220 break; 9221 if ((width*10) / 10 != width) { 9222 PyErr_SetString(PyExc_ValueError, 9223 "width too big"); 9224 goto onError; 9225 } 9226 width = width*10 + (c - '0'); 9227 } 9228 } 9229 if (c == '.') { 9230 prec = 0; 9231 if (--fmtcnt >= 0) 9232 c = *fmt++; 9233 if (c == '*') { 9234 v = getnextarg(args, arglen, &argidx); 9235 if (v == NULL) 9236 goto onError; 9237 if (!PyLong_Check(v)) { 9238 PyErr_SetString(PyExc_TypeError, 9239 "* wants int"); 9240 goto onError; 9241 } 9242 prec = PyLong_AsLong(v); 9243 if (prec == -1 && PyErr_Occurred()) 9244 goto onError; 9245 if (prec < 0) 9246 prec = 0; 9247 if (--fmtcnt >= 0) 9248 c = *fmt++; 9249 } 9250 else if (c >= '0' && c <= '9') { 9251 prec = c - '0'; 9252 while (--fmtcnt >= 0) { 9253 c = *fmt++; 9254 if (c < '0' || c > '9') 9255 break; 9256 if ((prec*10) / 10 != prec) { 9257 PyErr_SetString(PyExc_ValueError, 9258 "prec too big"); 9259 goto onError; 9260 } 9261 prec = prec*10 + (c - '0'); 9262 } 9263 } 9264 } /* prec */ 9265 if (fmtcnt >= 0) { 9266 if (c == 'h' || c == 'l' || c == 'L') { 9267 if (--fmtcnt >= 0) 9268 c = *fmt++; 9269 } 9270 } 9271 if (fmtcnt < 0) { 9272 PyErr_SetString(PyExc_ValueError, 9273 "incomplete format"); 9274 goto onError; 9275 } 9276 if (c != '%') { 9277 v = getnextarg(args, arglen, &argidx); 9278 if (v == NULL) 9279 goto onError; 9280 } 9281 sign = 0; 9282 fill = ' '; 9283 switch (c) { 9284 9285 case '%': 9286 pbuf = formatbuf; 9287 /* presume that buffer length is at least 1 */ 9288 pbuf[0] = '%'; 9289 len = 1; 9290 break; 9291 9292 case 's': 9293 case 'r': 9294 case 'a': 9295 if (PyUnicode_CheckExact(v) && c == 's') { 9296 temp = v; 9297 Py_INCREF(temp); 9298 } 9299 else { 9300 if (c == 's') 9301 temp = PyObject_Str(v); 9302 else if (c == 'r') 9303 temp = PyObject_Repr(v); 9304 else 9305 temp = PyObject_ASCII(v); 9306 if (temp == NULL) 9307 goto onError; 9308 if (PyUnicode_Check(temp)) 9309 /* nothing to do */; 9310 else { 9311 Py_DECREF(temp); 9312 PyErr_SetString(PyExc_TypeError, 9313 "%s argument has non-string str()"); 9314 goto onError; 9315 } 9316 } 9317 pbuf = PyUnicode_AS_UNICODE(temp); 9318 len = PyUnicode_GET_SIZE(temp); 9319 if (prec >= 0 && len > prec) 9320 len = prec; 9321 break; 9322 9323 case 'i': 9324 case 'd': 9325 case 'u': 9326 case 'o': 9327 case 'x': 9328 case 'X': 9329 if (c == 'i') 9330 c = 'd'; 9331 isnumok = 0; 9332 if (PyNumber_Check(v)) { 9333 PyObject *iobj=NULL; 9334 9335 if (PyLong_Check(v)) { 9336 iobj = v; 9337 Py_INCREF(iobj); 9338 } 9339 else { 9340 iobj = PyNumber_Long(v); 9341 } 9342 if (iobj!=NULL) { 9343 if (PyLong_Check(iobj)) { 9344 isnumok = 1; 9345 temp = formatlong(iobj, flags, prec, c); 9346 Py_DECREF(iobj); 9347 if (!temp) 9348 goto onError; 9349 pbuf = PyUnicode_AS_UNICODE(temp); 9350 len = PyUnicode_GET_SIZE(temp); 9351 sign = 1; 9352 } 9353 else { 9354 Py_DECREF(iobj); 9355 } 9356 } 9357 } 9358 if (!isnumok) { 9359 PyErr_Format(PyExc_TypeError, 9360 "%%%c format: a number is required, " 9361 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9362 goto onError; 9363 } 9364 if (flags & F_ZERO) 9365 fill = '0'; 9366 break; 9367 9368 case 'e': 9369 case 'E': 9370 case 'f': 9371 case 'F': 9372 case 'g': 9373 case 'G': 9374 temp = formatfloat(v, flags, prec, c); 9375 if (!temp) 9376 goto onError; 9377 pbuf = PyUnicode_AS_UNICODE(temp); 9378 len = PyUnicode_GET_SIZE(temp); 9379 sign = 1; 9380 if (flags & F_ZERO) 9381 fill = '0'; 9382 break; 9383 9384 case 'c': 9385 pbuf = formatbuf; 9386 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9387 if (len < 0) 9388 goto onError; 9389 break; 9390 9391 default: 9392 PyErr_Format(PyExc_ValueError, 9393 "unsupported format character '%c' (0x%x) " 9394 "at index %zd", 9395 (31<=c && c<=126) ? (char)c : '?', 9396 (int)c, 9397 (Py_ssize_t)(fmt - 1 - 9398 PyUnicode_AS_UNICODE(uformat))); 9399 goto onError; 9400 } 9401 if (sign) { 9402 if (*pbuf == '-' || *pbuf == '+') { 9403 sign = *pbuf++; 9404 len--; 9405 } 9406 else if (flags & F_SIGN) 9407 sign = '+'; 9408 else if (flags & F_BLANK) 9409 sign = ' '; 9410 else 9411 sign = 0; 9412 } 9413 if (width < len) 9414 width = len; 9415 if (rescnt - (sign != 0) < width) { 9416 reslen -= rescnt; 9417 rescnt = width + fmtcnt + 100; 9418 reslen += rescnt; 9419 if (reslen < 0) { 9420 Py_XDECREF(temp); 9421 PyErr_NoMemory(); 9422 goto onError; 9423 } 9424 if (_PyUnicode_Resize(&result, reslen) < 0) { 9425 Py_XDECREF(temp); 9426 goto onError; 9427 } 9428 res = PyUnicode_AS_UNICODE(result) 9429 + reslen - rescnt; 9430 } 9431 if (sign) { 9432 if (fill != ' ') 9433 *res++ = sign; 9434 rescnt--; 9435 if (width > len) 9436 width--; 9437 } 9438 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9439 assert(pbuf[0] == '0'); 9440 assert(pbuf[1] == c); 9441 if (fill != ' ') { 9442 *res++ = *pbuf++; 9443 *res++ = *pbuf++; 9444 } 9445 rescnt -= 2; 9446 width -= 2; 9447 if (width < 0) 9448 width = 0; 9449 len -= 2; 9450 } 9451 if (width > len && !(flags & F_LJUST)) { 9452 do { 9453 --rescnt; 9454 *res++ = fill; 9455 } while (--width > len); 9456 } 9457 if (fill == ' ') { 9458 if (sign) 9459 *res++ = sign; 9460 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9461 assert(pbuf[0] == '0'); 9462 assert(pbuf[1] == c); 9463 *res++ = *pbuf++; 9464 *res++ = *pbuf++; 9465 } 9466 } 9467 Py_UNICODE_COPY(res, pbuf, len); 9468 res += len; 9469 rescnt -= len; 9470 while (--width >= len) { 9471 --rescnt; 9472 *res++ = ' '; 9473 } 9474 if (dict && (argidx < arglen) && c != '%') { 9475 PyErr_SetString(PyExc_TypeError, 9476 "not all arguments converted during string formatting"); 9477 Py_XDECREF(temp); 9478 goto onError; 9479 } 9480 Py_XDECREF(temp); 9481 } /* '%' */ 9482 } /* until end */ 9483 if (argidx < arglen && !dict) { 9484 PyErr_SetString(PyExc_TypeError, 9485 "not all arguments converted during string formatting"); 9486 goto onError; 9487 } 9488 9489 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9490 goto onError; 9491 if (args_owned) { 9492 Py_DECREF(args); 9493 } 9494 Py_DECREF(uformat); 9495 return (PyObject *)result; 9496 9497 onError: 9498 Py_XDECREF(result); 9499 Py_DECREF(uformat); 9500 if (args_owned) { 9501 Py_DECREF(args); 9502 } 9503 return NULL; 9504} 9505 9506static PyObject * 9507unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9508 9509static PyObject * 9510unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9511{ 9512 PyObject *x = NULL; 9513 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9514 char *encoding = NULL; 9515 char *errors = NULL; 9516 9517 if (type != &PyUnicode_Type) 9518 return unicode_subtype_new(type, args, kwds); 9519 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9520 kwlist, &x, &encoding, &errors)) 9521 return NULL; 9522 if (x == NULL) 9523 return (PyObject *)_PyUnicode_New(0); 9524 if (encoding == NULL && errors == NULL) 9525 return PyObject_Str(x); 9526 else 9527 return PyUnicode_FromEncodedObject(x, encoding, errors); 9528} 9529 9530static PyObject * 9531unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9532{ 9533 PyUnicodeObject *tmp, *pnew; 9534 Py_ssize_t n; 9535 9536 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9537 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9538 if (tmp == NULL) 9539 return NULL; 9540 assert(PyUnicode_Check(tmp)); 9541 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9542 if (pnew == NULL) { 9543 Py_DECREF(tmp); 9544 return NULL; 9545 } 9546 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9547 if (pnew->str == NULL) { 9548 _Py_ForgetReference((PyObject *)pnew); 9549 PyObject_Del(pnew); 9550 Py_DECREF(tmp); 9551 return PyErr_NoMemory(); 9552 } 9553 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9554 pnew->length = n; 9555 pnew->hash = tmp->hash; 9556 Py_DECREF(tmp); 9557 return (PyObject *)pnew; 9558} 9559 9560PyDoc_STRVAR(unicode_doc, 9561 "str(string[, encoding[, errors]]) -> str\n\ 9562\n\ 9563Create a new string object from the given encoded string.\n\ 9564encoding defaults to the current default string encoding.\n\ 9565errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9566 9567static PyObject *unicode_iter(PyObject *seq); 9568 9569PyTypeObject PyUnicode_Type = { 9570 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9571 "str", /* tp_name */ 9572 sizeof(PyUnicodeObject), /* tp_size */ 9573 0, /* tp_itemsize */ 9574 /* Slots */ 9575 (destructor)unicode_dealloc, /* tp_dealloc */ 9576 0, /* tp_print */ 9577 0, /* tp_getattr */ 9578 0, /* tp_setattr */ 9579 0, /* tp_reserved */ 9580 unicode_repr, /* tp_repr */ 9581 &unicode_as_number, /* tp_as_number */ 9582 &unicode_as_sequence, /* tp_as_sequence */ 9583 &unicode_as_mapping, /* tp_as_mapping */ 9584 (hashfunc) unicode_hash, /* tp_hash*/ 9585 0, /* tp_call*/ 9586 (reprfunc) unicode_str, /* tp_str */ 9587 PyObject_GenericGetAttr, /* tp_getattro */ 9588 0, /* tp_setattro */ 9589 0, /* tp_as_buffer */ 9590 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9591 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9592 unicode_doc, /* tp_doc */ 9593 0, /* tp_traverse */ 9594 0, /* tp_clear */ 9595 PyUnicode_RichCompare, /* tp_richcompare */ 9596 0, /* tp_weaklistoffset */ 9597 unicode_iter, /* tp_iter */ 9598 0, /* tp_iternext */ 9599 unicode_methods, /* tp_methods */ 9600 0, /* tp_members */ 9601 0, /* tp_getset */ 9602 &PyBaseObject_Type, /* tp_base */ 9603 0, /* tp_dict */ 9604 0, /* tp_descr_get */ 9605 0, /* tp_descr_set */ 9606 0, /* tp_dictoffset */ 9607 0, /* tp_init */ 9608 0, /* tp_alloc */ 9609 unicode_new, /* tp_new */ 9610 PyObject_Del, /* tp_free */ 9611}; 9612 9613/* Initialize the Unicode implementation */ 9614 9615void _PyUnicode_Init(void) 9616{ 9617 int i; 9618 9619 /* XXX - move this array to unicodectype.c ? */ 9620 Py_UNICODE linebreak[] = { 9621 0x000A, /* LINE FEED */ 9622 0x000D, /* CARRIAGE RETURN */ 9623 0x001C, /* FILE SEPARATOR */ 9624 0x001D, /* GROUP SEPARATOR */ 9625 0x001E, /* RECORD SEPARATOR */ 9626 0x0085, /* NEXT LINE */ 9627 0x2028, /* LINE SEPARATOR */ 9628 0x2029, /* PARAGRAPH SEPARATOR */ 9629 }; 9630 9631 /* Init the implementation */ 9632 free_list = NULL; 9633 numfree = 0; 9634 unicode_empty = _PyUnicode_New(0); 9635 if (!unicode_empty) 9636 return; 9637 9638 for (i = 0; i < 256; i++) 9639 unicode_latin1[i] = NULL; 9640 if (PyType_Ready(&PyUnicode_Type) < 0) 9641 Py_FatalError("Can't initialize 'unicode'"); 9642 9643 /* initialize the linebreak bloom filter */ 9644 bloom_linebreak = make_bloom_mask( 9645 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9646 ); 9647 9648 PyType_Ready(&EncodingMapType); 9649} 9650 9651/* Finalize the Unicode implementation */ 9652 9653int 9654PyUnicode_ClearFreeList(void) 9655{ 9656 int freelist_size = numfree; 9657 PyUnicodeObject *u; 9658 9659 for (u = free_list; u != NULL;) { 9660 PyUnicodeObject *v = u; 9661 u = *(PyUnicodeObject **)u; 9662 if (v->str) 9663 PyObject_DEL(v->str); 9664 Py_XDECREF(v->defenc); 9665 PyObject_Del(v); 9666 numfree--; 9667 } 9668 free_list = NULL; 9669 assert(numfree == 0); 9670 return freelist_size; 9671} 9672 9673void 9674_PyUnicode_Fini(void) 9675{ 9676 int i; 9677 9678 Py_XDECREF(unicode_empty); 9679 unicode_empty = NULL; 9680 9681 for (i = 0; i < 256; i++) { 9682 if (unicode_latin1[i]) { 9683 Py_DECREF(unicode_latin1[i]); 9684 unicode_latin1[i] = NULL; 9685 } 9686 } 9687 (void)PyUnicode_ClearFreeList(); 9688} 9689 9690void 9691PyUnicode_InternInPlace(PyObject **p) 9692{ 9693 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9694 PyObject *t; 9695 if (s == NULL || !PyUnicode_Check(s)) 9696 Py_FatalError( 9697 "PyUnicode_InternInPlace: unicode strings only please!"); 9698 /* If it's a subclass, we don't really know what putting 9699 it in the interned dict might do. */ 9700 if (!PyUnicode_CheckExact(s)) 9701 return; 9702 if (PyUnicode_CHECK_INTERNED(s)) 9703 return; 9704 if (interned == NULL) { 9705 interned = PyDict_New(); 9706 if (interned == NULL) { 9707 PyErr_Clear(); /* Don't leave an exception */ 9708 return; 9709 } 9710 } 9711 /* It might be that the GetItem call fails even 9712 though the key is present in the dictionary, 9713 namely when this happens during a stack overflow. */ 9714 Py_ALLOW_RECURSION 9715 t = PyDict_GetItem(interned, (PyObject *)s); 9716 Py_END_ALLOW_RECURSION 9717 9718 if (t) { 9719 Py_INCREF(t); 9720 Py_DECREF(*p); 9721 *p = t; 9722 return; 9723 } 9724 9725 PyThreadState_GET()->recursion_critical = 1; 9726 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9727 PyErr_Clear(); 9728 PyThreadState_GET()->recursion_critical = 0; 9729 return; 9730 } 9731 PyThreadState_GET()->recursion_critical = 0; 9732 /* The two references in interned are not counted by refcnt. 9733 The deallocator will take care of this */ 9734 Py_REFCNT(s) -= 2; 9735 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9736} 9737 9738void 9739PyUnicode_InternImmortal(PyObject **p) 9740{ 9741 PyUnicode_InternInPlace(p); 9742 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9743 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9744 Py_INCREF(*p); 9745 } 9746} 9747 9748PyObject * 9749PyUnicode_InternFromString(const char *cp) 9750{ 9751 PyObject *s = PyUnicode_FromString(cp); 9752 if (s == NULL) 9753 return NULL; 9754 PyUnicode_InternInPlace(&s); 9755 return s; 9756} 9757 9758void _Py_ReleaseInternedUnicodeStrings(void) 9759{ 9760 PyObject *keys; 9761 PyUnicodeObject *s; 9762 Py_ssize_t i, n; 9763 Py_ssize_t immortal_size = 0, mortal_size = 0; 9764 9765 if (interned == NULL || !PyDict_Check(interned)) 9766 return; 9767 keys = PyDict_Keys(interned); 9768 if (keys == NULL || !PyList_Check(keys)) { 9769 PyErr_Clear(); 9770 return; 9771 } 9772 9773 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9774 detector, interned unicode strings are not forcibly deallocated; 9775 rather, we give them their stolen references back, and then clear 9776 and DECREF the interned dict. */ 9777 9778 n = PyList_GET_SIZE(keys); 9779 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9780 n); 9781 for (i = 0; i < n; i++) { 9782 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9783 switch (s->state) { 9784 case SSTATE_NOT_INTERNED: 9785 /* XXX Shouldn't happen */ 9786 break; 9787 case SSTATE_INTERNED_IMMORTAL: 9788 Py_REFCNT(s) += 1; 9789 immortal_size += s->length; 9790 break; 9791 case SSTATE_INTERNED_MORTAL: 9792 Py_REFCNT(s) += 2; 9793 mortal_size += s->length; 9794 break; 9795 default: 9796 Py_FatalError("Inconsistent interned string state."); 9797 } 9798 s->state = SSTATE_NOT_INTERNED; 9799 } 9800 fprintf(stderr, "total size of all interned strings: " 9801 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9802 "mortal/immortal\n", mortal_size, immortal_size); 9803 Py_DECREF(keys); 9804 PyDict_Clear(interned); 9805 Py_DECREF(interned); 9806 interned = NULL; 9807} 9808 9809 9810/********************* Unicode Iterator **************************/ 9811 9812typedef struct { 9813 PyObject_HEAD 9814 Py_ssize_t it_index; 9815 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9816} unicodeiterobject; 9817 9818static void 9819unicodeiter_dealloc(unicodeiterobject *it) 9820{ 9821 _PyObject_GC_UNTRACK(it); 9822 Py_XDECREF(it->it_seq); 9823 PyObject_GC_Del(it); 9824} 9825 9826static int 9827unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9828{ 9829 Py_VISIT(it->it_seq); 9830 return 0; 9831} 9832 9833static PyObject * 9834unicodeiter_next(unicodeiterobject *it) 9835{ 9836 PyUnicodeObject *seq; 9837 PyObject *item; 9838 9839 assert(it != NULL); 9840 seq = it->it_seq; 9841 if (seq == NULL) 9842 return NULL; 9843 assert(PyUnicode_Check(seq)); 9844 9845 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 9846 item = PyUnicode_FromUnicode( 9847 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 9848 if (item != NULL) 9849 ++it->it_index; 9850 return item; 9851 } 9852 9853 Py_DECREF(seq); 9854 it->it_seq = NULL; 9855 return NULL; 9856} 9857 9858static PyObject * 9859unicodeiter_len(unicodeiterobject *it) 9860{ 9861 Py_ssize_t len = 0; 9862 if (it->it_seq) 9863 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 9864 return PyLong_FromSsize_t(len); 9865} 9866 9867PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 9868 9869static PyMethodDef unicodeiter_methods[] = { 9870 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 9871 length_hint_doc}, 9872 {NULL, NULL} /* sentinel */ 9873}; 9874 9875PyTypeObject PyUnicodeIter_Type = { 9876 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9877 "str_iterator", /* tp_name */ 9878 sizeof(unicodeiterobject), /* tp_basicsize */ 9879 0, /* tp_itemsize */ 9880 /* methods */ 9881 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 9882 0, /* tp_print */ 9883 0, /* tp_getattr */ 9884 0, /* tp_setattr */ 9885 0, /* tp_reserved */ 9886 0, /* tp_repr */ 9887 0, /* tp_as_number */ 9888 0, /* tp_as_sequence */ 9889 0, /* tp_as_mapping */ 9890 0, /* tp_hash */ 9891 0, /* tp_call */ 9892 0, /* tp_str */ 9893 PyObject_GenericGetAttr, /* tp_getattro */ 9894 0, /* tp_setattro */ 9895 0, /* tp_as_buffer */ 9896 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 9897 0, /* tp_doc */ 9898 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 9899 0, /* tp_clear */ 9900 0, /* tp_richcompare */ 9901 0, /* tp_weaklistoffset */ 9902 PyObject_SelfIter, /* tp_iter */ 9903 (iternextfunc)unicodeiter_next, /* tp_iternext */ 9904 unicodeiter_methods, /* tp_methods */ 9905 0, 9906}; 9907 9908static PyObject * 9909unicode_iter(PyObject *seq) 9910{ 9911 unicodeiterobject *it; 9912 9913 if (!PyUnicode_Check(seq)) { 9914 PyErr_BadInternalCall(); 9915 return NULL; 9916 } 9917 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 9918 if (it == NULL) 9919 return NULL; 9920 it->it_index = 0; 9921 Py_INCREF(seq); 9922 it->it_seq = (PyUnicodeObject *)seq; 9923 _PyObject_GC_TRACK(it); 9924 return (PyObject *)it; 9925} 9926 9927size_t 9928Py_UNICODE_strlen(const Py_UNICODE *u) 9929{ 9930 int res = 0; 9931 while(*u++) 9932 res++; 9933 return res; 9934} 9935 9936Py_UNICODE* 9937Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 9938{ 9939 Py_UNICODE *u = s1; 9940 while ((*u++ = *s2++)); 9941 return s1; 9942} 9943 9944Py_UNICODE* 9945Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9946{ 9947 Py_UNICODE *u = s1; 9948 while ((*u++ = *s2++)) 9949 if (n-- == 0) 9950 break; 9951 return s1; 9952} 9953 9954Py_UNICODE* 9955Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 9956{ 9957 Py_UNICODE *u1 = s1; 9958 u1 += Py_UNICODE_strlen(u1); 9959 Py_UNICODE_strcpy(u1, s2); 9960 return s1; 9961} 9962 9963int 9964Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 9965{ 9966 while (*s1 && *s2 && *s1 == *s2) 9967 s1++, s2++; 9968 if (*s1 && *s2) 9969 return (*s1 < *s2) ? -1 : +1; 9970 if (*s1) 9971 return 1; 9972 if (*s2) 9973 return -1; 9974 return 0; 9975} 9976 9977int 9978Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 9979{ 9980 register Py_UNICODE u1, u2; 9981 for (; n != 0; n--) { 9982 u1 = *s1; 9983 u2 = *s2; 9984 if (u1 != u2) 9985 return (u1 < u2) ? -1 : +1; 9986 if (u1 == '\0') 9987 return 0; 9988 s1++; 9989 s2++; 9990 } 9991 return 0; 9992} 9993 9994Py_UNICODE* 9995Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9996{ 9997 const Py_UNICODE *p; 9998 for (p = s; *p; p++) 9999 if (*p == c) 10000 return (Py_UNICODE*)p; 10001 return NULL; 10002} 10003 10004Py_UNICODE* 10005Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10006{ 10007 const Py_UNICODE *p; 10008 p = s + Py_UNICODE_strlen(s); 10009 while (p != s) { 10010 p--; 10011 if (*p == c) 10012 return (Py_UNICODE*)p; 10013 } 10014 return NULL; 10015} 10016 10017Py_UNICODE* 10018PyUnicode_AsUnicodeCopy(PyObject *object) 10019{ 10020 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10021 Py_UNICODE *copy; 10022 Py_ssize_t size; 10023 10024 /* Ensure we won't overflow the size. */ 10025 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10026 PyErr_NoMemory(); 10027 return NULL; 10028 } 10029 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10030 size *= sizeof(Py_UNICODE); 10031 copy = PyMem_Malloc(size); 10032 if (copy == NULL) { 10033 PyErr_NoMemory(); 10034 return NULL; 10035 } 10036 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10037 return copy; 10038} 10039 10040#ifdef __cplusplus 10041} 10042#endif 10043