unicodeobject.c revision 51d2fd983bcc85342b631e27a33e214c691e53be
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Fast detection of the most frequent whitespace characters */ 118const unsigned char _Py_ascii_whitespace[] = { 119 0, 0, 0, 0, 0, 0, 0, 0, 120/* case 0x0009: * CHARACTER TABULATION */ 121/* case 0x000A: * LINE FEED */ 122/* case 0x000B: * LINE TABULATION */ 123/* case 0x000C: * FORM FEED */ 124/* case 0x000D: * CARRIAGE RETURN */ 125 0, 1, 1, 1, 1, 1, 0, 0, 126 0, 0, 0, 0, 0, 0, 0, 0, 127/* case 0x001C: * FILE SEPARATOR */ 128/* case 0x001D: * GROUP SEPARATOR */ 129/* case 0x001E: * RECORD SEPARATOR */ 130/* case 0x001F: * UNIT SEPARATOR */ 131 0, 0, 0, 0, 1, 1, 1, 1, 132/* case 0x0020: * SPACE */ 133 1, 0, 0, 0, 0, 0, 0, 0, 134 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0 146}; 147 148static PyObject *unicode_encode_call_errorhandler(const char *errors, 149 PyObject **errorHandler,const char *encoding, const char *reason, 150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 152 153static void raise_encode_exception(PyObject **exceptionObject, 154 const char *encoding, 155 const Py_UNICODE *unicode, Py_ssize_t size, 156 Py_ssize_t startpos, Py_ssize_t endpos, 157 const char *reason); 158 159/* Same for linebreaks */ 160static unsigned char ascii_linebreak[] = { 161 0, 0, 0, 0, 0, 0, 0, 0, 162/* 0x000A, * LINE FEED 
*/ 163/* 0x000B, * LINE TABULATION */ 164/* 0x000C, * FORM FEED */ 165/* 0x000D, * CARRIAGE RETURN */ 166 0, 0, 1, 1, 1, 1, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168/* 0x001C, * FILE SEPARATOR */ 169/* 0x001D, * GROUP SEPARATOR */ 170/* 0x001E, * RECORD SEPARATOR */ 171 0, 0, 0, 0, 1, 1, 1, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 0, 0, 0, 0 185}; 186 187 188Py_UNICODE 189PyUnicode_GetMax(void) 190{ 191#ifdef Py_UNICODE_WIDE 192 return 0x10FFFF; 193#else 194 /* This is actually an illegal character, so it should 195 not be passed to unichr. */ 196 return 0xFFFF; 197#endif 198} 199 200/* --- Bloom Filters ----------------------------------------------------- */ 201 202/* stuff to implement simple "bloom filters" for Unicode characters. 203 to keep things simple, we use a single bitmask, using the least 5 204 bits from each unicode characters as the bit index. */ 205 206/* the linebreak mask is set up by Unicode_Init below */ 207 208#if LONG_BIT >= 128 209#define BLOOM_WIDTH 128 210#elif LONG_BIT >= 64 211#define BLOOM_WIDTH 64 212#elif LONG_BIT >= 32 213#define BLOOM_WIDTH 32 214#else 215#error "LONG_BIT is smaller than 32" 216#endif 217 218#define BLOOM_MASK unsigned long 219 220static BLOOM_MASK bloom_linebreak; 221 222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 224 225#define BLOOM_LINEBREAK(ch) \ 226 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 228 229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230{ 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241} 242 243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 244{ 245 Py_ssize_t i; 246 247 for (i = 0; i < setlen; i++) 248 if (set[i] == chr) 249 return 1; 250 251 return 0; 252} 253 254#define BLOOM_MEMBER(mask, chr, set, setlen) \ 255 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 256 257/* --- Unicode Object ----------------------------------------------------- */ 258 259static 260int unicode_resize(register PyUnicodeObject *unicode, 261 Py_ssize_t length) 262{ 263 void *oldstr; 264 265 /* Shortcut if there's nothing much to do. */ 266 if (unicode->length == length) 267 goto reset; 268 269 /* Resizing shared object (unicode_empty or single character 270 objects) in-place is not allowed. Use PyUnicode_Resize() 271 instead ! */ 272 273 if (unicode == unicode_empty || 274 (unicode->length == 1 && 275 unicode->str[0] < 256U && 276 unicode_latin1[unicode->str[0]] == unicode)) { 277 PyErr_SetString(PyExc_SystemError, 278 "can't resize shared str objects"); 279 return -1; 280 } 281 282 /* We allocate one more byte to make sure the string is Ux0000 terminated. 283 The overallocation is also used by fastsearch, which assumes that it's 284 safe to look at str[length] (without making any assumptions about what 285 it contains). 
*/ 286 287 oldstr = unicode->str; 288 unicode->str = PyObject_REALLOC(unicode->str, 289 sizeof(Py_UNICODE) * (length + 1)); 290 if (!unicode->str) { 291 unicode->str = (Py_UNICODE *)oldstr; 292 PyErr_NoMemory(); 293 return -1; 294 } 295 unicode->str[length] = 0; 296 unicode->length = length; 297 298 reset: 299 /* Reset the object caches */ 300 if (unicode->defenc) { 301 Py_CLEAR(unicode->defenc); 302 } 303 unicode->hash = -1; 304 305 return 0; 306} 307 308/* We allocate one more byte to make sure the string is 309 Ux0000 terminated; some code (e.g. new_identifier) 310 relies on that. 311 312 XXX This allocator could further be enhanced by assuring that the 313 free list never reduces its size below 1. 314 315*/ 316 317static 318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 319{ 320 register PyUnicodeObject *unicode; 321 322 /* Optimization for empty strings */ 323 if (length == 0 && unicode_empty != NULL) { 324 Py_INCREF(unicode_empty); 325 return unicode_empty; 326 } 327 328 /* Ensure we won't overflow the size. */ 329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 330 return (PyUnicodeObject *)PyErr_NoMemory(); 331 } 332 333 /* Unicode freelist & memory allocation */ 334 if (free_list) { 335 unicode = free_list; 336 free_list = *(PyUnicodeObject **)unicode; 337 numfree--; 338 if (unicode->str) { 339 /* Keep-Alive optimization: we only upsize the buffer, 340 never downsize it. 
*/ 341 if ((unicode->length < length) && 342 unicode_resize(unicode, length) < 0) { 343 PyObject_DEL(unicode->str); 344 unicode->str = NULL; 345 } 346 } 347 else { 348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 350 } 351 PyObject_INIT(unicode, &PyUnicode_Type); 352 } 353 else { 354 size_t new_size; 355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 356 if (unicode == NULL) 357 return NULL; 358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 360 } 361 362 if (!unicode->str) { 363 PyErr_NoMemory(); 364 goto onError; 365 } 366 /* Initialize the first element to guard against cases where 367 * the caller fails before initializing str -- unicode_resize() 368 * reads str[0], and the Keep-Alive optimization can keep memory 369 * allocated for str alive across a call to unicode_dealloc(unicode). 370 * We don't want unicode_resize to read uninitialized memory in 371 * that case. 
372 */ 373 unicode->str[0] = 0; 374 unicode->str[length] = 0; 375 unicode->length = length; 376 unicode->hash = -1; 377 unicode->state = 0; 378 unicode->defenc = NULL; 379 return unicode; 380 381 onError: 382 /* XXX UNREF/NEWREF interface should be more symmetrical */ 383 _Py_DEC_REFTOTAL; 384 _Py_ForgetReference((PyObject *)unicode); 385 PyObject_Del(unicode); 386 return NULL; 387} 388 389static 390void unicode_dealloc(register PyUnicodeObject *unicode) 391{ 392 switch (PyUnicode_CHECK_INTERNED(unicode)) { 393 case SSTATE_NOT_INTERNED: 394 break; 395 396 case SSTATE_INTERNED_MORTAL: 397 /* revive dead object temporarily for DelItem */ 398 Py_REFCNT(unicode) = 3; 399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 400 Py_FatalError( 401 "deletion of interned string failed"); 402 break; 403 404 case SSTATE_INTERNED_IMMORTAL: 405 Py_FatalError("Immortal interned string died."); 406 407 default: 408 Py_FatalError("Inconsistent interned string state."); 409 } 410 411 if (PyUnicode_CheckExact(unicode) && 412 numfree < PyUnicode_MAXFREELIST) { 413 /* Keep-Alive optimization */ 414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 415 PyObject_DEL(unicode->str); 416 unicode->str = NULL; 417 unicode->length = 0; 418 } 419 if (unicode->defenc) { 420 Py_CLEAR(unicode->defenc); 421 } 422 /* Add to free list */ 423 *(PyUnicodeObject **)unicode = free_list; 424 free_list = unicode; 425 numfree++; 426 } 427 else { 428 PyObject_DEL(unicode->str); 429 Py_XDECREF(unicode->defenc); 430 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 431 } 432} 433 434static 435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 436{ 437 register PyUnicodeObject *v; 438 439 /* Argument checks */ 440 if (unicode == NULL) { 441 PyErr_BadInternalCall(); 442 return -1; 443 } 444 v = *unicode; 445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 446 PyErr_BadInternalCall(); 447 return -1; 448 } 449 450 /* Resizing unicode_empty and single character objects is 
not 451 possible since these are being shared. We simply return a fresh 452 copy with the same Unicode content. */ 453 if (v->length != length && 454 (v == unicode_empty || v->length == 1)) { 455 PyUnicodeObject *w = _PyUnicode_New(length); 456 if (w == NULL) 457 return -1; 458 Py_UNICODE_COPY(w->str, v->str, 459 length < v->length ? length : v->length); 460 Py_DECREF(*unicode); 461 *unicode = w; 462 return 0; 463 } 464 465 /* Note that we don't have to modify *unicode for unshared Unicode 466 objects, since we can modify them in-place. */ 467 return unicode_resize(v, length); 468} 469 470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 471{ 472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 473} 474 475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 476 Py_ssize_t size) 477{ 478 PyUnicodeObject *unicode; 479 480 /* If the Unicode data is known at construction time, we can apply 481 some optimizations which share commonly used objects. */ 482 if (u != NULL) { 483 484 /* Optimization for empty strings */ 485 if (size == 0 && unicode_empty != NULL) { 486 Py_INCREF(unicode_empty); 487 return (PyObject *)unicode_empty; 488 } 489 490 /* Single character Unicode objects in the Latin-1 range are 491 shared when using this constructor */ 492 if (size == 1 && *u < 256) { 493 unicode = unicode_latin1[*u]; 494 if (!unicode) { 495 unicode = _PyUnicode_New(1); 496 if (!unicode) 497 return NULL; 498 unicode->str[0] = *u; 499 unicode_latin1[*u] = unicode; 500 } 501 Py_INCREF(unicode); 502 return (PyObject *)unicode; 503 } 504 } 505 506 unicode = _PyUnicode_New(size); 507 if (!unicode) 508 return NULL; 509 510 /* Copy the Unicode data into the new object */ 511 if (u != NULL) 512 Py_UNICODE_COPY(unicode->str, u, size); 513 514 return (PyObject *)unicode; 515} 516 517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 518{ 519 PyUnicodeObject *unicode; 520 521 if (size < 0) { 522 PyErr_SetString(PyExc_SystemError, 523 "Negative size 
passed to PyUnicode_FromStringAndSize"); 524 return NULL; 525 } 526 527 /* If the Unicode data is known at construction time, we can apply 528 some optimizations which share commonly used objects. 529 Also, this means the input must be UTF-8, so fall back to the 530 UTF-8 decoder at the end. */ 531 if (u != NULL) { 532 533 /* Optimization for empty strings */ 534 if (size == 0 && unicode_empty != NULL) { 535 Py_INCREF(unicode_empty); 536 return (PyObject *)unicode_empty; 537 } 538 539 /* Single characters are shared when using this constructor. 540 Restrict to ASCII, since the input must be UTF-8. */ 541 if (size == 1 && Py_CHARMASK(*u) < 128) { 542 unicode = unicode_latin1[Py_CHARMASK(*u)]; 543 if (!unicode) { 544 unicode = _PyUnicode_New(1); 545 if (!unicode) 546 return NULL; 547 unicode->str[0] = Py_CHARMASK(*u); 548 unicode_latin1[Py_CHARMASK(*u)] = unicode; 549 } 550 Py_INCREF(unicode); 551 return (PyObject *)unicode; 552 } 553 554 return PyUnicode_DecodeUTF8(u, size, NULL); 555 } 556 557 unicode = _PyUnicode_New(size); 558 if (!unicode) 559 return NULL; 560 561 return (PyObject *)unicode; 562} 563 564PyObject *PyUnicode_FromString(const char *u) 565{ 566 size_t size = strlen(u); 567 if (size > PY_SSIZE_T_MAX) { 568 PyErr_SetString(PyExc_OverflowError, "input too long"); 569 return NULL; 570 } 571 572 return PyUnicode_FromStringAndSize(u, size); 573} 574 575#ifdef HAVE_WCHAR_H 576 577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 578# define CONVERT_WCHAR_TO_SURROGATES 579#endif 580 581#ifdef CONVERT_WCHAR_TO_SURROGATES 582 583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 584 to convert from UTF32 to UTF16. 
*/ 585 586PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 587 Py_ssize_t size) 588{ 589 PyUnicodeObject *unicode; 590 register Py_ssize_t i; 591 Py_ssize_t alloc; 592 const wchar_t *orig_w; 593 594 if (w == NULL) { 595 if (size == 0) 596 return PyUnicode_FromStringAndSize(NULL, 0); 597 PyErr_BadInternalCall(); 598 return NULL; 599 } 600 601 if (size == -1) { 602 size = wcslen(w); 603 } 604 605 alloc = size; 606 orig_w = w; 607 for (i = size; i > 0; i--) { 608 if (*w > 0xFFFF) 609 alloc++; 610 w++; 611 } 612 w = orig_w; 613 unicode = _PyUnicode_New(alloc); 614 if (!unicode) 615 return NULL; 616 617 /* Copy the wchar_t data into the new object */ 618 { 619 register Py_UNICODE *u; 620 u = PyUnicode_AS_UNICODE(unicode); 621 for (i = size; i > 0; i--) { 622 if (*w > 0xFFFF) { 623 wchar_t ordinal = *w++; 624 ordinal -= 0x10000; 625 *u++ = 0xD800 | (ordinal >> 10); 626 *u++ = 0xDC00 | (ordinal & 0x3FF); 627 } 628 else 629 *u++ = *w++; 630 } 631 } 632 return (PyObject *)unicode; 633} 634 635#else 636 637PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 638 Py_ssize_t size) 639{ 640 PyUnicodeObject *unicode; 641 642 if (w == NULL) { 643 if (size == 0) 644 return PyUnicode_FromStringAndSize(NULL, 0); 645 PyErr_BadInternalCall(); 646 return NULL; 647 } 648 649 if (size == -1) { 650 size = wcslen(w); 651 } 652 653 unicode = _PyUnicode_New(size); 654 if (!unicode) 655 return NULL; 656 657 /* Copy the wchar_t data into the new object */ 658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 659 memcpy(unicode->str, w, size * sizeof(wchar_t)); 660#else 661 { 662 register Py_UNICODE *u; 663 register Py_ssize_t i; 664 u = PyUnicode_AS_UNICODE(unicode); 665 for (i = size; i > 0; i--) 666 *u++ = *w++; 667 } 668#endif 669 670 return (PyObject *)unicode; 671} 672 673#endif /* CONVERT_WCHAR_TO_SURROGATES */ 674 675#undef CONVERT_WCHAR_TO_SURROGATES 676 677static void 678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 679 int zeropad, int width, int precision, 
char c) 680{ 681 *fmt++ = '%'; 682 if (width) { 683 if (zeropad) 684 *fmt++ = '0'; 685 fmt += sprintf(fmt, "%d", width); 686 } 687 if (precision) 688 fmt += sprintf(fmt, ".%d", precision); 689 if (longflag) 690 *fmt++ = 'l'; 691 else if (longlongflag) { 692 /* longlongflag should only ever be nonzero on machines with 693 HAVE_LONG_LONG defined */ 694#ifdef HAVE_LONG_LONG 695 char *f = PY_FORMAT_LONG_LONG; 696 while (*f) 697 *fmt++ = *f++; 698#else 699 /* we shouldn't ever get here */ 700 assert(0); 701 *fmt++ = 'l'; 702#endif 703 } 704 else if (size_tflag) { 705 char *f = PY_FORMAT_SIZE_T; 706 while (*f) 707 *fmt++ = *f++; 708 } 709 *fmt++ = c; 710 *fmt = '\0'; 711} 712 713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 714 715/* size of fixed-size buffer for formatting single arguments */ 716#define ITEM_BUFFER_LEN 21 717/* maximum number of characters required for output of %ld. 21 characters 718 allows for 64-bit integers (in decimal) and an optional sign. */ 719#define MAX_LONG_CHARS 21 720/* maximum number of characters required for output of %lld. 721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 724 725PyObject * 726PyUnicode_FromFormatV(const char *format, va_list vargs) 727{ 728 va_list count; 729 Py_ssize_t callcount = 0; 730 PyObject **callresults = NULL; 731 PyObject **callresult = NULL; 732 Py_ssize_t n = 0; 733 int width = 0; 734 int precision = 0; 735 int zeropad; 736 const char* f; 737 Py_UNICODE *s; 738 PyObject *string; 739 /* used by sprintf */ 740 char buffer[ITEM_BUFFER_LEN+1]; 741 /* use abuffer instead of buffer, if we need more space 742 * (which can happen if there's a format specifier with width). 
*/ 743 char *abuffer = NULL; 744 char *realbuffer; 745 Py_ssize_t abuffersize = 0; 746 char fmt[61]; /* should be enough for %0width.precisionlld */ 747 const char *copy; 748 749 Py_VA_COPY(count, vargs); 750 /* step 1: count the number of %S/%R/%A/%s format specifications 751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 753 * result in an array) */ 754 for (f = format; *f; f++) { 755 if (*f == '%') { 756 if (*(f+1)=='%') 757 continue; 758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') 759 ++callcount; 760 while (Py_ISDIGIT((unsigned)*f)) 761 width = (width*10) + *f++ - '0'; 762 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) 763 ; 764 if (*f == 's') 765 ++callcount; 766 } 767 else if (128 <= (unsigned char)*f) { 768 PyErr_Format(PyExc_ValueError, 769 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 770 "string, got a non-ASCII byte: 0x%02x", 771 (unsigned char)*f); 772 return NULL; 773 } 774 } 775 /* step 2: allocate memory for the results of 776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 777 if (callcount) { 778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 779 if (!callresults) { 780 PyErr_NoMemory(); 781 return NULL; 782 } 783 callresult = callresults; 784 } 785 /* step 3: figure out how large a buffer we need */ 786 for (f = format; *f; f++) { 787 if (*f == '%') { 788#ifdef HAVE_LONG_LONG 789 int longlongflag = 0; 790#endif 791 const char* p = f; 792 width = 0; 793 while (Py_ISDIGIT((unsigned)*f)) 794 width = (width*10) + *f++ - '0'; 795 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) 796 ; 797 798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 799 * they don't affect the amount of space we reserve. 
800 */ 801 if (*f == 'l') { 802 if (f[1] == 'd' || f[1] == 'u') { 803 ++f; 804 } 805#ifdef HAVE_LONG_LONG 806 else if (f[1] == 'l' && 807 (f[2] == 'd' || f[2] == 'u')) { 808 longlongflag = 1; 809 f += 2; 810 } 811#endif 812 } 813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 814 ++f; 815 } 816 817 switch (*f) { 818 case 'c': 819 (void)va_arg(count, int); 820 /* fall through... */ 821 case '%': 822 n++; 823 break; 824 case 'd': case 'u': case 'i': case 'x': 825 (void) va_arg(count, int); 826#ifdef HAVE_LONG_LONG 827 if (longlongflag) { 828 if (width < MAX_LONG_LONG_CHARS) 829 width = MAX_LONG_LONG_CHARS; 830 } 831 else 832#endif 833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 834 including sign. Decimal takes the most space. This 835 isn't enough for octal. If a width is specified we 836 need more (which we allocate later). */ 837 if (width < MAX_LONG_CHARS) 838 width = MAX_LONG_CHARS; 839 n += width; 840 /* XXX should allow for large precision here too. */ 841 if (abuffersize < width) 842 abuffersize = width; 843 break; 844 case 's': 845 { 846 /* UTF-8 */ 847 const char *s = va_arg(count, const char*); 848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 849 if (!str) 850 goto fail; 851 n += PyUnicode_GET_SIZE(str); 852 /* Remember the str and switch to the next slot */ 853 *callresult++ = str; 854 break; 855 } 856 case 'U': 857 { 858 PyObject *obj = va_arg(count, PyObject *); 859 assert(obj && PyUnicode_Check(obj)); 860 n += PyUnicode_GET_SIZE(obj); 861 break; 862 } 863 case 'V': 864 { 865 PyObject *obj = va_arg(count, PyObject *); 866 const char *str = va_arg(count, const char *); 867 assert(obj || str); 868 assert(!obj || PyUnicode_Check(obj)); 869 if (obj) 870 n += PyUnicode_GET_SIZE(obj); 871 else 872 n += strlen(str); 873 break; 874 } 875 case 'S': 876 { 877 PyObject *obj = va_arg(count, PyObject *); 878 PyObject *str; 879 assert(obj); 880 str = PyObject_Str(obj); 881 if (!str) 882 goto fail; 883 n += PyUnicode_GET_SIZE(str); 
884 /* Remember the str and switch to the next slot */ 885 *callresult++ = str; 886 break; 887 } 888 case 'R': 889 { 890 PyObject *obj = va_arg(count, PyObject *); 891 PyObject *repr; 892 assert(obj); 893 repr = PyObject_Repr(obj); 894 if (!repr) 895 goto fail; 896 n += PyUnicode_GET_SIZE(repr); 897 /* Remember the repr and switch to the next slot */ 898 *callresult++ = repr; 899 break; 900 } 901 case 'A': 902 { 903 PyObject *obj = va_arg(count, PyObject *); 904 PyObject *ascii; 905 assert(obj); 906 ascii = PyObject_ASCII(obj); 907 if (!ascii) 908 goto fail; 909 n += PyUnicode_GET_SIZE(ascii); 910 /* Remember the repr and switch to the next slot */ 911 *callresult++ = ascii; 912 break; 913 } 914 case 'p': 915 (void) va_arg(count, int); 916 /* maximum 64-bit pointer representation: 917 * 0xffffffffffffffff 918 * so 19 characters is enough. 919 * XXX I count 18 -- what's the extra for? 920 */ 921 n += 19; 922 break; 923 default: 924 /* if we stumble upon an unknown 925 formatting code, copy the rest of 926 the format string to the output 927 string. (we cannot just skip the 928 code, since there's no way to know 929 what's in the argument list) */ 930 n += strlen(p); 931 goto expand; 932 } 933 } else 934 n++; 935 } 936 expand: 937 if (abuffersize > ITEM_BUFFER_LEN) { 938 /* add 1 for sprintf's trailing null byte */ 939 abuffer = PyObject_Malloc(abuffersize + 1); 940 if (!abuffer) { 941 PyErr_NoMemory(); 942 goto fail; 943 } 944 realbuffer = abuffer; 945 } 946 else 947 realbuffer = buffer; 948 /* step 4: fill the buffer */ 949 /* Since we've analyzed how much space we need for the worst case, 950 we don't have to resize the string. 951 There can be no errors beyond this point. 
*/ 952 string = PyUnicode_FromUnicode(NULL, n); 953 if (!string) 954 goto fail; 955 956 s = PyUnicode_AS_UNICODE(string); 957 callresult = callresults; 958 959 for (f = format; *f; f++) { 960 if (*f == '%') { 961 const char* p = f++; 962 int longflag = 0; 963 int longlongflag = 0; 964 int size_tflag = 0; 965 zeropad = (*f == '0'); 966 /* parse the width.precision part */ 967 width = 0; 968 while (Py_ISDIGIT((unsigned)*f)) 969 width = (width*10) + *f++ - '0'; 970 precision = 0; 971 if (*f == '.') { 972 f++; 973 while (Py_ISDIGIT((unsigned)*f)) 974 precision = (precision*10) + *f++ - '0'; 975 } 976 /* Handle %ld, %lu, %lld and %llu. */ 977 if (*f == 'l') { 978 if (f[1] == 'd' || f[1] == 'u') { 979 longflag = 1; 980 ++f; 981 } 982#ifdef HAVE_LONG_LONG 983 else if (f[1] == 'l' && 984 (f[2] == 'd' || f[2] == 'u')) { 985 longlongflag = 1; 986 f += 2; 987 } 988#endif 989 } 990 /* handle the size_t flag. */ 991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 992 size_tflag = 1; 993 ++f; 994 } 995 996 switch (*f) { 997 case 'c': 998 *s++ = va_arg(vargs, int); 999 break; 1000 case 'd': 1001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1002 width, precision, 'd'); 1003 if (longflag) 1004 sprintf(realbuffer, fmt, va_arg(vargs, long)); 1005#ifdef HAVE_LONG_LONG 1006 else if (longlongflag) 1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1008#endif 1009 else if (size_tflag) 1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1011 else 1012 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1013 appendstring(realbuffer); 1014 break; 1015 case 'u': 1016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1017 width, precision, 'u'); 1018 if (longflag) 1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1020#ifdef HAVE_LONG_LONG 1021 else if (longlongflag) 1022 sprintf(realbuffer, fmt, va_arg(vargs, 1023 unsigned PY_LONG_LONG)); 1024#endif 1025 else if (size_tflag) 1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1027 else 1028 
sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1029 appendstring(realbuffer); 1030 break; 1031 case 'i': 1032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i'); 1033 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1034 appendstring(realbuffer); 1035 break; 1036 case 'x': 1037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1038 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1039 appendstring(realbuffer); 1040 break; 1041 case 's': 1042 { 1043 /* unused, since we already have the result */ 1044 (void) va_arg(vargs, char *); 1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1046 PyUnicode_GET_SIZE(*callresult)); 1047 s += PyUnicode_GET_SIZE(*callresult); 1048 /* We're done with the unicode()/repr() => forget it */ 1049 Py_DECREF(*callresult); 1050 /* switch to next unicode()/repr() result */ 1051 ++callresult; 1052 break; 1053 } 1054 case 'U': 1055 { 1056 PyObject *obj = va_arg(vargs, PyObject *); 1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1059 s += size; 1060 break; 1061 } 1062 case 'V': 1063 { 1064 PyObject *obj = va_arg(vargs, PyObject *); 1065 const char *str = va_arg(vargs, const char *); 1066 if (obj) { 1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1069 s += size; 1070 } else { 1071 appendstring(str); 1072 } 1073 break; 1074 } 1075 case 'S': 1076 case 'R': 1077 case 'A': 1078 { 1079 Py_UNICODE *ucopy; 1080 Py_ssize_t usize; 1081 Py_ssize_t upos; 1082 /* unused, since we already have the result */ 1083 (void) va_arg(vargs, PyObject *); 1084 ucopy = PyUnicode_AS_UNICODE(*callresult); 1085 usize = PyUnicode_GET_SIZE(*callresult); 1086 for (upos = 0; upos<usize;) 1087 *s++ = ucopy[upos++]; 1088 /* We're done with the unicode()/repr() => forget it */ 1089 Py_DECREF(*callresult); 1090 /* switch to next unicode()/repr() result */ 1091 ++callresult; 1092 break; 1093 } 1094 case 'p': 1095 sprintf(buffer, "%p", va_arg(vargs, 
void*)); 1096 /* %p is ill-defined: ensure leading 0x. */ 1097 if (buffer[1] == 'X') 1098 buffer[1] = 'x'; 1099 else if (buffer[1] != 'x') { 1100 memmove(buffer+2, buffer, strlen(buffer)+1); 1101 buffer[0] = '0'; 1102 buffer[1] = 'x'; 1103 } 1104 appendstring(buffer); 1105 break; 1106 case '%': 1107 *s++ = '%'; 1108 break; 1109 default: 1110 appendstring(p); 1111 goto end; 1112 } 1113 } 1114 else 1115 *s++ = *f; 1116 } 1117 1118 end: 1119 if (callresults) 1120 PyObject_Free(callresults); 1121 if (abuffer) 1122 PyObject_Free(abuffer); 1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1124 return string; 1125 fail: 1126 if (callresults) { 1127 PyObject **callresult2 = callresults; 1128 while (callresult2 < callresult) { 1129 Py_DECREF(*callresult2); 1130 ++callresult2; 1131 } 1132 PyObject_Free(callresults); 1133 } 1134 if (abuffer) 1135 PyObject_Free(abuffer); 1136 return NULL; 1137} 1138 1139#undef appendstring 1140 1141PyObject * 1142PyUnicode_FromFormat(const char *format, ...) 1143{ 1144 PyObject* ret; 1145 va_list vargs; 1146 1147#ifdef HAVE_STDARG_PROTOTYPES 1148 va_start(vargs, format); 1149#else 1150 va_start(vargs); 1151#endif 1152 ret = PyUnicode_FromFormatV(format, vargs); 1153 va_end(vargs); 1154 return ret; 1155} 1156 1157/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1158 convert a Unicode object to a wide character string. 1159 1160 - If w is NULL: return the number of wide characters (including the nul 1161 character) required to convert the unicode object. Ignore size argument. 1162 1163 - Otherwise: return the number of wide characters (excluding the nul 1164 character) written into w. Write at most size wide characters (including 1165 the nul character). 
*/ 1166static Py_ssize_t 1167unicode_aswidechar(PyUnicodeObject *unicode, 1168 wchar_t *w, 1169 Py_ssize_t size) 1170{ 1171#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1172 Py_ssize_t res; 1173 if (w != NULL) { 1174 res = PyUnicode_GET_SIZE(unicode); 1175 if (size > res) 1176 size = res + 1; 1177 else 1178 res = size; 1179 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1180 return res; 1181 } 1182 else 1183 return PyUnicode_GET_SIZE(unicode) + 1; 1184#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 1185 register const Py_UNICODE *u; 1186 const Py_UNICODE *uend; 1187 const wchar_t *worig, *wend; 1188 Py_ssize_t nchar; 1189 1190 u = PyUnicode_AS_UNICODE(unicode); 1191 uend = u + PyUnicode_GET_SIZE(unicode); 1192 if (w != NULL) { 1193 worig = w; 1194 wend = w + size; 1195 while (u != uend && w != wend) { 1196 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1197 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1198 { 1199 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; 1200 u += 2; 1201 } 1202 else { 1203 *w = *u; 1204 u++; 1205 } 1206 w++; 1207 } 1208 if (w != wend) 1209 *w = L'\0'; 1210 return w - worig; 1211 } 1212 else { 1213 nchar = 1; /* nul character at the end */ 1214 while (u != uend) { 1215 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1216 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1217 u += 2; 1218 else 1219 u++; 1220 nchar++; 1221 } 1222 } 1223 return nchar; 1224#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 1225 register Py_UNICODE *u, *uend, ordinal; 1226 register Py_ssize_t i; 1227 wchar_t *worig, *wend; 1228 Py_ssize_t nchar; 1229 1230 u = PyUnicode_AS_UNICODE(unicode); 1231 uend = u + PyUnicode_GET_SIZE(u); 1232 if (w != NULL) { 1233 worig = w; 1234 wend = w + size; 1235 while (u != uend && w != wend) { 1236 ordinal = *u; 1237 if (ordinal > 0xffff) { 1238 ordinal -= 0x10000; 1239 *w++ = 0xD800 | (ordinal >> 10); 1240 *w++ = 0xDC00 | (ordinal & 0x3FF); 1241 } 1242 else 1243 *w++ = ordinal; 1244 u++; 1245 } 1246 if (w != wend) 1247 *w = 0; 1248 return w - worig; 1249 } 1250 else { 
1251 nchar = 1; /* nul character */ 1252 while (u != uend) { 1253 if (*u > 0xffff) 1254 nchar += 2; 1255 else 1256 nchar++; 1257 u++; 1258 } 1259 return nchar; 1260 } 1261#else 1262# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" 1263#endif 1264} 1265 1266Py_ssize_t 1267PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1268 wchar_t *w, 1269 Py_ssize_t size) 1270{ 1271 if (unicode == NULL) { 1272 PyErr_BadInternalCall(); 1273 return -1; 1274 } 1275 return unicode_aswidechar(unicode, w, size); 1276} 1277 1278wchar_t* 1279PyUnicode_AsWideCharString(PyObject *unicode, 1280 Py_ssize_t *size) 1281{ 1282 wchar_t* buffer; 1283 Py_ssize_t buflen; 1284 1285 if (unicode == NULL) { 1286 PyErr_BadInternalCall(); 1287 return NULL; 1288 } 1289 1290 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 1291 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 1292 PyErr_NoMemory(); 1293 return NULL; 1294 } 1295 1296 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 1297 if (buffer == NULL) { 1298 PyErr_NoMemory(); 1299 return NULL; 1300 } 1301 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 1302 if (size != NULL) 1303 *size = buflen; 1304 return buffer; 1305} 1306 1307#endif 1308 1309PyObject *PyUnicode_FromOrdinal(int ordinal) 1310{ 1311 Py_UNICODE s[2]; 1312 1313 if (ordinal < 0 || ordinal > 0x10ffff) { 1314 PyErr_SetString(PyExc_ValueError, 1315 "chr() arg not in range(0x110000)"); 1316 return NULL; 1317 } 1318 1319#ifndef Py_UNICODE_WIDE 1320 if (ordinal > 0xffff) { 1321 ordinal -= 0x10000; 1322 s[0] = 0xD800 | (ordinal >> 10); 1323 s[1] = 0xDC00 | (ordinal & 0x3FF); 1324 return PyUnicode_FromUnicode(s, 2); 1325 } 1326#endif 1327 1328 s[0] = (Py_UNICODE)ordinal; 1329 return PyUnicode_FromUnicode(s, 1); 1330} 1331 1332PyObject *PyUnicode_FromObject(register PyObject *obj) 1333{ 1334 /* XXX Perhaps we should make this API an alias of 1335 PyObject_Str() instead ?! 
*/ 1336 if (PyUnicode_CheckExact(obj)) { 1337 Py_INCREF(obj); 1338 return obj; 1339 } 1340 if (PyUnicode_Check(obj)) { 1341 /* For a Unicode subtype that's not a Unicode object, 1342 return a true Unicode object with the same data. */ 1343 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1344 PyUnicode_GET_SIZE(obj)); 1345 } 1346 PyErr_Format(PyExc_TypeError, 1347 "Can't convert '%.100s' object to str implicitly", 1348 Py_TYPE(obj)->tp_name); 1349 return NULL; 1350} 1351 1352PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1353 const char *encoding, 1354 const char *errors) 1355{ 1356 Py_buffer buffer; 1357 PyObject *v; 1358 1359 if (obj == NULL) { 1360 PyErr_BadInternalCall(); 1361 return NULL; 1362 } 1363 1364 /* Decoding bytes objects is the most common case and should be fast */ 1365 if (PyBytes_Check(obj)) { 1366 if (PyBytes_GET_SIZE(obj) == 0) { 1367 Py_INCREF(unicode_empty); 1368 v = (PyObject *) unicode_empty; 1369 } 1370 else { 1371 v = PyUnicode_Decode( 1372 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1373 encoding, errors); 1374 } 1375 return v; 1376 } 1377 1378 if (PyUnicode_Check(obj)) { 1379 PyErr_SetString(PyExc_TypeError, 1380 "decoding str is not supported"); 1381 return NULL; 1382 } 1383 1384 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1385 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1386 PyErr_Format(PyExc_TypeError, 1387 "coercing to str: need bytes, bytearray " 1388 "or buffer-like object, %.80s found", 1389 Py_TYPE(obj)->tp_name); 1390 return NULL; 1391 } 1392 1393 if (buffer.len == 0) { 1394 Py_INCREF(unicode_empty); 1395 v = (PyObject *) unicode_empty; 1396 } 1397 else 1398 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1399 1400 PyBuffer_Release(&buffer); 1401 return v; 1402} 1403 1404/* Convert encoding to lower case and replace '_' with '-' in order to 1405 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 1406 1 on success. 
*/ 1407static int 1408normalize_encoding(const char *encoding, 1409 char *lower, 1410 size_t lower_len) 1411{ 1412 const char *e; 1413 char *l; 1414 char *l_end; 1415 1416 e = encoding; 1417 l = lower; 1418 l_end = &lower[lower_len - 1]; 1419 while (*e) { 1420 if (l == l_end) 1421 return 0; 1422 if (Py_ISUPPER(*e)) { 1423 *l++ = Py_TOLOWER(*e++); 1424 } 1425 else if (*e == '_') { 1426 *l++ = '-'; 1427 e++; 1428 } 1429 else { 1430 *l++ = *e++; 1431 } 1432 } 1433 *l = '\0'; 1434 return 1; 1435} 1436 1437PyObject *PyUnicode_Decode(const char *s, 1438 Py_ssize_t size, 1439 const char *encoding, 1440 const char *errors) 1441{ 1442 PyObject *buffer = NULL, *unicode; 1443 Py_buffer info; 1444 char lower[11]; /* Enough for any encoding shortcut */ 1445 1446 if (encoding == NULL) 1447 encoding = PyUnicode_GetDefaultEncoding(); 1448 1449 /* Shortcuts for common default encodings */ 1450 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1451 if (strcmp(lower, "utf-8") == 0) 1452 return PyUnicode_DecodeUTF8(s, size, errors); 1453 else if ((strcmp(lower, "latin-1") == 0) || 1454 (strcmp(lower, "iso-8859-1") == 0)) 1455 return PyUnicode_DecodeLatin1(s, size, errors); 1456#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1457 else if (strcmp(lower, "mbcs") == 0) 1458 return PyUnicode_DecodeMBCS(s, size, errors); 1459#endif 1460 else if (strcmp(lower, "ascii") == 0) 1461 return PyUnicode_DecodeASCII(s, size, errors); 1462 else if (strcmp(lower, "utf-16") == 0) 1463 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1464 else if (strcmp(lower, "utf-32") == 0) 1465 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1466 } 1467 1468 /* Decode via the codec registry */ 1469 buffer = NULL; 1470 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1471 goto onError; 1472 buffer = PyMemoryView_FromBuffer(&info); 1473 if (buffer == NULL) 1474 goto onError; 1475 unicode = PyCodec_Decode(buffer, encoding, errors); 1476 if (unicode == NULL) 1477 goto 
onError; 1478 if (!PyUnicode_Check(unicode)) { 1479 PyErr_Format(PyExc_TypeError, 1480 "decoder did not return a str object (type=%.400s)", 1481 Py_TYPE(unicode)->tp_name); 1482 Py_DECREF(unicode); 1483 goto onError; 1484 } 1485 Py_DECREF(buffer); 1486 return unicode; 1487 1488 onError: 1489 Py_XDECREF(buffer); 1490 return NULL; 1491} 1492 1493PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1494 const char *encoding, 1495 const char *errors) 1496{ 1497 PyObject *v; 1498 1499 if (!PyUnicode_Check(unicode)) { 1500 PyErr_BadArgument(); 1501 goto onError; 1502 } 1503 1504 if (encoding == NULL) 1505 encoding = PyUnicode_GetDefaultEncoding(); 1506 1507 /* Decode via the codec registry */ 1508 v = PyCodec_Decode(unicode, encoding, errors); 1509 if (v == NULL) 1510 goto onError; 1511 return v; 1512 1513 onError: 1514 return NULL; 1515} 1516 1517PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1518 const char *encoding, 1519 const char *errors) 1520{ 1521 PyObject *v; 1522 1523 if (!PyUnicode_Check(unicode)) { 1524 PyErr_BadArgument(); 1525 goto onError; 1526 } 1527 1528 if (encoding == NULL) 1529 encoding = PyUnicode_GetDefaultEncoding(); 1530 1531 /* Decode via the codec registry */ 1532 v = PyCodec_Decode(unicode, encoding, errors); 1533 if (v == NULL) 1534 goto onError; 1535 if (!PyUnicode_Check(v)) { 1536 PyErr_Format(PyExc_TypeError, 1537 "decoder did not return a str object (type=%.400s)", 1538 Py_TYPE(v)->tp_name); 1539 Py_DECREF(v); 1540 goto onError; 1541 } 1542 return v; 1543 1544 onError: 1545 return NULL; 1546} 1547 1548PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1549 Py_ssize_t size, 1550 const char *encoding, 1551 const char *errors) 1552{ 1553 PyObject *v, *unicode; 1554 1555 unicode = PyUnicode_FromUnicode(s, size); 1556 if (unicode == NULL) 1557 return NULL; 1558 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1559 Py_DECREF(unicode); 1560 return v; 1561} 1562 1563PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1564 const 
char *encoding, 1565 const char *errors) 1566{ 1567 PyObject *v; 1568 1569 if (!PyUnicode_Check(unicode)) { 1570 PyErr_BadArgument(); 1571 goto onError; 1572 } 1573 1574 if (encoding == NULL) 1575 encoding = PyUnicode_GetDefaultEncoding(); 1576 1577 /* Encode via the codec registry */ 1578 v = PyCodec_Encode(unicode, encoding, errors); 1579 if (v == NULL) 1580 goto onError; 1581 return v; 1582 1583 onError: 1584 return NULL; 1585} 1586 1587PyObject * 1588PyUnicode_EncodeFSDefault(PyObject *unicode) 1589{ 1590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1591 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1592 PyUnicode_GET_SIZE(unicode), 1593 NULL); 1594#elif defined(__APPLE__) 1595 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1596 PyUnicode_GET_SIZE(unicode), 1597 "surrogateescape"); 1598#else 1599 if (Py_FileSystemDefaultEncoding) { 1600 return PyUnicode_AsEncodedString(unicode, 1601 Py_FileSystemDefaultEncoding, 1602 "surrogateescape"); 1603 } 1604 else { 1605 /* locale encoding with surrogateescape */ 1606 wchar_t *wchar; 1607 char *bytes; 1608 PyObject *bytes_obj; 1609 1610 wchar = PyUnicode_AsWideCharString(unicode, NULL); 1611 if (wchar == NULL) 1612 return NULL; 1613 bytes = _Py_wchar2char(wchar); 1614 PyMem_Free(wchar); 1615 if (bytes == NULL) 1616 return NULL; 1617 1618 bytes_obj = PyBytes_FromString(bytes); 1619 PyMem_Free(bytes); 1620 return bytes_obj; 1621 } 1622#endif 1623} 1624 1625PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1626 const char *encoding, 1627 const char *errors) 1628{ 1629 PyObject *v; 1630 char lower[11]; /* Enough for any encoding shortcut */ 1631 1632 if (!PyUnicode_Check(unicode)) { 1633 PyErr_BadArgument(); 1634 return NULL; 1635 } 1636 1637 if (encoding == NULL) 1638 encoding = PyUnicode_GetDefaultEncoding(); 1639 1640 /* Shortcuts for common default encodings */ 1641 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1642 if (strcmp(lower, "utf-8") == 0) 1643 return 
PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1644 PyUnicode_GET_SIZE(unicode), 1645 errors); 1646 else if ((strcmp(lower, "latin-1") == 0) || 1647 (strcmp(lower, "iso-8859-1") == 0)) 1648 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1649 PyUnicode_GET_SIZE(unicode), 1650 errors); 1651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1652 else if (strcmp(lower, "mbcs") == 0) 1653 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1654 PyUnicode_GET_SIZE(unicode), 1655 errors); 1656#endif 1657 else if (strcmp(lower, "ascii") == 0) 1658 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1659 PyUnicode_GET_SIZE(unicode), 1660 errors); 1661 } 1662 /* During bootstrap, we may need to find the encodings 1663 package, to load the file system encoding, and require the 1664 file system encoding in order to load the encodings 1665 package. 1666 1667 Break out of this dependency by assuming that the path to 1668 the encodings module is ASCII-only. XXX could try wcstombs 1669 instead, if the file system encoding is the locale's 1670 encoding. 
*/ 1671 if (Py_FileSystemDefaultEncoding && 1672 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && 1673 !PyThreadState_GET()->interp->codecs_initialized) 1674 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1675 PyUnicode_GET_SIZE(unicode), 1676 errors); 1677 1678 /* Encode via the codec registry */ 1679 v = PyCodec_Encode(unicode, encoding, errors); 1680 if (v == NULL) 1681 return NULL; 1682 1683 /* The normal path */ 1684 if (PyBytes_Check(v)) 1685 return v; 1686 1687 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1688 if (PyByteArray_Check(v)) { 1689 int error; 1690 PyObject *b; 1691 1692 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1693 "encoder %s returned bytearray instead of bytes", 1694 encoding); 1695 if (error) { 1696 Py_DECREF(v); 1697 return NULL; 1698 } 1699 1700 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1701 Py_DECREF(v); 1702 return b; 1703 } 1704 1705 PyErr_Format(PyExc_TypeError, 1706 "encoder did not return a bytes object (type=%.400s)", 1707 Py_TYPE(v)->tp_name); 1708 Py_DECREF(v); 1709 return NULL; 1710} 1711 1712PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1713 const char *encoding, 1714 const char *errors) 1715{ 1716 PyObject *v; 1717 1718 if (!PyUnicode_Check(unicode)) { 1719 PyErr_BadArgument(); 1720 goto onError; 1721 } 1722 1723 if (encoding == NULL) 1724 encoding = PyUnicode_GetDefaultEncoding(); 1725 1726 /* Encode via the codec registry */ 1727 v = PyCodec_Encode(unicode, encoding, errors); 1728 if (v == NULL) 1729 goto onError; 1730 if (!PyUnicode_Check(v)) { 1731 PyErr_Format(PyExc_TypeError, 1732 "encoder did not return an str object (type=%.400s)", 1733 Py_TYPE(v)->tp_name); 1734 Py_DECREF(v); 1735 goto onError; 1736 } 1737 return v; 1738 1739 onError: 1740 return NULL; 1741} 1742 1743PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1744 const char *errors) 1745{ 1746 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1747 if (v) 
1748 return v; 1749 if (errors != NULL) 1750 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1751 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1752 PyUnicode_GET_SIZE(unicode), 1753 NULL); 1754 if (!v) 1755 return NULL; 1756 ((PyUnicodeObject *)unicode)->defenc = v; 1757 return v; 1758} 1759 1760PyObject* 1761PyUnicode_DecodeFSDefault(const char *s) { 1762 Py_ssize_t size = (Py_ssize_t)strlen(s); 1763 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1764} 1765 1766PyObject* 1767PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1768{ 1769#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1770 return PyUnicode_DecodeMBCS(s, size, NULL); 1771#elif defined(__APPLE__) 1772 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1773#else 1774 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1775 can be undefined. If it is case, decode using UTF-8. The following assumes 1776 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1777 bootstrapping process where the codecs aren't ready yet. 
1778 */ 1779 if (Py_FileSystemDefaultEncoding) { 1780 return PyUnicode_Decode(s, size, 1781 Py_FileSystemDefaultEncoding, 1782 "surrogateescape"); 1783 } 1784 else { 1785 /* locale encoding with surrogateescape */ 1786 wchar_t *wchar; 1787 PyObject *unicode; 1788 size_t len; 1789 1790 if (s[size] != '\0' || size != strlen(s)) { 1791 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1792 return NULL; 1793 } 1794 1795 wchar = _Py_char2wchar(s, &len); 1796 if (wchar == NULL) 1797 return NULL; 1798 1799 unicode = PyUnicode_FromWideChar(wchar, len); 1800 PyMem_Free(wchar); 1801 return unicode; 1802 } 1803#endif 1804} 1805 1806 1807int 1808PyUnicode_FSConverter(PyObject* arg, void* addr) 1809{ 1810 PyObject *output = NULL; 1811 Py_ssize_t size; 1812 void *data; 1813 if (arg == NULL) { 1814 Py_DECREF(*(PyObject**)addr); 1815 return 1; 1816 } 1817 if (PyBytes_Check(arg)) { 1818 output = arg; 1819 Py_INCREF(output); 1820 } 1821 else { 1822 arg = PyUnicode_FromObject(arg); 1823 if (!arg) 1824 return 0; 1825 output = PyUnicode_EncodeFSDefault(arg); 1826 Py_DECREF(arg); 1827 if (!output) 1828 return 0; 1829 if (!PyBytes_Check(output)) { 1830 Py_DECREF(output); 1831 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1832 return 0; 1833 } 1834 } 1835 size = PyBytes_GET_SIZE(output); 1836 data = PyBytes_AS_STRING(output); 1837 if (size != strlen(data)) { 1838 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1839 Py_DECREF(output); 1840 return 0; 1841 } 1842 *(PyObject**)addr = output; 1843 return Py_CLEANUP_SUPPORTED; 1844} 1845 1846 1847int 1848PyUnicode_FSDecoder(PyObject* arg, void* addr) 1849{ 1850 PyObject *output = NULL; 1851 Py_ssize_t size; 1852 void *data; 1853 if (arg == NULL) { 1854 Py_DECREF(*(PyObject**)addr); 1855 return 1; 1856 } 1857 if (PyUnicode_Check(arg)) { 1858 output = arg; 1859 Py_INCREF(output); 1860 } 1861 else { 1862 arg = PyBytes_FromObject(arg); 1863 if (!arg) 1864 return 0; 1865 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1866 PyBytes_GET_SIZE(arg)); 1867 Py_DECREF(arg); 1868 if (!output) 1869 return 0; 1870 if (!PyUnicode_Check(output)) { 1871 Py_DECREF(output); 1872 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1873 return 0; 1874 } 1875 } 1876 size = PyUnicode_GET_SIZE(output); 1877 data = PyUnicode_AS_UNICODE(output); 1878 if (size != Py_UNICODE_strlen(data)) { 1879 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1880 Py_DECREF(output); 1881 return 0; 1882 } 1883 *(PyObject**)addr = output; 1884 return Py_CLEANUP_SUPPORTED; 1885} 1886 1887 1888char* 1889_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1890{ 1891 PyObject *bytes; 1892 if (!PyUnicode_Check(unicode)) { 1893 PyErr_BadArgument(); 1894 return NULL; 1895 } 1896 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1897 if (bytes == NULL) 1898 return NULL; 1899 if (psize != NULL) 1900 *psize = PyBytes_GET_SIZE(bytes); 1901 return PyBytes_AS_STRING(bytes); 1902} 1903 1904char* 1905_PyUnicode_AsString(PyObject *unicode) 1906{ 1907 return _PyUnicode_AsStringAndSize(unicode, NULL); 1908} 1909 1910Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1911{ 1912 if (!PyUnicode_Check(unicode)) { 1913 PyErr_BadArgument(); 1914 goto onError; 1915 } 1916 return PyUnicode_AS_UNICODE(unicode); 1917 1918 onError: 1919 return NULL; 1920} 1921 1922Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1923{ 1924 if (!PyUnicode_Check(unicode)) { 1925 PyErr_BadArgument(); 1926 goto onError; 1927 } 1928 return PyUnicode_GET_SIZE(unicode); 1929 1930 onError: 1931 return -1; 1932} 1933 1934const char *PyUnicode_GetDefaultEncoding(void) 1935{ 1936 return "utf-8"; 1937} 1938 1939/* create or adjust a UnicodeDecodeError */ 1940static void 1941make_decode_exception(PyObject **exceptionObject, 1942 const char *encoding, 1943 const char *input, Py_ssize_t length, 1944 Py_ssize_t startpos, Py_ssize_t endpos, 1945 const char *reason) 1946{ 1947 
if (*exceptionObject == NULL) { 1948 *exceptionObject = PyUnicodeDecodeError_Create( 1949 encoding, input, length, startpos, endpos, reason); 1950 } 1951 else { 1952 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 1953 goto onError; 1954 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 1955 goto onError; 1956 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1957 goto onError; 1958 } 1959 return; 1960 1961onError: 1962 Py_DECREF(*exceptionObject); 1963 *exceptionObject = NULL; 1964} 1965 1966/* error handling callback helper: 1967 build arguments, call the callback and check the arguments, 1968 if no exception occurred, copy the replacement to the output 1969 and adjust various state variables. 1970 return 0 on success, -1 on error 1971*/ 1972 1973static 1974int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1975 const char *encoding, const char *reason, 1976 const char **input, const char **inend, Py_ssize_t *startinpos, 1977 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1978 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1979{ 1980 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 1981 1982 PyObject *restuple = NULL; 1983 PyObject *repunicode = NULL; 1984 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1985 Py_ssize_t insize; 1986 Py_ssize_t requiredsize; 1987 Py_ssize_t newpos; 1988 Py_UNICODE *repptr; 1989 PyObject *inputobj = NULL; 1990 Py_ssize_t repsize; 1991 int res = -1; 1992 1993 if (*errorHandler == NULL) { 1994 *errorHandler = PyCodec_LookupError(errors); 1995 if (*errorHandler == NULL) 1996 goto onError; 1997 } 1998 1999 make_decode_exception(exceptionObject, 2000 encoding, 2001 *input, *inend - *input, 2002 *startinpos, *endinpos, 2003 reason); 2004 if (*exceptionObject == NULL) 2005 goto onError; 2006 2007 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 2008 if (restuple == 
NULL) 2009 goto onError; 2010 if (!PyTuple_Check(restuple)) { 2011 PyErr_SetString(PyExc_TypeError, &argparse[4]); 2012 goto onError; 2013 } 2014 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 2015 goto onError; 2016 2017 /* Copy back the bytes variables, which might have been modified by the 2018 callback */ 2019 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 2020 if (!inputobj) 2021 goto onError; 2022 if (!PyBytes_Check(inputobj)) { 2023 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 2024 } 2025 *input = PyBytes_AS_STRING(inputobj); 2026 insize = PyBytes_GET_SIZE(inputobj); 2027 *inend = *input + insize; 2028 /* we can DECREF safely, as the exception has another reference, 2029 so the object won't go away. */ 2030 Py_DECREF(inputobj); 2031 2032 if (newpos<0) 2033 newpos = insize+newpos; 2034 if (newpos<0 || newpos>insize) { 2035 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 2036 goto onError; 2037 } 2038 2039 /* need more space? (at least enough for what we 2040 have+the replacement+the rest of the string (starting 2041 at the new input position), so we won't have to check space 2042 when there are no errors in the rest of the string) */ 2043 repptr = PyUnicode_AS_UNICODE(repunicode); 2044 repsize = PyUnicode_GET_SIZE(repunicode); 2045 requiredsize = *outpos + repsize + insize-newpos; 2046 if (requiredsize > outsize) { 2047 if (requiredsize<2*outsize) 2048 requiredsize = 2*outsize; 2049 if (_PyUnicode_Resize(output, requiredsize) < 0) 2050 goto onError; 2051 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 2052 } 2053 *endinpos = newpos; 2054 *inptr = *input + newpos; 2055 Py_UNICODE_COPY(*outptr, repptr, repsize); 2056 *outptr += repsize; 2057 *outpos += repsize; 2058 2059 /* we made it! 
*/ 2060 res = 0; 2061 2062 onError: 2063 Py_XDECREF(restuple); 2064 return res; 2065} 2066 2067/* --- UTF-7 Codec -------------------------------------------------------- */ 2068 2069/* See RFC2152 for details. We encode conservatively and decode liberally. */ 2070 2071/* Three simple macros defining base-64. */ 2072 2073/* Is c a base-64 character? */ 2074 2075#define IS_BASE64(c) \ 2076 (((c) >= 'A' && (c) <= 'Z') || \ 2077 ((c) >= 'a' && (c) <= 'z') || \ 2078 ((c) >= '0' && (c) <= '9') || \ 2079 (c) == '+' || (c) == '/') 2080 2081/* given that c is a base-64 character, what is its base-64 value? */ 2082 2083#define FROM_BASE64(c) \ 2084 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 2085 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 2086 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 2087 (c) == '+' ? 62 : 63) 2088 2089/* What is the base-64 character of the bottom 6 bits of n? */ 2090 2091#define TO_BASE64(n) \ 2092 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 2093 2094/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 2095 * decoded as itself. We are permissive on decoding; the only ASCII 2096 * byte not decoding to itself is the + which begins a base64 2097 * string. */ 2098 2099#define DECODE_DIRECT(c) \ 2100 ((c) <= 127 && (c) != '+') 2101 2102/* The UTF-7 encoder treats ASCII characters differently according to 2103 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 2104 * the above). See RFC2152. This array identifies these different 2105 * sets: 2106 * 0 : "Set D" 2107 * alphanumeric and '(),-./:? 2108 * 1 : "Set O" 2109 * !"#$%&*;<=>@[]^_`{|} 2110 * 2 : "whitespace" 2111 * ht nl cr sp 2112 * 3 : special (must be base64 encoded) 2113 * everything else (i.e. 
+\~ and non-printing codes 0-8 11-12 14-31 127) 2114 */ 2115 2116static 2117char utf7_category[128] = { 2118/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 2119 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2120/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 2121 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2122/* sp ! " # $ % & ' ( ) * + , - . / */ 2123 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 2124/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 2125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2126/* @ A B C D E F G H I J K L M N O */ 2127 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2128/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 2129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 2130/* ` a b c d e f g h i j k l m n o */ 2131 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2132/* p q r s t u v w x y z { | } ~ del */ 2133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 2134}; 2135 2136/* ENCODE_DIRECT: this character should be encoded as itself. The 2137 * answer depends on whether we are encoding set O as itself, and also 2138 * on whether we are encoding whitespace as itself. RFC2152 makes it 2139 * clear that the answers to these questions vary between 2140 * applications, so this code needs to be flexible. */ 2141 2142#define ENCODE_DIRECT(c, directO, directWS) \ 2143 ((c) < 128 && (c) > 0 && \ 2144 ((utf7_category[(c)] == 0) || \ 2145 (directWS && (utf7_category[(c)] == 2)) || \ 2146 (directO && (utf7_category[(c)] == 1)))) 2147 2148PyObject *PyUnicode_DecodeUTF7(const char *s, 2149 Py_ssize_t size, 2150 const char *errors) 2151{ 2152 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 2153} 2154 2155/* The decoder. The only state we preserve is our read position, 2156 * i.e. how many characters we have consumed. 
So if we end in the 2157 * middle of a shift sequence we have to back off the read position 2158 * and the output to the beginning of the sequence, otherwise we lose 2159 * all the shift state (seen bits, number of bits seen, high 2160 * surrogate). */ 2161 2162PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 2163 Py_ssize_t size, 2164 const char *errors, 2165 Py_ssize_t *consumed) 2166{ 2167 const char *starts = s; 2168 Py_ssize_t startinpos; 2169 Py_ssize_t endinpos; 2170 Py_ssize_t outpos; 2171 const char *e; 2172 PyUnicodeObject *unicode; 2173 Py_UNICODE *p; 2174 const char *errmsg = ""; 2175 int inShift = 0; 2176 Py_UNICODE *shiftOutStart; 2177 unsigned int base64bits = 0; 2178 unsigned long base64buffer = 0; 2179 Py_UNICODE surrogate = 0; 2180 PyObject *errorHandler = NULL; 2181 PyObject *exc = NULL; 2182 2183 unicode = _PyUnicode_New(size); 2184 if (!unicode) 2185 return NULL; 2186 if (size == 0) { 2187 if (consumed) 2188 *consumed = 0; 2189 return (PyObject *)unicode; 2190 } 2191 2192 p = unicode->str; 2193 shiftOutStart = p; 2194 e = s + size; 2195 2196 while (s < e) { 2197 Py_UNICODE ch; 2198 restart: 2199 ch = (unsigned char) *s; 2200 2201 if (inShift) { /* in a base-64 section */ 2202 if (IS_BASE64(ch)) { /* consume a base-64 character */ 2203 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 2204 base64bits += 6; 2205 s++; 2206 if (base64bits >= 16) { 2207 /* we have enough bits for a UTF-16 value */ 2208 Py_UNICODE outCh = (Py_UNICODE) 2209 (base64buffer >> (base64bits-16)); 2210 base64bits -= 16; 2211 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 2212 if (surrogate) { 2213 /* expecting a second surrogate */ 2214 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2215#ifdef Py_UNICODE_WIDE 2216 *p++ = (((surrogate & 0x3FF)<<10) 2217 | (outCh & 0x3FF)) + 0x10000; 2218#else 2219 *p++ = surrogate; 2220 *p++ = outCh; 2221#endif 2222 surrogate = 0; 2223 } 2224 else { 2225 surrogate = 0; 2226 errmsg = "second surrogate missing"; 2227 goto 
utf7Error; 2228 } 2229 } 2230 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 2231 /* first surrogate */ 2232 surrogate = outCh; 2233 } 2234 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2235 errmsg = "unexpected second surrogate"; 2236 goto utf7Error; 2237 } 2238 else { 2239 *p++ = outCh; 2240 } 2241 } 2242 } 2243 else { /* now leaving a base-64 section */ 2244 inShift = 0; 2245 s++; 2246 if (surrogate) { 2247 errmsg = "second surrogate missing at end of shift sequence"; 2248 goto utf7Error; 2249 } 2250 if (base64bits > 0) { /* left-over bits */ 2251 if (base64bits >= 6) { 2252 /* We've seen at least one base-64 character */ 2253 errmsg = "partial character in shift sequence"; 2254 goto utf7Error; 2255 } 2256 else { 2257 /* Some bits remain; they should be zero */ 2258 if (base64buffer != 0) { 2259 errmsg = "non-zero padding bits in shift sequence"; 2260 goto utf7Error; 2261 } 2262 } 2263 } 2264 if (ch != '-') { 2265 /* '-' is absorbed; other terminating 2266 characters are preserved */ 2267 *p++ = ch; 2268 } 2269 } 2270 } 2271 else if ( ch == '+' ) { 2272 startinpos = s-starts; 2273 s++; /* consume '+' */ 2274 if (s < e && *s == '-') { /* '+-' encodes '+' */ 2275 s++; 2276 *p++ = '+'; 2277 } 2278 else { /* begin base64-encoded section */ 2279 inShift = 1; 2280 shiftOutStart = p; 2281 base64bits = 0; 2282 } 2283 } 2284 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 2285 *p++ = ch; 2286 s++; 2287 } 2288 else { 2289 startinpos = s-starts; 2290 s++; 2291 errmsg = "unexpected special character"; 2292 goto utf7Error; 2293 } 2294 continue; 2295utf7Error: 2296 outpos = p-PyUnicode_AS_UNICODE(unicode); 2297 endinpos = s-starts; 2298 if (unicode_decode_call_errorhandler( 2299 errors, &errorHandler, 2300 "utf7", errmsg, 2301 &starts, &e, &startinpos, &endinpos, &exc, &s, 2302 &unicode, &outpos, &p)) 2303 goto onError; 2304 } 2305 2306 /* end of string */ 2307 2308 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 2309 /* if we're in an 
inconsistent state, that's an error */ 2310 if (surrogate || 2311 (base64bits >= 6) || 2312 (base64bits > 0 && base64buffer != 0)) { 2313 outpos = p-PyUnicode_AS_UNICODE(unicode); 2314 endinpos = size; 2315 if (unicode_decode_call_errorhandler( 2316 errors, &errorHandler, 2317 "utf7", "unterminated shift sequence", 2318 &starts, &e, &startinpos, &endinpos, &exc, &s, 2319 &unicode, &outpos, &p)) 2320 goto onError; 2321 if (s < e) 2322 goto restart; 2323 } 2324 } 2325 2326 /* return state */ 2327 if (consumed) { 2328 if (inShift) { 2329 p = shiftOutStart; /* back off output */ 2330 *consumed = startinpos; 2331 } 2332 else { 2333 *consumed = s-starts; 2334 } 2335 } 2336 2337 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 2338 goto onError; 2339 2340 Py_XDECREF(errorHandler); 2341 Py_XDECREF(exc); 2342 return (PyObject *)unicode; 2343 2344 onError: 2345 Py_XDECREF(errorHandler); 2346 Py_XDECREF(exc); 2347 Py_DECREF(unicode); 2348 return NULL; 2349} 2350 2351 2352PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2353 Py_ssize_t size, 2354 int base64SetO, 2355 int base64WhiteSpace, 2356 const char *errors) 2357{ 2358 PyObject *v; 2359 /* It might be possible to tighten this worst case */ 2360 Py_ssize_t allocated = 8 * size; 2361 int inShift = 0; 2362 Py_ssize_t i = 0; 2363 unsigned int base64bits = 0; 2364 unsigned long base64buffer = 0; 2365 char * out; 2366 char * start; 2367 2368 if (size == 0) 2369 return PyBytes_FromStringAndSize(NULL, 0); 2370 2371 if (allocated / 8 != size) 2372 return PyErr_NoMemory(); 2373 2374 v = PyBytes_FromStringAndSize(NULL, allocated); 2375 if (v == NULL) 2376 return NULL; 2377 2378 start = out = PyBytes_AS_STRING(v); 2379 for (;i < size; ++i) { 2380 Py_UNICODE ch = s[i]; 2381 2382 if (inShift) { 2383 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2384 /* shifting out */ 2385 if (base64bits) { /* output remaining bits */ 2386 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2387 base64buffer = 0; 
2388 base64bits = 0; 2389 } 2390 inShift = 0; 2391 /* Characters not in the BASE64 set implicitly unshift the sequence 2392 so no '-' is required, except if the character is itself a '-' */ 2393 if (IS_BASE64(ch) || ch == '-') { 2394 *out++ = '-'; 2395 } 2396 *out++ = (char) ch; 2397 } 2398 else { 2399 goto encode_char; 2400 } 2401 } 2402 else { /* not in a shift sequence */ 2403 if (ch == '+') { 2404 *out++ = '+'; 2405 *out++ = '-'; 2406 } 2407 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2408 *out++ = (char) ch; 2409 } 2410 else { 2411 *out++ = '+'; 2412 inShift = 1; 2413 goto encode_char; 2414 } 2415 } 2416 continue; 2417encode_char: 2418#ifdef Py_UNICODE_WIDE 2419 if (ch >= 0x10000) { 2420 /* code first surrogate */ 2421 base64bits += 16; 2422 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2423 while (base64bits >= 6) { 2424 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2425 base64bits -= 6; 2426 } 2427 /* prepare second surrogate */ 2428 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2429 } 2430#endif 2431 base64bits += 16; 2432 base64buffer = (base64buffer << 16) | ch; 2433 while (base64bits >= 6) { 2434 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2435 base64bits -= 6; 2436 } 2437 } 2438 if (base64bits) 2439 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2440 if (inShift) 2441 *out++ = '-'; 2442 if (_PyBytes_Resize(&v, out - start) < 0) 2443 return NULL; 2444 return v; 2445} 2446 2447#undef IS_BASE64 2448#undef FROM_BASE64 2449#undef TO_BASE64 2450#undef DECODE_DIRECT 2451#undef ENCODE_DIRECT 2452 2453/* --- UTF-8 Codec -------------------------------------------------------- */ 2454 2455static 2456char utf8_code_length[256] = { 2457 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2458 illegal prefix. 
See RFC 3629 for details */ 2459 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2460 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2461 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2462 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2463 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2464 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2465 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2466 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2468 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2469 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2470 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2471 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2472 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2473 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2474 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2475}; 2476 2477PyObject *PyUnicode_DecodeUTF8(const char *s, 2478 Py_ssize_t size, 2479 const char *errors) 2480{ 2481 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2482} 2483 2484/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2485#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2486 2487/* Mask to quickly check whether a C 'long' contains a 2488 non-ASCII, UTF8-encoded char. */ 2489#if (SIZEOF_LONG == 8) 2490# define ASCII_CHAR_MASK 0x8080808080808080L 2491#elif (SIZEOF_LONG == 4) 2492# define ASCII_CHAR_MASK 0x80808080L 2493#else 2494# error C 'long' size should be either 4 or 8! 
2495#endif 2496 2497PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2498 Py_ssize_t size, 2499 const char *errors, 2500 Py_ssize_t *consumed) 2501{ 2502 const char *starts = s; 2503 int n; 2504 int k; 2505 Py_ssize_t startinpos; 2506 Py_ssize_t endinpos; 2507 Py_ssize_t outpos; 2508 const char *e, *aligned_end; 2509 PyUnicodeObject *unicode; 2510 Py_UNICODE *p; 2511 const char *errmsg = ""; 2512 PyObject *errorHandler = NULL; 2513 PyObject *exc = NULL; 2514 2515 /* Note: size will always be longer than the resulting Unicode 2516 character count */ 2517 unicode = _PyUnicode_New(size); 2518 if (!unicode) 2519 return NULL; 2520 if (size == 0) { 2521 if (consumed) 2522 *consumed = 0; 2523 return (PyObject *)unicode; 2524 } 2525 2526 /* Unpack UTF-8 encoded data */ 2527 p = unicode->str; 2528 e = s + size; 2529 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2530 2531 while (s < e) { 2532 Py_UCS4 ch = (unsigned char)*s; 2533 2534 if (ch < 0x80) { 2535 /* Fast path for runs of ASCII characters. Given that common UTF-8 2536 input will consist of an overwhelming majority of ASCII 2537 characters, we try to optimize for this case by checking 2538 as many characters as a C 'long' can contain. 2539 First, check if we can do an aligned read, as most CPUs have 2540 a penalty for unaligned reads. 2541 */ 2542 if (!((size_t) s & LONG_PTR_MASK)) { 2543 /* Help register allocation */ 2544 register const char *_s = s; 2545 register Py_UNICODE *_p = p; 2546 while (_s < aligned_end) { 2547 /* Read a whole long at a time (either 4 or 8 bytes), 2548 and do a fast unrolled copy if it only contains ASCII 2549 characters. 
*/ 2550 unsigned long data = *(unsigned long *) _s; 2551 if (data & ASCII_CHAR_MASK) 2552 break; 2553 _p[0] = (unsigned char) _s[0]; 2554 _p[1] = (unsigned char) _s[1]; 2555 _p[2] = (unsigned char) _s[2]; 2556 _p[3] = (unsigned char) _s[3]; 2557#if (SIZEOF_LONG == 8) 2558 _p[4] = (unsigned char) _s[4]; 2559 _p[5] = (unsigned char) _s[5]; 2560 _p[6] = (unsigned char) _s[6]; 2561 _p[7] = (unsigned char) _s[7]; 2562#endif 2563 _s += SIZEOF_LONG; 2564 _p += SIZEOF_LONG; 2565 } 2566 s = _s; 2567 p = _p; 2568 if (s == e) 2569 break; 2570 ch = (unsigned char)*s; 2571 } 2572 } 2573 2574 if (ch < 0x80) { 2575 *p++ = (Py_UNICODE)ch; 2576 s++; 2577 continue; 2578 } 2579 2580 n = utf8_code_length[ch]; 2581 2582 if (s + n > e) { 2583 if (consumed) 2584 break; 2585 else { 2586 errmsg = "unexpected end of data"; 2587 startinpos = s-starts; 2588 endinpos = startinpos+1; 2589 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2590 endinpos++; 2591 goto utf8Error; 2592 } 2593 } 2594 2595 switch (n) { 2596 2597 case 0: 2598 errmsg = "invalid start byte"; 2599 startinpos = s-starts; 2600 endinpos = startinpos+1; 2601 goto utf8Error; 2602 2603 case 1: 2604 errmsg = "internal error"; 2605 startinpos = s-starts; 2606 endinpos = startinpos+1; 2607 goto utf8Error; 2608 2609 case 2: 2610 if ((s[1] & 0xc0) != 0x80) { 2611 errmsg = "invalid continuation byte"; 2612 startinpos = s-starts; 2613 endinpos = startinpos + 1; 2614 goto utf8Error; 2615 } 2616 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2617 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2618 *p++ = (Py_UNICODE)ch; 2619 break; 2620 2621 case 3: 2622 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2623 will result in surrogates in range d800-dfff. Surrogates are 2624 not valid UTF-8 so they are rejected. 
2625 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2626 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2627 if ((s[1] & 0xc0) != 0x80 || 2628 (s[2] & 0xc0) != 0x80 || 2629 ((unsigned char)s[0] == 0xE0 && 2630 (unsigned char)s[1] < 0xA0) || 2631 ((unsigned char)s[0] == 0xED && 2632 (unsigned char)s[1] > 0x9F)) { 2633 errmsg = "invalid continuation byte"; 2634 startinpos = s-starts; 2635 endinpos = startinpos + 1; 2636 2637 /* if s[1] first two bits are 1 and 0, then the invalid 2638 continuation byte is s[2], so increment endinpos by 1, 2639 if not, s[1] is invalid and endinpos doesn't need to 2640 be incremented. */ 2641 if ((s[1] & 0xC0) == 0x80) 2642 endinpos++; 2643 goto utf8Error; 2644 } 2645 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2646 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2647 *p++ = (Py_UNICODE)ch; 2648 break; 2649 2650 case 4: 2651 if ((s[1] & 0xc0) != 0x80 || 2652 (s[2] & 0xc0) != 0x80 || 2653 (s[3] & 0xc0) != 0x80 || 2654 ((unsigned char)s[0] == 0xF0 && 2655 (unsigned char)s[1] < 0x90) || 2656 ((unsigned char)s[0] == 0xF4 && 2657 (unsigned char)s[1] > 0x8F)) { 2658 errmsg = "invalid continuation byte"; 2659 startinpos = s-starts; 2660 endinpos = startinpos + 1; 2661 if ((s[1] & 0xC0) == 0x80) { 2662 endinpos++; 2663 if ((s[2] & 0xC0) == 0x80) 2664 endinpos++; 2665 } 2666 goto utf8Error; 2667 } 2668 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2669 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2670 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2671 2672#ifdef Py_UNICODE_WIDE 2673 *p++ = (Py_UNICODE)ch; 2674#else 2675 /* compute and append the two surrogates: */ 2676 2677 /* translate from 10000..10FFFF to 0..FFFF */ 2678 ch -= 0x10000; 2679 2680 /* high surrogate = top 10 bits added to D800 */ 2681 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2682 2683 /* low surrogate = bottom 10 bits added to DC00 */ 2684 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2685#endif 2686 break; 2687 } 2688 s += n; 2689 
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s     UTF-8 encoded input (need not be NUL-terminated)
   size  number of bytes in `s`

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc(),
   or NULL on failure.  Every byte that is not part of a valid UTF-8
   sequence is emitted as the single code unit 0xDC00 + byte and decoding
   resumes at the following byte. */
wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count.
       The test is written without computing size + 1 (which overflows for
       size == PY_SSIZE_T_MAX) and rejects negative sizes outright, so the
       terminator written at the end always lies inside the allocation. */
    if (size < 0 ||
        (size_t)size >= (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* sequence would run past the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the destination is a wchar_t buffer, so cast to wchar_t like
               every other store in this function */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* every jump here happens before `ch` is reassigned, so `ch` is
           still the raw lead byte; escape it and skip exactly one byte */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Else allocate the 2838 maximum possible needed (4 result bytes per Unicode character), and return 2839 the excess memory at the end. 2840*/ 2841PyObject * 2842PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2843 Py_ssize_t size, 2844 const char *errors) 2845{ 2846#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2847 2848 Py_ssize_t i; /* index into s of next input byte */ 2849 PyObject *result; /* result string object */ 2850 char *p; /* next free byte in output buffer */ 2851 Py_ssize_t nallocated; /* number of result bytes allocated */ 2852 Py_ssize_t nneeded; /* number of result bytes needed */ 2853 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2854 PyObject *errorHandler = NULL; 2855 PyObject *exc = NULL; 2856 2857 assert(s != NULL); 2858 assert(size >= 0); 2859 2860 if (size <= MAX_SHORT_UNICHARS) { 2861 /* Write into the stack buffer; nallocated can't overflow. 2862 * At the end, we'll allocate exactly as much heap space as it 2863 * turns out we need. 2864 */ 2865 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2866 result = NULL; /* will allocate after we're done */ 2867 p = stackbuf; 2868 } 2869 else { 2870 /* Overallocate on the heap, and give the excess back at the end. */ 2871 nallocated = size * 4; 2872 if (nallocated / 4 != size) /* overflow! 
*/ 2873 return PyErr_NoMemory(); 2874 result = PyBytes_FromStringAndSize(NULL, nallocated); 2875 if (result == NULL) 2876 return NULL; 2877 p = PyBytes_AS_STRING(result); 2878 } 2879 2880 for (i = 0; i < size;) { 2881 Py_UCS4 ch = s[i++]; 2882 2883 if (ch < 0x80) 2884 /* Encode ASCII */ 2885 *p++ = (char) ch; 2886 2887 else if (ch < 0x0800) { 2888 /* Encode Latin-1 */ 2889 *p++ = (char)(0xc0 | (ch >> 6)); 2890 *p++ = (char)(0x80 | (ch & 0x3f)); 2891 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2892#ifndef Py_UNICODE_WIDE 2893 /* Special case: check for high and low surrogate */ 2894 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2895 Py_UCS4 ch2 = s[i]; 2896 /* Combine the two surrogates to form a UCS4 value */ 2897 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2898 i++; 2899 2900 /* Encode UCS4 Unicode ordinals */ 2901 *p++ = (char)(0xf0 | (ch >> 18)); 2902 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2903 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2904 *p++ = (char)(0x80 | (ch & 0x3f)); 2905 } else { 2906#endif 2907 Py_ssize_t newpos; 2908 PyObject *rep; 2909 Py_ssize_t repsize, k; 2910 rep = unicode_encode_call_errorhandler 2911 (errors, &errorHandler, "utf-8", "surrogates not allowed", 2912 s, size, &exc, i-1, i, &newpos); 2913 if (!rep) 2914 goto error; 2915 2916 if (PyBytes_Check(rep)) 2917 repsize = PyBytes_GET_SIZE(rep); 2918 else 2919 repsize = PyUnicode_GET_SIZE(rep); 2920 2921 if (repsize > 4) { 2922 Py_ssize_t offset; 2923 2924 if (result == NULL) 2925 offset = p - stackbuf; 2926 else 2927 offset = p - PyBytes_AS_STRING(result); 2928 2929 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 2930 /* integer overflow */ 2931 PyErr_NoMemory(); 2932 goto error; 2933 } 2934 nallocated += repsize - 4; 2935 if (result != NULL) { 2936 if (_PyBytes_Resize(&result, nallocated) < 0) 2937 goto error; 2938 } else { 2939 result = PyBytes_FromStringAndSize(NULL, nallocated); 2940 if (result == NULL) 2941 goto error; 2942 
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 2943 } 2944 p = PyBytes_AS_STRING(result) + offset; 2945 } 2946 2947 if (PyBytes_Check(rep)) { 2948 char *prep = PyBytes_AS_STRING(rep); 2949 for(k = repsize; k > 0; k--) 2950 *p++ = *prep++; 2951 } else /* rep is unicode */ { 2952 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 2953 Py_UNICODE c; 2954 2955 for(k=0; k<repsize; k++) { 2956 c = prep[k]; 2957 if (0x80 <= c) { 2958 raise_encode_exception(&exc, "utf-8", s, size, 2959 i-1, i, "surrogates not allowed"); 2960 goto error; 2961 } 2962 *p++ = (char)prep[k]; 2963 } 2964 } 2965 Py_DECREF(rep); 2966#ifndef Py_UNICODE_WIDE 2967 } 2968#endif 2969 } else if (ch < 0x10000) { 2970 *p++ = (char)(0xe0 | (ch >> 12)); 2971 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2972 *p++ = (char)(0x80 | (ch & 0x3f)); 2973 } else /* ch >= 0x10000 */ { 2974 /* Encode UCS4 Unicode ordinals */ 2975 *p++ = (char)(0xf0 | (ch >> 18)); 2976 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2977 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2978 *p++ = (char)(0x80 | (ch & 0x3f)); 2979 } 2980 } 2981 2982 if (result == NULL) { 2983 /* This was stack allocated. */ 2984 nneeded = p - stackbuf; 2985 assert(nneeded <= nallocated); 2986 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2987 } 2988 else { 2989 /* Cut back to size actually needed. 
*/ 2990 nneeded = p - PyBytes_AS_STRING(result); 2991 assert(nneeded <= nallocated); 2992 _PyBytes_Resize(&result, nneeded); 2993 } 2994 Py_XDECREF(errorHandler); 2995 Py_XDECREF(exc); 2996 return result; 2997 error: 2998 Py_XDECREF(errorHandler); 2999 Py_XDECREF(exc); 3000 Py_XDECREF(result); 3001 return NULL; 3002 3003#undef MAX_SHORT_UNICHARS 3004} 3005 3006PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 3007{ 3008 if (!PyUnicode_Check(unicode)) { 3009 PyErr_BadArgument(); 3010 return NULL; 3011 } 3012 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 3013 PyUnicode_GET_SIZE(unicode), 3014 NULL); 3015} 3016 3017/* --- UTF-32 Codec ------------------------------------------------------- */ 3018 3019PyObject * 3020PyUnicode_DecodeUTF32(const char *s, 3021 Py_ssize_t size, 3022 const char *errors, 3023 int *byteorder) 3024{ 3025 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 3026} 3027 3028PyObject * 3029PyUnicode_DecodeUTF32Stateful(const char *s, 3030 Py_ssize_t size, 3031 const char *errors, 3032 int *byteorder, 3033 Py_ssize_t *consumed) 3034{ 3035 const char *starts = s; 3036 Py_ssize_t startinpos; 3037 Py_ssize_t endinpos; 3038 Py_ssize_t outpos; 3039 PyUnicodeObject *unicode; 3040 Py_UNICODE *p; 3041#ifndef Py_UNICODE_WIDE 3042 int pairs = 0; 3043 const unsigned char *qq; 3044#else 3045 const int pairs = 0; 3046#endif 3047 const unsigned char *q, *e; 3048 int bo = 0; /* assume native ordering by default */ 3049 const char *errmsg = ""; 3050 /* Offsets from q for retrieving bytes in the right order. */ 3051#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3052 int iorder[] = {0, 1, 2, 3}; 3053#else 3054 int iorder[] = {3, 2, 1, 0}; 3055#endif 3056 PyObject *errorHandler = NULL; 3057 PyObject *exc = NULL; 3058 3059 q = (unsigned char *)s; 3060 e = q + size; 3061 3062 if (byteorder) 3063 bo = *byteorder; 3064 3065 /* Check for BOM marks (U+FEFF) in the input and adjust current 3066 byte order setting accordingly. 
In native mode, the leading BOM 3067 mark is skipped, in all other modes, it is copied to the output 3068 stream as-is (giving a ZWNBSP character). */ 3069 if (bo == 0) { 3070 if (size >= 4) { 3071 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3072 (q[iorder[1]] << 8) | q[iorder[0]]; 3073#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3074 if (bom == 0x0000FEFF) { 3075 q += 4; 3076 bo = -1; 3077 } 3078 else if (bom == 0xFFFE0000) { 3079 q += 4; 3080 bo = 1; 3081 } 3082#else 3083 if (bom == 0x0000FEFF) { 3084 q += 4; 3085 bo = 1; 3086 } 3087 else if (bom == 0xFFFE0000) { 3088 q += 4; 3089 bo = -1; 3090 } 3091#endif 3092 } 3093 } 3094 3095 if (bo == -1) { 3096 /* force LE */ 3097 iorder[0] = 0; 3098 iorder[1] = 1; 3099 iorder[2] = 2; 3100 iorder[3] = 3; 3101 } 3102 else if (bo == 1) { 3103 /* force BE */ 3104 iorder[0] = 3; 3105 iorder[1] = 2; 3106 iorder[2] = 1; 3107 iorder[3] = 0; 3108 } 3109 3110 /* On narrow builds we split characters outside the BMP into two 3111 codepoints => count how much extra space we need. */ 3112#ifndef Py_UNICODE_WIDE 3113 for (qq = q; qq < e; qq += 4) 3114 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 3115 pairs++; 3116#endif 3117 3118 /* This might be one to much, because of a BOM */ 3119 unicode = _PyUnicode_New((size+3)/4+pairs); 3120 if (!unicode) 3121 return NULL; 3122 if (size == 0) 3123 return (PyObject *)unicode; 3124 3125 /* Unpack UTF-32 encoded data */ 3126 p = unicode->str; 3127 3128 while (q < e) { 3129 Py_UCS4 ch; 3130 /* remaining bytes at the end? 
(size should be divisible by 4) */ 3131 if (e-q<4) { 3132 if (consumed) 3133 break; 3134 errmsg = "truncated data"; 3135 startinpos = ((const char *)q)-starts; 3136 endinpos = ((const char *)e)-starts; 3137 goto utf32Error; 3138 /* The remaining input chars are ignored if the callback 3139 chooses to skip the input */ 3140 } 3141 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3142 (q[iorder[1]] << 8) | q[iorder[0]]; 3143 3144 if (ch >= 0x110000) 3145 { 3146 errmsg = "codepoint not in range(0x110000)"; 3147 startinpos = ((const char *)q)-starts; 3148 endinpos = startinpos+4; 3149 goto utf32Error; 3150 } 3151#ifndef Py_UNICODE_WIDE 3152 if (ch >= 0x10000) 3153 { 3154 *p++ = 0xD800 | ((ch-0x10000) >> 10); 3155 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 3156 } 3157 else 3158#endif 3159 *p++ = ch; 3160 q += 4; 3161 continue; 3162 utf32Error: 3163 outpos = p-PyUnicode_AS_UNICODE(unicode); 3164 if (unicode_decode_call_errorhandler( 3165 errors, &errorHandler, 3166 "utf32", errmsg, 3167 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 3168 &unicode, &outpos, &p)) 3169 goto onError; 3170 } 3171 3172 if (byteorder) 3173 *byteorder = bo; 3174 3175 if (consumed) 3176 *consumed = (const char *)q-starts; 3177 3178 /* Adjust length */ 3179 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3180 goto onError; 3181 3182 Py_XDECREF(errorHandler); 3183 Py_XDECREF(exc); 3184 return (PyObject *)unicode; 3185 3186 onError: 3187 Py_DECREF(unicode); 3188 Py_XDECREF(errorHandler); 3189 Py_XDECREF(exc); 3190 return NULL; 3191} 3192 3193PyObject * 3194PyUnicode_EncodeUTF32(const Py_UNICODE *s, 3195 Py_ssize_t size, 3196 const char *errors, 3197 int byteorder) 3198{ 3199 PyObject *v; 3200 unsigned char *p; 3201 Py_ssize_t nsize, bytesize; 3202#ifndef Py_UNICODE_WIDE 3203 Py_ssize_t i, pairs; 3204#else 3205 const int pairs = 0; 3206#endif 3207 /* Offsets from p for storing byte pairs in the right order. 
*/ 3208#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3209 int iorder[] = {0, 1, 2, 3}; 3210#else 3211 int iorder[] = {3, 2, 1, 0}; 3212#endif 3213 3214#define STORECHAR(CH) \ 3215 do { \ 3216 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 3217 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 3218 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 3219 p[iorder[0]] = (CH) & 0xff; \ 3220 p += 4; \ 3221 } while(0) 3222 3223 /* In narrow builds we can output surrogate pairs as one codepoint, 3224 so we need less space. */ 3225#ifndef Py_UNICODE_WIDE 3226 for (i = pairs = 0; i < size-1; i++) 3227 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 3228 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 3229 pairs++; 3230#endif 3231 nsize = (size - pairs + (byteorder == 0)); 3232 bytesize = nsize * 4; 3233 if (bytesize / 4 != nsize) 3234 return PyErr_NoMemory(); 3235 v = PyBytes_FromStringAndSize(NULL, bytesize); 3236 if (v == NULL) 3237 return NULL; 3238 3239 p = (unsigned char *)PyBytes_AS_STRING(v); 3240 if (byteorder == 0) 3241 STORECHAR(0xFEFF); 3242 if (size == 0) 3243 goto done; 3244 3245 if (byteorder == -1) { 3246 /* force LE */ 3247 iorder[0] = 0; 3248 iorder[1] = 1; 3249 iorder[2] = 2; 3250 iorder[3] = 3; 3251 } 3252 else if (byteorder == 1) { 3253 /* force BE */ 3254 iorder[0] = 3; 3255 iorder[1] = 2; 3256 iorder[2] = 1; 3257 iorder[3] = 0; 3258 } 3259 3260 while (size-- > 0) { 3261 Py_UCS4 ch = *s++; 3262#ifndef Py_UNICODE_WIDE 3263 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 3264 Py_UCS4 ch2 = *s; 3265 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3266 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3267 s++; 3268 size--; 3269 } 3270 } 3271#endif 3272 STORECHAR(ch); 3273 } 3274 3275 done: 3276 return v; 3277#undef STORECHAR 3278} 3279 3280PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 3281{ 3282 if (!PyUnicode_Check(unicode)) { 3283 PyErr_BadArgument(); 3284 return NULL; 3285 } 3286 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3287 PyUnicode_GET_SIZE(unicode), 3288 NULL, 3289 0); 3290} 3291 3292/* --- 
UTF-16 Codec ------------------------------------------------------- */ 3293 3294PyObject * 3295PyUnicode_DecodeUTF16(const char *s, 3296 Py_ssize_t size, 3297 const char *errors, 3298 int *byteorder) 3299{ 3300 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3301} 3302 3303/* Two masks for fast checking of whether a C 'long' may contain 3304 UTF16-encoded surrogate characters. This is an efficient heuristic, 3305 assuming that non-surrogate characters with a code point >= 0x8000 are 3306 rare in most input. 3307 FAST_CHAR_MASK is used when the input is in native byte ordering, 3308 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3309*/ 3310#if (SIZEOF_LONG == 8) 3311# define FAST_CHAR_MASK 0x8000800080008000L 3312# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3313#elif (SIZEOF_LONG == 4) 3314# define FAST_CHAR_MASK 0x80008000L 3315# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3316#else 3317# error C 'long' size should be either 4 or 8! 3318#endif 3319 3320PyObject * 3321PyUnicode_DecodeUTF16Stateful(const char *s, 3322 Py_ssize_t size, 3323 const char *errors, 3324 int *byteorder, 3325 Py_ssize_t *consumed) 3326{ 3327 const char *starts = s; 3328 Py_ssize_t startinpos; 3329 Py_ssize_t endinpos; 3330 Py_ssize_t outpos; 3331 PyUnicodeObject *unicode; 3332 Py_UNICODE *p; 3333 const unsigned char *q, *e, *aligned_end; 3334 int bo = 0; /* assume native ordering by default */ 3335 int native_ordering = 0; 3336 const char *errmsg = ""; 3337 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 3338#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3339 int ihi = 1, ilo = 0; 3340#else 3341 int ihi = 0, ilo = 1; 3342#endif 3343 PyObject *errorHandler = NULL; 3344 PyObject *exc = NULL; 3345 3346 /* Note: size will always be longer than the resulting Unicode 3347 character count */ 3348 unicode = _PyUnicode_New(size); 3349 if (!unicode) 3350 return NULL; 3351 if (size == 0) 3352 return (PyObject *)unicode; 3353 3354 /* Unpack UTF-16 encoded data */ 3355 p = unicode->str; 3356 q = (unsigned char *)s; 3357 e = q + size - 1; 3358 3359 if (byteorder) 3360 bo = *byteorder; 3361 3362 /* Check for BOM marks (U+FEFF) in the input and adjust current 3363 byte order setting accordingly. In native mode, the leading BOM 3364 mark is skipped, in all other modes, it is copied to the output 3365 stream as-is (giving a ZWNBSP character). */ 3366 if (bo == 0) { 3367 if (size >= 2) { 3368 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3369#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3370 if (bom == 0xFEFF) { 3371 q += 2; 3372 bo = -1; 3373 } 3374 else if (bom == 0xFFFE) { 3375 q += 2; 3376 bo = 1; 3377 } 3378#else 3379 if (bom == 0xFEFF) { 3380 q += 2; 3381 bo = 1; 3382 } 3383 else if (bom == 0xFFFE) { 3384 q += 2; 3385 bo = -1; 3386 } 3387#endif 3388 } 3389 } 3390 3391 if (bo == -1) { 3392 /* force LE */ 3393 ihi = 1; 3394 ilo = 0; 3395 } 3396 else if (bo == 1) { 3397 /* force BE */ 3398 ihi = 0; 3399 ilo = 1; 3400 } 3401#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3402 native_ordering = ilo < ihi; 3403#else 3404 native_ordering = ilo > ihi; 3405#endif 3406 3407 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3408 while (q < e) { 3409 Py_UNICODE ch; 3410 /* First check for possible aligned read of a C 'long'. Unaligned 3411 reads are more expensive, better to defer to another iteration. */ 3412 if (!((size_t) q & LONG_PTR_MASK)) { 3413 /* Fast path for runs of non-surrogate chars. 
*/ 3414 register const unsigned char *_q = q; 3415 Py_UNICODE *_p = p; 3416 if (native_ordering) { 3417 /* Native ordering is simple: as long as the input cannot 3418 possibly contain a surrogate char, do an unrolled copy 3419 of several 16-bit code points to the target object. 3420 The non-surrogate check is done on several input bytes 3421 at a time (as many as a C 'long' can contain). */ 3422 while (_q < aligned_end) { 3423 unsigned long data = * (unsigned long *) _q; 3424 if (data & FAST_CHAR_MASK) 3425 break; 3426 _p[0] = ((unsigned short *) _q)[0]; 3427 _p[1] = ((unsigned short *) _q)[1]; 3428#if (SIZEOF_LONG == 8) 3429 _p[2] = ((unsigned short *) _q)[2]; 3430 _p[3] = ((unsigned short *) _q)[3]; 3431#endif 3432 _q += SIZEOF_LONG; 3433 _p += SIZEOF_LONG / 2; 3434 } 3435 } 3436 else { 3437 /* Byteswapped ordering is similar, but we must decompose 3438 the copy bytewise, and take care of zero'ing out the 3439 upper bytes if the target object is in 32-bit units 3440 (that is, in UCS-4 builds). */ 3441 while (_q < aligned_end) { 3442 unsigned long data = * (unsigned long *) _q; 3443 if (data & SWAPPED_FAST_CHAR_MASK) 3444 break; 3445 /* Zero upper bytes in UCS-4 builds */ 3446#if (Py_UNICODE_SIZE > 2) 3447 _p[0] = 0; 3448 _p[1] = 0; 3449#if (SIZEOF_LONG == 8) 3450 _p[2] = 0; 3451 _p[3] = 0; 3452#endif 3453#endif 3454 /* Issue #4916; UCS-4 builds on big endian machines must 3455 fill the two last bytes of each 4-byte unit. 
*/ 3456#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3457# define OFF 2 3458#else 3459# define OFF 0 3460#endif 3461 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3462 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3463 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3464 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3465#if (SIZEOF_LONG == 8) 3466 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3467 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3468 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3469 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3470#endif 3471#undef OFF 3472 _q += SIZEOF_LONG; 3473 _p += SIZEOF_LONG / 2; 3474 } 3475 } 3476 p = _p; 3477 q = _q; 3478 if (q >= e) 3479 break; 3480 } 3481 ch = (q[ihi] << 8) | q[ilo]; 3482 3483 q += 2; 3484 3485 if (ch < 0xD800 || ch > 0xDFFF) { 3486 *p++ = ch; 3487 continue; 3488 } 3489 3490 /* UTF-16 code pair: */ 3491 if (q > e) { 3492 errmsg = "unexpected end of data"; 3493 startinpos = (((const char *)q) - 2) - starts; 3494 endinpos = ((const char *)e) + 1 - starts; 3495 goto utf16Error; 3496 } 3497 if (0xD800 <= ch && ch <= 0xDBFF) { 3498 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3499 q += 2; 3500 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3501#ifndef Py_UNICODE_WIDE 3502 *p++ = ch; 3503 *p++ = ch2; 3504#else 3505 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3506#endif 3507 continue; 3508 } 3509 else { 3510 errmsg = "illegal UTF-16 surrogate"; 3511 startinpos = (((const char *)q)-4)-starts; 3512 endinpos = startinpos+2; 3513 goto utf16Error; 3514 } 3515 3516 } 3517 errmsg = "illegal encoding"; 3518 startinpos = (((const char *)q)-2)-starts; 3519 endinpos = startinpos+2; 3520 /* Fall through to report the error */ 3521 3522 utf16Error: 3523 outpos = p - PyUnicode_AS_UNICODE(unicode); 3524 if (unicode_decode_call_errorhandler( 3525 errors, 3526 &errorHandler, 3527 "utf16", errmsg, 3528 &starts, 3529 (const char 
**)&e, 3530 &startinpos, 3531 &endinpos, 3532 &exc, 3533 (const char **)&q, 3534 &unicode, 3535 &outpos, 3536 &p)) 3537 goto onError; 3538 } 3539 /* remaining byte at the end? (size should be even) */ 3540 if (e == q) { 3541 if (!consumed) { 3542 errmsg = "truncated data"; 3543 startinpos = ((const char *)q) - starts; 3544 endinpos = ((const char *)e) + 1 - starts; 3545 outpos = p - PyUnicode_AS_UNICODE(unicode); 3546 if (unicode_decode_call_errorhandler( 3547 errors, 3548 &errorHandler, 3549 "utf16", errmsg, 3550 &starts, 3551 (const char **)&e, 3552 &startinpos, 3553 &endinpos, 3554 &exc, 3555 (const char **)&q, 3556 &unicode, 3557 &outpos, 3558 &p)) 3559 goto onError; 3560 /* The remaining input chars are ignored if the callback 3561 chooses to skip the input */ 3562 } 3563 } 3564 3565 if (byteorder) 3566 *byteorder = bo; 3567 3568 if (consumed) 3569 *consumed = (const char *)q-starts; 3570 3571 /* Adjust length */ 3572 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3573 goto onError; 3574 3575 Py_XDECREF(errorHandler); 3576 Py_XDECREF(exc); 3577 return (PyObject *)unicode; 3578 3579 onError: 3580 Py_DECREF(unicode); 3581 Py_XDECREF(errorHandler); 3582 Py_XDECREF(exc); 3583 return NULL; 3584} 3585 3586#undef FAST_CHAR_MASK 3587#undef SWAPPED_FAST_CHAR_MASK 3588 3589PyObject * 3590PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3591 Py_ssize_t size, 3592 const char *errors, 3593 int byteorder) 3594{ 3595 PyObject *v; 3596 unsigned char *p; 3597 Py_ssize_t nsize, bytesize; 3598#ifdef Py_UNICODE_WIDE 3599 Py_ssize_t i, pairs; 3600#else 3601 const int pairs = 0; 3602#endif 3603 /* Offsets from p for storing byte pairs in the right order. 
*/ 3604#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3605 int ihi = 1, ilo = 0; 3606#else 3607 int ihi = 0, ilo = 1; 3608#endif 3609 3610#define STORECHAR(CH) \ 3611 do { \ 3612 p[ihi] = ((CH) >> 8) & 0xff; \ 3613 p[ilo] = (CH) & 0xff; \ 3614 p += 2; \ 3615 } while(0) 3616 3617#ifdef Py_UNICODE_WIDE 3618 for (i = pairs = 0; i < size; i++) 3619 if (s[i] >= 0x10000) 3620 pairs++; 3621#endif 3622 /* 2 * (size + pairs + (byteorder == 0)) */ 3623 if (size > PY_SSIZE_T_MAX || 3624 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3625 return PyErr_NoMemory(); 3626 nsize = size + pairs + (byteorder == 0); 3627 bytesize = nsize * 2; 3628 if (bytesize / 2 != nsize) 3629 return PyErr_NoMemory(); 3630 v = PyBytes_FromStringAndSize(NULL, bytesize); 3631 if (v == NULL) 3632 return NULL; 3633 3634 p = (unsigned char *)PyBytes_AS_STRING(v); 3635 if (byteorder == 0) 3636 STORECHAR(0xFEFF); 3637 if (size == 0) 3638 goto done; 3639 3640 if (byteorder == -1) { 3641 /* force LE */ 3642 ihi = 1; 3643 ilo = 0; 3644 } 3645 else if (byteorder == 1) { 3646 /* force BE */ 3647 ihi = 0; 3648 ilo = 1; 3649 } 3650 3651 while (size-- > 0) { 3652 Py_UNICODE ch = *s++; 3653 Py_UNICODE ch2 = 0; 3654#ifdef Py_UNICODE_WIDE 3655 if (ch >= 0x10000) { 3656 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3657 ch = 0xD800 | ((ch-0x10000) >> 10); 3658 } 3659#endif 3660 STORECHAR(ch); 3661 if (ch2) 3662 STORECHAR(ch2); 3663 } 3664 3665 done: 3666 return v; 3667#undef STORECHAR 3668} 3669 3670PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3671{ 3672 if (!PyUnicode_Check(unicode)) { 3673 PyErr_BadArgument(); 3674 return NULL; 3675 } 3676 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3677 PyUnicode_GET_SIZE(unicode), 3678 NULL, 3679 0); 3680} 3681 3682/* --- Unicode Escape Codec ----------------------------------------------- */ 3683 3684static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3685 3686PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 3687 Py_ssize_t size, 3688 const char *errors) 3689{ 3690 
const char *starts = s; 3691 Py_ssize_t startinpos; 3692 Py_ssize_t endinpos; 3693 Py_ssize_t outpos; 3694 int i; 3695 PyUnicodeObject *v; 3696 Py_UNICODE *p; 3697 const char *end; 3698 char* message; 3699 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3700 PyObject *errorHandler = NULL; 3701 PyObject *exc = NULL; 3702 3703 /* Escaped strings will always be longer than the resulting 3704 Unicode string, so we start with size here and then reduce the 3705 length after conversion to the true value. 3706 (but if the error callback returns a long replacement string 3707 we'll have to allocate more space) */ 3708 v = _PyUnicode_New(size); 3709 if (v == NULL) 3710 goto onError; 3711 if (size == 0) 3712 return (PyObject *)v; 3713 3714 p = PyUnicode_AS_UNICODE(v); 3715 end = s + size; 3716 3717 while (s < end) { 3718 unsigned char c; 3719 Py_UNICODE x; 3720 int digits; 3721 3722 /* Non-escape characters are interpreted as Unicode ordinals */ 3723 if (*s != '\\') { 3724 *p++ = (unsigned char) *s++; 3725 continue; 3726 } 3727 3728 startinpos = s-starts; 3729 /* \ - Escapes */ 3730 s++; 3731 c = *s++; 3732 if (s > end) 3733 c = '\0'; /* Invalid after \ */ 3734 switch (c) { 3735 3736 /* \x escapes */ 3737 case '\n': break; 3738 case '\\': *p++ = '\\'; break; 3739 case '\'': *p++ = '\''; break; 3740 case '\"': *p++ = '\"'; break; 3741 case 'b': *p++ = '\b'; break; 3742 case 'f': *p++ = '\014'; break; /* FF */ 3743 case 't': *p++ = '\t'; break; 3744 case 'n': *p++ = '\n'; break; 3745 case 'r': *p++ = '\r'; break; 3746 case 'v': *p++ = '\013'; break; /* VT */ 3747 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3748 3749 /* \OOO (octal) escapes */ 3750 case '0': case '1': case '2': case '3': 3751 case '4': case '5': case '6': case '7': 3752 x = s[-1] - '0'; 3753 if (s < end && '0' <= *s && *s <= '7') { 3754 x = (x<<3) + *s++ - '0'; 3755 if (s < end && '0' <= *s && *s <= '7') 3756 x = (x<<3) + *s++ - '0'; 3757 } 3758 *p++ = x; 3759 break; 3760 3761 /* hex 
escapes */ 3762 /* \xXX */ 3763 case 'x': 3764 digits = 2; 3765 message = "truncated \\xXX escape"; 3766 goto hexescape; 3767 3768 /* \uXXXX */ 3769 case 'u': 3770 digits = 4; 3771 message = "truncated \\uXXXX escape"; 3772 goto hexescape; 3773 3774 /* \UXXXXXXXX */ 3775 case 'U': 3776 digits = 8; 3777 message = "truncated \\UXXXXXXXX escape"; 3778 hexescape: 3779 chr = 0; 3780 outpos = p-PyUnicode_AS_UNICODE(v); 3781 if (s+digits>end) { 3782 endinpos = size; 3783 if (unicode_decode_call_errorhandler( 3784 errors, &errorHandler, 3785 "unicodeescape", "end of string in escape sequence", 3786 &starts, &end, &startinpos, &endinpos, &exc, &s, 3787 &v, &outpos, &p)) 3788 goto onError; 3789 goto nextByte; 3790 } 3791 for (i = 0; i < digits; ++i) { 3792 c = (unsigned char) s[i]; 3793 if (!Py_ISXDIGIT(c)) { 3794 endinpos = (s+i+1)-starts; 3795 if (unicode_decode_call_errorhandler( 3796 errors, &errorHandler, 3797 "unicodeescape", message, 3798 &starts, &end, &startinpos, &endinpos, &exc, &s, 3799 &v, &outpos, &p)) 3800 goto onError; 3801 goto nextByte; 3802 } 3803 chr = (chr<<4) & ~0xF; 3804 if (c >= '0' && c <= '9') 3805 chr += c - '0'; 3806 else if (c >= 'a' && c <= 'f') 3807 chr += 10 + c - 'a'; 3808 else 3809 chr += 10 + c - 'A'; 3810 } 3811 s += i; 3812 if (chr == 0xffffffff && PyErr_Occurred()) 3813 /* _decoding_error will have already written into the 3814 target buffer. */ 3815 break; 3816 store: 3817 /* when we get here, chr is a 32-bit unicode character */ 3818 if (chr <= 0xffff) 3819 /* UCS-2 character */ 3820 *p++ = (Py_UNICODE) chr; 3821 else if (chr <= 0x10ffff) { 3822 /* UCS-4 character. Either store directly, or as 3823 surrogate pair. 
*/ 3824#ifdef Py_UNICODE_WIDE 3825 *p++ = chr; 3826#else 3827 chr -= 0x10000L; 3828 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3829 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3830#endif 3831 } else { 3832 endinpos = s-starts; 3833 outpos = p-PyUnicode_AS_UNICODE(v); 3834 if (unicode_decode_call_errorhandler( 3835 errors, &errorHandler, 3836 "unicodeescape", "illegal Unicode character", 3837 &starts, &end, &startinpos, &endinpos, &exc, &s, 3838 &v, &outpos, &p)) 3839 goto onError; 3840 } 3841 break; 3842 3843 /* \N{name} */ 3844 case 'N': 3845 message = "malformed \\N character escape"; 3846 if (ucnhash_CAPI == NULL) { 3847 /* load the unicode data module */ 3848 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3849 if (ucnhash_CAPI == NULL) 3850 goto ucnhashError; 3851 } 3852 if (*s == '{') { 3853 const char *start = s+1; 3854 /* look for the closing brace */ 3855 while (*s != '}' && s < end) 3856 s++; 3857 if (s > start && s < end && *s == '}') { 3858 /* found a name. 
look it up in the unicode database */ 3859 message = "unknown Unicode character name"; 3860 s++; 3861 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3862 goto store; 3863 } 3864 } 3865 endinpos = s-starts; 3866 outpos = p-PyUnicode_AS_UNICODE(v); 3867 if (unicode_decode_call_errorhandler( 3868 errors, &errorHandler, 3869 "unicodeescape", message, 3870 &starts, &end, &startinpos, &endinpos, &exc, &s, 3871 &v, &outpos, &p)) 3872 goto onError; 3873 break; 3874 3875 default: 3876 if (s > end) { 3877 message = "\\ at end of string"; 3878 s--; 3879 endinpos = s-starts; 3880 outpos = p-PyUnicode_AS_UNICODE(v); 3881 if (unicode_decode_call_errorhandler( 3882 errors, &errorHandler, 3883 "unicodeescape", message, 3884 &starts, &end, &startinpos, &endinpos, &exc, &s, 3885 &v, &outpos, &p)) 3886 goto onError; 3887 } 3888 else { 3889 *p++ = '\\'; 3890 *p++ = (unsigned char)s[-1]; 3891 } 3892 break; 3893 } 3894 nextByte: 3895 ; 3896 } 3897 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3898 goto onError; 3899 Py_XDECREF(errorHandler); 3900 Py_XDECREF(exc); 3901 return (PyObject *)v; 3902 3903 ucnhashError: 3904 PyErr_SetString( 3905 PyExc_UnicodeError, 3906 "\\N escapes not supported (can't load unicodedata module)" 3907 ); 3908 Py_XDECREF(v); 3909 Py_XDECREF(errorHandler); 3910 Py_XDECREF(exc); 3911 return NULL; 3912 3913 onError: 3914 Py_XDECREF(v); 3915 Py_XDECREF(errorHandler); 3916 Py_XDECREF(exc); 3917 return NULL; 3918} 3919 3920/* Return a Unicode-Escape string version of the Unicode object. 3921 3922 If quotes is true, the string is enclosed in u"" or u'' quotes as 3923 appropriate. 
3924 3925*/ 3926 3927Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3928 Py_ssize_t size, 3929 Py_UNICODE ch) 3930{ 3931 /* like wcschr, but doesn't stop at NULL characters */ 3932 3933 while (size-- > 0) { 3934 if (*s == ch) 3935 return s; 3936 s++; 3937 } 3938 3939 return NULL; 3940} 3941 3942static const char *hexdigits = "0123456789abcdef"; 3943 3944PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3945 Py_ssize_t size) 3946{ 3947 PyObject *repr; 3948 char *p; 3949 3950#ifdef Py_UNICODE_WIDE 3951 const Py_ssize_t expandsize = 10; 3952#else 3953 const Py_ssize_t expandsize = 6; 3954#endif 3955 3956 /* XXX(nnorwitz): rather than over-allocating, it would be 3957 better to choose a different scheme. Perhaps scan the 3958 first N-chars of the string and allocate based on that size. 3959 */ 3960 /* Initial allocation is based on the longest-possible unichr 3961 escape. 3962 3963 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3964 unichr, so in this case it's the longest unichr escape. In 3965 narrow (UTF-16) builds this is five chars per source unichr 3966 since there are two unichrs in the surrogate pair, so in narrow 3967 (UTF-16) builds it's not the longest unichr escape. 3968 3969 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3970 so in the narrow (UTF-16) build case it's the longest unichr 3971 escape. 
3972 */ 3973 3974 if (size == 0) 3975 return PyBytes_FromStringAndSize(NULL, 0); 3976 3977 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3978 return PyErr_NoMemory(); 3979 3980 repr = PyBytes_FromStringAndSize(NULL, 3981 2 3982 + expandsize*size 3983 + 1); 3984 if (repr == NULL) 3985 return NULL; 3986 3987 p = PyBytes_AS_STRING(repr); 3988 3989 while (size-- > 0) { 3990 Py_UNICODE ch = *s++; 3991 3992 /* Escape backslashes */ 3993 if (ch == '\\') { 3994 *p++ = '\\'; 3995 *p++ = (char) ch; 3996 continue; 3997 } 3998 3999#ifdef Py_UNICODE_WIDE 4000 /* Map 21-bit characters to '\U00xxxxxx' */ 4001 else if (ch >= 0x10000) { 4002 *p++ = '\\'; 4003 *p++ = 'U'; 4004 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 4005 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 4006 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 4007 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 4008 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 4009 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 4010 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 4011 *p++ = hexdigits[ch & 0x0000000F]; 4012 continue; 4013 } 4014#else 4015 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4016 else if (ch >= 0xD800 && ch < 0xDC00) { 4017 Py_UNICODE ch2; 4018 Py_UCS4 ucs; 4019 4020 ch2 = *s++; 4021 size--; 4022 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4023 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4024 *p++ = '\\'; 4025 *p++ = 'U'; 4026 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 4027 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 4028 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 4029 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 4030 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 4031 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 4032 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 4033 *p++ = hexdigits[ucs & 0x0000000F]; 4034 continue; 4035 } 4036 /* Fall through: isolated surrogates are copied as-is */ 4037 s--; 4038 size++; 4039 } 4040#endif 4041 4042 /* Map 16-bit characters to '\uxxxx' */ 4043 if (ch >= 256) { 4044 *p++ = '\\'; 4045 *p++ = 
'u'; 4046 *p++ = hexdigits[(ch >> 12) & 0x000F]; 4047 *p++ = hexdigits[(ch >> 8) & 0x000F]; 4048 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4049 *p++ = hexdigits[ch & 0x000F]; 4050 } 4051 4052 /* Map special whitespace to '\t', \n', '\r' */ 4053 else if (ch == '\t') { 4054 *p++ = '\\'; 4055 *p++ = 't'; 4056 } 4057 else if (ch == '\n') { 4058 *p++ = '\\'; 4059 *p++ = 'n'; 4060 } 4061 else if (ch == '\r') { 4062 *p++ = '\\'; 4063 *p++ = 'r'; 4064 } 4065 4066 /* Map non-printable US ASCII to '\xhh' */ 4067 else if (ch < ' ' || ch >= 0x7F) { 4068 *p++ = '\\'; 4069 *p++ = 'x'; 4070 *p++ = hexdigits[(ch >> 4) & 0x000F]; 4071 *p++ = hexdigits[ch & 0x000F]; 4072 } 4073 4074 /* Copy everything else as-is */ 4075 else 4076 *p++ = (char) ch; 4077 } 4078 4079 assert(p - PyBytes_AS_STRING(repr) > 0); 4080 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 4081 return NULL; 4082 return repr; 4083} 4084 4085PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 4086{ 4087 PyObject *s; 4088 if (!PyUnicode_Check(unicode)) { 4089 PyErr_BadArgument(); 4090 return NULL; 4091 } 4092 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4093 PyUnicode_GET_SIZE(unicode)); 4094 return s; 4095} 4096 4097/* --- Raw Unicode Escape Codec ------------------------------------------- */ 4098 4099PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 4100 Py_ssize_t size, 4101 const char *errors) 4102{ 4103 const char *starts = s; 4104 Py_ssize_t startinpos; 4105 Py_ssize_t endinpos; 4106 Py_ssize_t outpos; 4107 PyUnicodeObject *v; 4108 Py_UNICODE *p; 4109 const char *end; 4110 const char *bs; 4111 PyObject *errorHandler = NULL; 4112 PyObject *exc = NULL; 4113 4114 /* Escaped strings will always be longer than the resulting 4115 Unicode string, so we start with size here and then reduce the 4116 length after conversion to the true value. 
(But decoding error 4117 handler might have to resize the string) */ 4118 v = _PyUnicode_New(size); 4119 if (v == NULL) 4120 goto onError; 4121 if (size == 0) 4122 return (PyObject *)v; 4123 p = PyUnicode_AS_UNICODE(v); 4124 end = s + size; 4125 while (s < end) { 4126 unsigned char c; 4127 Py_UCS4 x; 4128 int i; 4129 int count; 4130 4131 /* Non-escape characters are interpreted as Unicode ordinals */ 4132 if (*s != '\\') { 4133 *p++ = (unsigned char)*s++; 4134 continue; 4135 } 4136 startinpos = s-starts; 4137 4138 /* \u-escapes are only interpreted iff the number of leading 4139 backslashes if odd */ 4140 bs = s; 4141 for (;s < end;) { 4142 if (*s != '\\') 4143 break; 4144 *p++ = (unsigned char)*s++; 4145 } 4146 if (((s - bs) & 1) == 0 || 4147 s >= end || 4148 (*s != 'u' && *s != 'U')) { 4149 continue; 4150 } 4151 p--; 4152 count = *s=='u' ? 4 : 8; 4153 s++; 4154 4155 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 4156 outpos = p-PyUnicode_AS_UNICODE(v); 4157 for (x = 0, i = 0; i < count; ++i, ++s) { 4158 c = (unsigned char)*s; 4159 if (!Py_ISXDIGIT(c)) { 4160 endinpos = s-starts; 4161 if (unicode_decode_call_errorhandler( 4162 errors, &errorHandler, 4163 "rawunicodeescape", "truncated \\uXXXX", 4164 &starts, &end, &startinpos, &endinpos, &exc, &s, 4165 &v, &outpos, &p)) 4166 goto onError; 4167 goto nextByte; 4168 } 4169 x = (x<<4) & ~0xF; 4170 if (c >= '0' && c <= '9') 4171 x += c - '0'; 4172 else if (c >= 'a' && c <= 'f') 4173 x += 10 + c - 'a'; 4174 else 4175 x += 10 + c - 'A'; 4176 } 4177 if (x <= 0xffff) 4178 /* UCS-2 character */ 4179 *p++ = (Py_UNICODE) x; 4180 else if (x <= 0x10ffff) { 4181 /* UCS-4 character. Either store directly, or as 4182 surrogate pair. 
*/ 4183#ifdef Py_UNICODE_WIDE 4184 *p++ = (Py_UNICODE) x; 4185#else 4186 x -= 0x10000L; 4187 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 4188 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 4189#endif 4190 } else { 4191 endinpos = s-starts; 4192 outpos = p-PyUnicode_AS_UNICODE(v); 4193 if (unicode_decode_call_errorhandler( 4194 errors, &errorHandler, 4195 "rawunicodeescape", "\\Uxxxxxxxx out of range", 4196 &starts, &end, &startinpos, &endinpos, &exc, &s, 4197 &v, &outpos, &p)) 4198 goto onError; 4199 } 4200 nextByte: 4201 ; 4202 } 4203 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4204 goto onError; 4205 Py_XDECREF(errorHandler); 4206 Py_XDECREF(exc); 4207 return (PyObject *)v; 4208 4209 onError: 4210 Py_XDECREF(v); 4211 Py_XDECREF(errorHandler); 4212 Py_XDECREF(exc); 4213 return NULL; 4214} 4215 4216PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 4217 Py_ssize_t size) 4218{ 4219 PyObject *repr; 4220 char *p; 4221 char *q; 4222 4223#ifdef Py_UNICODE_WIDE 4224 const Py_ssize_t expandsize = 10; 4225#else 4226 const Py_ssize_t expandsize = 6; 4227#endif 4228 4229 if (size > PY_SSIZE_T_MAX / expandsize) 4230 return PyErr_NoMemory(); 4231 4232 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 4233 if (repr == NULL) 4234 return NULL; 4235 if (size == 0) 4236 return repr; 4237 4238 p = q = PyBytes_AS_STRING(repr); 4239 while (size-- > 0) { 4240 Py_UNICODE ch = *s++; 4241#ifdef Py_UNICODE_WIDE 4242 /* Map 32-bit characters to '\Uxxxxxxxx' */ 4243 if (ch >= 0x10000) { 4244 *p++ = '\\'; 4245 *p++ = 'U'; 4246 *p++ = hexdigits[(ch >> 28) & 0xf]; 4247 *p++ = hexdigits[(ch >> 24) & 0xf]; 4248 *p++ = hexdigits[(ch >> 20) & 0xf]; 4249 *p++ = hexdigits[(ch >> 16) & 0xf]; 4250 *p++ = hexdigits[(ch >> 12) & 0xf]; 4251 *p++ = hexdigits[(ch >> 8) & 0xf]; 4252 *p++ = hexdigits[(ch >> 4) & 0xf]; 4253 *p++ = hexdigits[ch & 15]; 4254 } 4255 else 4256#else 4257 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4258 if (ch >= 0xD800 && ch < 0xDC00) { 4259 
Py_UNICODE ch2; 4260 Py_UCS4 ucs; 4261 4262 ch2 = *s++; 4263 size--; 4264 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4265 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4266 *p++ = '\\'; 4267 *p++ = 'U'; 4268 *p++ = hexdigits[(ucs >> 28) & 0xf]; 4269 *p++ = hexdigits[(ucs >> 24) & 0xf]; 4270 *p++ = hexdigits[(ucs >> 20) & 0xf]; 4271 *p++ = hexdigits[(ucs >> 16) & 0xf]; 4272 *p++ = hexdigits[(ucs >> 12) & 0xf]; 4273 *p++ = hexdigits[(ucs >> 8) & 0xf]; 4274 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4275 *p++ = hexdigits[ucs & 0xf]; 4276 continue; 4277 } 4278 /* Fall through: isolated surrogates are copied as-is */ 4279 s--; 4280 size++; 4281 } 4282#endif 4283 /* Map 16-bit characters to '\uxxxx' */ 4284 if (ch >= 256) { 4285 *p++ = '\\'; 4286 *p++ = 'u'; 4287 *p++ = hexdigits[(ch >> 12) & 0xf]; 4288 *p++ = hexdigits[(ch >> 8) & 0xf]; 4289 *p++ = hexdigits[(ch >> 4) & 0xf]; 4290 *p++ = hexdigits[ch & 15]; 4291 } 4292 /* Copy everything else as-is */ 4293 else 4294 *p++ = (char) ch; 4295 } 4296 size = p - q; 4297 4298 assert(size > 0); 4299 if (_PyBytes_Resize(&repr, size) < 0) 4300 return NULL; 4301 return repr; 4302} 4303 4304PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4305{ 4306 PyObject *s; 4307 if (!PyUnicode_Check(unicode)) { 4308 PyErr_BadArgument(); 4309 return NULL; 4310 } 4311 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4312 PyUnicode_GET_SIZE(unicode)); 4313 4314 return s; 4315} 4316 4317/* --- Unicode Internal Codec ------------------------------------------- */ 4318 4319PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 4320 Py_ssize_t size, 4321 const char *errors) 4322{ 4323 const char *starts = s; 4324 Py_ssize_t startinpos; 4325 Py_ssize_t endinpos; 4326 Py_ssize_t outpos; 4327 PyUnicodeObject *v; 4328 Py_UNICODE *p; 4329 const char *end; 4330 const char *reason; 4331 PyObject *errorHandler = NULL; 4332 PyObject *exc = NULL; 4333 4334#ifdef Py_UNICODE_WIDE 4335 Py_UNICODE unimax = 
PyUnicode_GetMax(); 4336#endif 4337 4338 /* XXX overflow detection missing */ 4339 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4340 if (v == NULL) 4341 goto onError; 4342 if (PyUnicode_GetSize((PyObject *)v) == 0) 4343 return (PyObject *)v; 4344 p = PyUnicode_AS_UNICODE(v); 4345 end = s + size; 4346 4347 while (s < end) { 4348 memcpy(p, s, sizeof(Py_UNICODE)); 4349 /* We have to sanity check the raw data, otherwise doom looms for 4350 some malformed UCS-4 data. */ 4351 if ( 4352#ifdef Py_UNICODE_WIDE 4353 *p > unimax || *p < 0 || 4354#endif 4355 end-s < Py_UNICODE_SIZE 4356 ) 4357 { 4358 startinpos = s - starts; 4359 if (end-s < Py_UNICODE_SIZE) { 4360 endinpos = end-starts; 4361 reason = "truncated input"; 4362 } 4363 else { 4364 endinpos = s - starts + Py_UNICODE_SIZE; 4365 reason = "illegal code point (> 0x10FFFF)"; 4366 } 4367 outpos = p - PyUnicode_AS_UNICODE(v); 4368 if (unicode_decode_call_errorhandler( 4369 errors, &errorHandler, 4370 "unicode_internal", reason, 4371 &starts, &end, &startinpos, &endinpos, &exc, &s, 4372 &v, &outpos, &p)) { 4373 goto onError; 4374 } 4375 } 4376 else { 4377 p++; 4378 s += Py_UNICODE_SIZE; 4379 } 4380 } 4381 4382 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4383 goto onError; 4384 Py_XDECREF(errorHandler); 4385 Py_XDECREF(exc); 4386 return (PyObject *)v; 4387 4388 onError: 4389 Py_XDECREF(v); 4390 Py_XDECREF(errorHandler); 4391 Py_XDECREF(exc); 4392 return NULL; 4393} 4394 4395/* --- Latin-1 Codec ------------------------------------------------------ */ 4396 4397PyObject *PyUnicode_DecodeLatin1(const char *s, 4398 Py_ssize_t size, 4399 const char *errors) 4400{ 4401 PyUnicodeObject *v; 4402 Py_UNICODE *p; 4403 const char *e, *unrolled_end; 4404 4405 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 4406 if (size == 1) { 4407 Py_UNICODE r = *(unsigned char*)s; 4408 return PyUnicode_FromUnicode(&r, 1); 4409 } 4410 4411 v = _PyUnicode_New(size); 4412 if (v == NULL) 4413 goto onError; 4414 if (size == 0) 4415 return (PyObject *)v; 4416 p = PyUnicode_AS_UNICODE(v); 4417 e = s + size; 4418 /* Unrolling the copy makes it much faster by reducing the looping 4419 overhead. This is similar to what many memcpy() implementations do. */ 4420 unrolled_end = e - 4; 4421 while (s < unrolled_end) { 4422 p[0] = (unsigned char) s[0]; 4423 p[1] = (unsigned char) s[1]; 4424 p[2] = (unsigned char) s[2]; 4425 p[3] = (unsigned char) s[3]; 4426 s += 4; 4427 p += 4; 4428 } 4429 while (s < e) 4430 *p++ = (unsigned char) *s++; 4431 return (PyObject *)v; 4432 4433 onError: 4434 Py_XDECREF(v); 4435 return NULL; 4436} 4437 4438/* create or adjust a UnicodeEncodeError */ 4439static void make_encode_exception(PyObject **exceptionObject, 4440 const char *encoding, 4441 const Py_UNICODE *unicode, Py_ssize_t size, 4442 Py_ssize_t startpos, Py_ssize_t endpos, 4443 const char *reason) 4444{ 4445 if (*exceptionObject == NULL) { 4446 *exceptionObject = PyUnicodeEncodeError_Create( 4447 encoding, unicode, size, startpos, endpos, reason); 4448 } 4449 else { 4450 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4451 goto onError; 4452 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4453 goto onError; 4454 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4455 goto onError; 4456 return; 4457 onError: 4458 Py_DECREF(*exceptionObject); 4459 *exceptionObject = NULL; 4460 } 4461} 4462 4463/* raises a UnicodeEncodeError */ 4464static void raise_encode_exception(PyObject **exceptionObject, 4465 const char *encoding, 4466 const Py_UNICODE *unicode, Py_ssize_t size, 4467 Py_ssize_t startpos, Py_ssize_t endpos, 4468 const char *reason) 4469{ 4470 make_encode_exception(exceptionObject, 4471 encoding, unicode, size, startpos, endpos, reason); 4472 if (*exceptionObject != 
NULL) 4473 PyCodec_StrictErrors(*exceptionObject); 4474} 4475 4476/* error handling callback helper: 4477 build arguments, call the callback and check the arguments, 4478 put the result into newpos and return the replacement string, which 4479 has to be freed by the caller */ 4480static PyObject *unicode_encode_call_errorhandler(const char *errors, 4481 PyObject **errorHandler, 4482 const char *encoding, const char *reason, 4483 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4484 Py_ssize_t startpos, Py_ssize_t endpos, 4485 Py_ssize_t *newpos) 4486{ 4487 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4488 4489 PyObject *restuple; 4490 PyObject *resunicode; 4491 4492 if (*errorHandler == NULL) { 4493 *errorHandler = PyCodec_LookupError(errors); 4494 if (*errorHandler == NULL) 4495 return NULL; 4496 } 4497 4498 make_encode_exception(exceptionObject, 4499 encoding, unicode, size, startpos, endpos, reason); 4500 if (*exceptionObject == NULL) 4501 return NULL; 4502 4503 restuple = PyObject_CallFunctionObjArgs( 4504 *errorHandler, *exceptionObject, NULL); 4505 if (restuple == NULL) 4506 return NULL; 4507 if (!PyTuple_Check(restuple)) { 4508 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4509 Py_DECREF(restuple); 4510 return NULL; 4511 } 4512 if (!PyArg_ParseTuple(restuple, argparse, 4513 &resunicode, newpos)) { 4514 Py_DECREF(restuple); 4515 return NULL; 4516 } 4517 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4518 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4519 Py_DECREF(restuple); 4520 return NULL; 4521 } 4522 if (*newpos<0) 4523 *newpos = size+*newpos; 4524 if (*newpos<0 || *newpos>size) { 4525 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4526 Py_DECREF(restuple); 4527 return NULL; 4528 } 4529 Py_INCREF(resunicode); 4530 Py_DECREF(restuple); 4531 return resunicode; 4532} 4533 4534static PyObject *unicode_encode_ucs1(const Py_UNICODE 
*p, 4535 Py_ssize_t size, 4536 const char *errors, 4537 int limit) 4538{ 4539 /* output object */ 4540 PyObject *res; 4541 /* pointers to the beginning and end+1 of input */ 4542 const Py_UNICODE *startp = p; 4543 const Py_UNICODE *endp = p + size; 4544 /* pointer to the beginning of the unencodable characters */ 4545 /* const Py_UNICODE *badp = NULL; */ 4546 /* pointer into the output */ 4547 char *str; 4548 /* current output position */ 4549 Py_ssize_t ressize; 4550 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4551 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4552 PyObject *errorHandler = NULL; 4553 PyObject *exc = NULL; 4554 /* the following variable is used for caching string comparisons 4555 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4556 int known_errorHandler = -1; 4557 4558 /* allocate enough for a simple encoding without 4559 replacements, if we need more, we'll resize */ 4560 if (size == 0) 4561 return PyBytes_FromStringAndSize(NULL, 0); 4562 res = PyBytes_FromStringAndSize(NULL, size); 4563 if (res == NULL) 4564 return NULL; 4565 str = PyBytes_AS_STRING(res); 4566 ressize = size; 4567 4568 while (p<endp) { 4569 Py_UNICODE c = *p; 4570 4571 /* can we encode this? */ 4572 if (c<limit) { 4573 /* no overflow check, because we know that the space is enough */ 4574 *str++ = (char)c; 4575 ++p; 4576 } 4577 else { 4578 Py_ssize_t unicodepos = p-startp; 4579 Py_ssize_t requiredsize; 4580 PyObject *repunicode; 4581 Py_ssize_t repsize; 4582 Py_ssize_t newpos; 4583 Py_ssize_t respos; 4584 Py_UNICODE *uni2; 4585 /* startpos for collecting unencodable chars */ 4586 const Py_UNICODE *collstart = p; 4587 const Py_UNICODE *collend = p; 4588 /* find all unecodable characters */ 4589 while ((collend < endp) && ((*collend)>=limit)) 4590 ++collend; 4591 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 4592 if (known_errorHandler==-1) { 4593 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4594 known_errorHandler = 1; 4595 else if (!strcmp(errors, "replace")) 4596 known_errorHandler = 2; 4597 else if (!strcmp(errors, "ignore")) 4598 known_errorHandler = 3; 4599 else if (!strcmp(errors, "xmlcharrefreplace")) 4600 known_errorHandler = 4; 4601 else 4602 known_errorHandler = 0; 4603 } 4604 switch (known_errorHandler) { 4605 case 1: /* strict */ 4606 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4607 goto onError; 4608 case 2: /* replace */ 4609 while (collstart++<collend) 4610 *str++ = '?'; /* fall through */ 4611 case 3: /* ignore */ 4612 p = collend; 4613 break; 4614 case 4: /* xmlcharrefreplace */ 4615 respos = str - PyBytes_AS_STRING(res); 4616 /* determine replacement size (temporarily (mis)uses p) */ 4617 for (p = collstart, repsize = 0; p < collend; ++p) { 4618 if (*p<10) 4619 repsize += 2+1+1; 4620 else if (*p<100) 4621 repsize += 2+2+1; 4622 else if (*p<1000) 4623 repsize += 2+3+1; 4624 else if (*p<10000) 4625 repsize += 2+4+1; 4626#ifndef Py_UNICODE_WIDE 4627 else 4628 repsize += 2+5+1; 4629#else 4630 else if (*p<100000) 4631 repsize += 2+5+1; 4632 else if (*p<1000000) 4633 repsize += 2+6+1; 4634 else 4635 repsize += 2+7+1; 4636#endif 4637 } 4638 requiredsize = respos+repsize+(endp-collend); 4639 if (requiredsize > ressize) { 4640 if (requiredsize<2*ressize) 4641 requiredsize = 2*ressize; 4642 if (_PyBytes_Resize(&res, requiredsize)) 4643 goto onError; 4644 str = PyBytes_AS_STRING(res) + respos; 4645 ressize = requiredsize; 4646 } 4647 /* generate replacement (temporarily (mis)uses p) */ 4648 for (p = collstart; p < collend; ++p) { 4649 str += sprintf(str, "&#%d;", (int)*p); 4650 } 4651 p = collend; 4652 break; 4653 default: 4654 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4655 encoding, reason, startp, size, &exc, 4656 collstart-startp, collend-startp, 
&newpos); 4657 if (repunicode == NULL) 4658 goto onError; 4659 if (PyBytes_Check(repunicode)) { 4660 /* Directly copy bytes result to output. */ 4661 repsize = PyBytes_Size(repunicode); 4662 if (repsize > 1) { 4663 /* Make room for all additional bytes. */ 4664 respos = str - PyBytes_AS_STRING(res); 4665 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 4666 Py_DECREF(repunicode); 4667 goto onError; 4668 } 4669 str = PyBytes_AS_STRING(res) + respos; 4670 ressize += repsize-1; 4671 } 4672 memcpy(str, PyBytes_AsString(repunicode), repsize); 4673 str += repsize; 4674 p = startp + newpos; 4675 Py_DECREF(repunicode); 4676 break; 4677 } 4678 /* need more space? (at least enough for what we 4679 have+the replacement+the rest of the string, so 4680 we won't have to check space for encodable characters) */ 4681 respos = str - PyBytes_AS_STRING(res); 4682 repsize = PyUnicode_GET_SIZE(repunicode); 4683 requiredsize = respos+repsize+(endp-collend); 4684 if (requiredsize > ressize) { 4685 if (requiredsize<2*ressize) 4686 requiredsize = 2*ressize; 4687 if (_PyBytes_Resize(&res, requiredsize)) { 4688 Py_DECREF(repunicode); 4689 goto onError; 4690 } 4691 str = PyBytes_AS_STRING(res) + respos; 4692 ressize = requiredsize; 4693 } 4694 /* check if there is anything unencodable in the replacement 4695 and copy it to the output */ 4696 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4697 c = *uni2; 4698 if (c >= limit) { 4699 raise_encode_exception(&exc, encoding, startp, size, 4700 unicodepos, unicodepos+1, reason); 4701 Py_DECREF(repunicode); 4702 goto onError; 4703 } 4704 *str = (char)c; 4705 } 4706 p = startp + newpos; 4707 Py_DECREF(repunicode); 4708 } 4709 } 4710 } 4711 /* Resize if we allocated to much */ 4712 size = str - PyBytes_AS_STRING(res); 4713 if (size < ressize) { /* If this falls res will be NULL */ 4714 assert(size >= 0); 4715 if (_PyBytes_Resize(&res, size) < 0) 4716 goto onError; 4717 } 4718 4719 Py_XDECREF(errorHandler); 4720 
Py_XDECREF(exc); 4721 return res; 4722 4723 onError: 4724 Py_XDECREF(res); 4725 Py_XDECREF(errorHandler); 4726 Py_XDECREF(exc); 4727 return NULL; 4728} 4729 4730PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4731 Py_ssize_t size, 4732 const char *errors) 4733{ 4734 return unicode_encode_ucs1(p, size, errors, 256); 4735} 4736 4737PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 4738{ 4739 if (!PyUnicode_Check(unicode)) { 4740 PyErr_BadArgument(); 4741 return NULL; 4742 } 4743 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4744 PyUnicode_GET_SIZE(unicode), 4745 NULL); 4746} 4747 4748/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4749 4750PyObject *PyUnicode_DecodeASCII(const char *s, 4751 Py_ssize_t size, 4752 const char *errors) 4753{ 4754 const char *starts = s; 4755 PyUnicodeObject *v; 4756 Py_UNICODE *p; 4757 Py_ssize_t startinpos; 4758 Py_ssize_t endinpos; 4759 Py_ssize_t outpos; 4760 const char *e; 4761 PyObject *errorHandler = NULL; 4762 PyObject *exc = NULL; 4763 4764 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4765 if (size == 1 && *(unsigned char*)s < 128) { 4766 Py_UNICODE r = *(unsigned char*)s; 4767 return PyUnicode_FromUnicode(&r, 1); 4768 } 4769 4770 v = _PyUnicode_New(size); 4771 if (v == NULL) 4772 goto onError; 4773 if (size == 0) 4774 return (PyObject *)v; 4775 p = PyUnicode_AS_UNICODE(v); 4776 e = s + size; 4777 while (s < e) { 4778 register unsigned char c = (unsigned char)*s; 4779 if (c < 128) { 4780 *p++ = c; 4781 ++s; 4782 } 4783 else { 4784 startinpos = s-starts; 4785 endinpos = startinpos + 1; 4786 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4787 if (unicode_decode_call_errorhandler( 4788 errors, &errorHandler, 4789 "ascii", "ordinal not in range(128)", 4790 &starts, &e, &startinpos, &endinpos, &exc, &s, 4791 &v, &outpos, &p)) 4792 goto onError; 4793 } 4794 } 4795 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4796 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4797 goto onError; 4798 Py_XDECREF(errorHandler); 4799 Py_XDECREF(exc); 4800 return (PyObject *)v; 4801 4802 onError: 4803 Py_XDECREF(v); 4804 Py_XDECREF(errorHandler); 4805 Py_XDECREF(exc); 4806 return NULL; 4807} 4808 4809PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 4810 Py_ssize_t size, 4811 const char *errors) 4812{ 4813 return unicode_encode_ucs1(p, size, errors, 128); 4814} 4815 4816PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 4817{ 4818 if (!PyUnicode_Check(unicode)) { 4819 PyErr_BadArgument(); 4820 return NULL; 4821 } 4822 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4823 PyUnicode_GET_SIZE(unicode), 4824 NULL); 4825} 4826 4827#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4828 4829/* --- MBCS codecs for Windows -------------------------------------------- */ 4830 4831#if SIZEOF_INT < SIZEOF_SIZE_T 4832#define NEED_RETRY 4833#endif 4834 4835/* XXX This code is limited to "true" double-byte encodings, as 4836 a) it assumes an incomplete character consists of a single byte, and 4837 b) IsDBCSLeadByte (probably) does 
not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return nonzero if the byte at s[offset] passes IsDBCSLeadByte() and, going
   by CharPrev(), is not preceded by another lead byte one position earlier:
   either there is no previous character, the previous character does not
   start with a lead byte, or the previous character is two bytes long.
   Returns 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;

    if (IsDBCSLeadByte(*curr)) {
        const char *prev = CharPrev(s, curr);
        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    }
    return 0;
}

/*
 * Decode MBCS string into unicode object. If 'final' is set, converts
 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
 *
 * If *v is NULL a new unicode object is created; otherwise the decoded
 * characters are appended to the existing object by resizing it.
 * Only errors NULL/"strict" and "ignore" are accepted; any other value
 * raises ValueError.
 */
static int decode_mbcs(PyUnicodeObject **v,
                       const char *s, /* MBCS string */
                       int size, /* sizeof MBCS string */
                       int final,
                       const char *errors)
{
    Py_UNICODE *p;
    Py_ssize_t n;
    DWORD usize;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0)
        flags = MB_ERR_INVALID_CHARS;
    else if (strcmp(errors, "ignore")==0)
        flags = 0;
    else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
        if (usize==0)
            goto mbcs_decode_error;
    } else
        usize = 0;

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (usize > 0) {
        /* Write after the n characters that were already present. */
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
            goto mbcs_decode_error;
        }
    }
    return size;

mbcs_decode_error:
    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
       we raise a UnicodeDecodeError - else it is a 'generic'
       windows error
     */
    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
        /* Ideally, we should get reason from FormatMessage - this
           is the Windows 2000 English version of the message
        */
        PyObject *exc = NULL;
        const char *reason = "No mapping for the Unicode character exists "
                             "in the target multi-byte code page.";
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
    } else {
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
    }
    return -1;
}

/* Decode the size bytes at s with decode_mbcs().
   If consumed is non-NULL, decode_mbcs() is called with final == 0 and
   *consumed receives the number of bytes actually decoded; if consumed is
   NULL the call is made with final == 1.  Returns the new unicode object,
   or NULL on error. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
    /* Inputs longer than INT_MAX are decoded in INT_MAX-sized pieces, each
       appended to v; the 'else' below pairs with the unconditional call
       after #endif. */
  retry:
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed, errors);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}

/* Decode the size bytes at s as MBCS: PyUnicode_DecodeMBCSStateful() with
   consumed == NULL. */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 */
static int encode_mbcs(PyObject **repr,
                       const Py_UNICODE *p, /* unicode */
                       int size, /* size of unicode */
                       const char* errors)
{
    /* If *repr is NULL a new bytes object is created; otherwise the encoded
       bytes are appended to the existing object by resizing it.
       Only errors NULL/"strict" and "replace" are accepted; any other value
       raises ValueError. */
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar;
    int mbcssize;
    Py_ssize_t n;
    PyObject *exc = NULL;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0) {
        /* strict: have WideCharToMultiByte report use of the default char */
        flags = WC_NO_BEST_FIT_CHARS;
        pusedDefaultChar = &usedDefaultChar;
    } else if (strcmp(errors, "replace")==0) {
        flags = 0;
        pusedDefaultChar = NULL;
    } else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
                                       NULL, pusedDefaultChar);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        /* If we used a default char, then we failed! */
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    } else {
        mbcssize = 0;
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend string object */
        n = PyBytes_Size(*repr);
        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* Write after the n bytes that were already present. */
        char *s = PyBytes_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
                                     NULL, pusedDefaultChar)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    }
    return 0;

mbcs_encode_error:
    /* The error range passed is (0, 0): no specific position is reported. */
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
    Py_XDECREF(exc);
    return -1;
}

/* Encode the size characters at p with encode_mbcs().
   Returns a new bytes object, or NULL on error. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
    /* Inputs longer than INT_MAX are encoded in INT_MAX-sized pieces, each
       appended to repr; the 'else' below pairs with the unconditional call
       after #endif. */
  retry:
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX, errors);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size, errors);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}

/* Encode a unicode object as MBCS, passing errors=NULL.
   If the argument fails PyUnicode_Check(), PyErr_BadArgument() is called
   and NULL is returned. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

#undef NEED_RETRY

#endif /* MS_WINDOWS */

/* --- Character Mapping Codec -------------------------------------------- */

PyObject
*PyUnicode_DecodeCharmap(const char *s, 5112 Py_ssize_t size, 5113 PyObject *mapping, 5114 const char *errors) 5115{ 5116 const char *starts = s; 5117 Py_ssize_t startinpos; 5118 Py_ssize_t endinpos; 5119 Py_ssize_t outpos; 5120 const char *e; 5121 PyUnicodeObject *v; 5122 Py_UNICODE *p; 5123 Py_ssize_t extrachars = 0; 5124 PyObject *errorHandler = NULL; 5125 PyObject *exc = NULL; 5126 Py_UNICODE *mapstring = NULL; 5127 Py_ssize_t maplen = 0; 5128 5129 /* Default to Latin-1 */ 5130 if (mapping == NULL) 5131 return PyUnicode_DecodeLatin1(s, size, errors); 5132 5133 v = _PyUnicode_New(size); 5134 if (v == NULL) 5135 goto onError; 5136 if (size == 0) 5137 return (PyObject *)v; 5138 p = PyUnicode_AS_UNICODE(v); 5139 e = s + size; 5140 if (PyUnicode_CheckExact(mapping)) { 5141 mapstring = PyUnicode_AS_UNICODE(mapping); 5142 maplen = PyUnicode_GET_SIZE(mapping); 5143 while (s < e) { 5144 unsigned char ch = *s; 5145 Py_UNICODE x = 0xfffe; /* illegal value */ 5146 5147 if (ch < maplen) 5148 x = mapstring[ch]; 5149 5150 if (x == 0xfffe) { 5151 /* undefined mapping */ 5152 outpos = p-PyUnicode_AS_UNICODE(v); 5153 startinpos = s-starts; 5154 endinpos = startinpos+1; 5155 if (unicode_decode_call_errorhandler( 5156 errors, &errorHandler, 5157 "charmap", "character maps to <undefined>", 5158 &starts, &e, &startinpos, &endinpos, &exc, &s, 5159 &v, &outpos, &p)) { 5160 goto onError; 5161 } 5162 continue; 5163 } 5164 *p++ = x; 5165 ++s; 5166 } 5167 } 5168 else { 5169 while (s < e) { 5170 unsigned char ch = *s; 5171 PyObject *w, *x; 5172 5173 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 5174 w = PyLong_FromLong((long)ch); 5175 if (w == NULL) 5176 goto onError; 5177 x = PyObject_GetItem(mapping, w); 5178 Py_DECREF(w); 5179 if (x == NULL) { 5180 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5181 /* No mapping found means: mapping is undefined. 
*/ 5182 PyErr_Clear(); 5183 x = Py_None; 5184 Py_INCREF(x); 5185 } else 5186 goto onError; 5187 } 5188 5189 /* Apply mapping */ 5190 if (PyLong_Check(x)) { 5191 long value = PyLong_AS_LONG(x); 5192 if (value < 0 || value > 65535) { 5193 PyErr_SetString(PyExc_TypeError, 5194 "character mapping must be in range(65536)"); 5195 Py_DECREF(x); 5196 goto onError; 5197 } 5198 *p++ = (Py_UNICODE)value; 5199 } 5200 else if (x == Py_None) { 5201 /* undefined mapping */ 5202 outpos = p-PyUnicode_AS_UNICODE(v); 5203 startinpos = s-starts; 5204 endinpos = startinpos+1; 5205 if (unicode_decode_call_errorhandler( 5206 errors, &errorHandler, 5207 "charmap", "character maps to <undefined>", 5208 &starts, &e, &startinpos, &endinpos, &exc, &s, 5209 &v, &outpos, &p)) { 5210 Py_DECREF(x); 5211 goto onError; 5212 } 5213 Py_DECREF(x); 5214 continue; 5215 } 5216 else if (PyUnicode_Check(x)) { 5217 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 5218 5219 if (targetsize == 1) 5220 /* 1-1 mapping */ 5221 *p++ = *PyUnicode_AS_UNICODE(x); 5222 5223 else if (targetsize > 1) { 5224 /* 1-n mapping */ 5225 if (targetsize > extrachars) { 5226 /* resize first */ 5227 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 5228 Py_ssize_t needed = (targetsize - extrachars) + \ 5229 (targetsize << 2); 5230 extrachars += needed; 5231 /* XXX overflow detection missing */ 5232 if (_PyUnicode_Resize(&v, 5233 PyUnicode_GET_SIZE(v) + needed) < 0) { 5234 Py_DECREF(x); 5235 goto onError; 5236 } 5237 p = PyUnicode_AS_UNICODE(v) + oldpos; 5238 } 5239 Py_UNICODE_COPY(p, 5240 PyUnicode_AS_UNICODE(x), 5241 targetsize); 5242 p += targetsize; 5243 extrachars -= targetsize; 5244 } 5245 /* 1-0 mapping: skip the character */ 5246 } 5247 else { 5248 /* wrong return value */ 5249 PyErr_SetString(PyExc_TypeError, 5250 "character mapping must return integer, None or str"); 5251 Py_DECREF(x); 5252 goto onError; 5253 } 5254 Py_DECREF(x); 5255 ++s; 5256 } 5257 } 5258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 5259 if 
(_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5260 goto onError; 5261 Py_XDECREF(errorHandler); 5262 Py_XDECREF(exc); 5263 return (PyObject *)v; 5264 5265 onError: 5266 Py_XDECREF(errorHandler); 5267 Py_XDECREF(exc); 5268 Py_XDECREF(v); 5269 return NULL; 5270} 5271 5272/* Charmap encoding: the lookup table */ 5273 5274struct encoding_map{ 5275 PyObject_HEAD 5276 unsigned char level1[32]; 5277 int count2, count3; 5278 unsigned char level23[1]; 5279}; 5280 5281static PyObject* 5282encoding_map_size(PyObject *obj, PyObject* args) 5283{ 5284 struct encoding_map *map = (struct encoding_map*)obj; 5285 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5286 128*map->count3); 5287} 5288 5289static PyMethodDef encoding_map_methods[] = { 5290 {"size", encoding_map_size, METH_NOARGS, 5291 PyDoc_STR("Return the size (in bytes) of this object") }, 5292 { 0 } 5293}; 5294 5295static void 5296encoding_map_dealloc(PyObject* o) 5297{ 5298 PyObject_FREE(o); 5299} 5300 5301static PyTypeObject EncodingMapType = { 5302 PyVarObject_HEAD_INIT(NULL, 0) 5303 "EncodingMap", /*tp_name*/ 5304 sizeof(struct encoding_map), /*tp_basicsize*/ 5305 0, /*tp_itemsize*/ 5306 /* methods */ 5307 encoding_map_dealloc, /*tp_dealloc*/ 5308 0, /*tp_print*/ 5309 0, /*tp_getattr*/ 5310 0, /*tp_setattr*/ 5311 0, /*tp_reserved*/ 5312 0, /*tp_repr*/ 5313 0, /*tp_as_number*/ 5314 0, /*tp_as_sequence*/ 5315 0, /*tp_as_mapping*/ 5316 0, /*tp_hash*/ 5317 0, /*tp_call*/ 5318 0, /*tp_str*/ 5319 0, /*tp_getattro*/ 5320 0, /*tp_setattro*/ 5321 0, /*tp_as_buffer*/ 5322 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5323 0, /*tp_doc*/ 5324 0, /*tp_traverse*/ 5325 0, /*tp_clear*/ 5326 0, /*tp_richcompare*/ 5327 0, /*tp_weaklistoffset*/ 5328 0, /*tp_iter*/ 5329 0, /*tp_iternext*/ 5330 encoding_map_methods, /*tp_methods*/ 5331 0, /*tp_members*/ 5332 0, /*tp_getset*/ 5333 0, /*tp_base*/ 5334 0, /*tp_dict*/ 5335 0, /*tp_descr_get*/ 5336 0, /*tp_descr_set*/ 5337 0, /*tp_dictoffset*/ 5338 0, /*tp_init*/ 5339 0, /*tp_alloc*/ 
5340 0, /*tp_new*/ 5341 0, /*tp_free*/ 5342 0, /*tp_is_gc*/ 5343}; 5344 5345PyObject* 5346PyUnicode_BuildEncodingMap(PyObject* string) 5347{ 5348 Py_UNICODE *decode; 5349 PyObject *result; 5350 struct encoding_map *mresult; 5351 int i; 5352 int need_dict = 0; 5353 unsigned char level1[32]; 5354 unsigned char level2[512]; 5355 unsigned char *mlevel1, *mlevel2, *mlevel3; 5356 int count2 = 0, count3 = 0; 5357 5358 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5359 PyErr_BadArgument(); 5360 return NULL; 5361 } 5362 decode = PyUnicode_AS_UNICODE(string); 5363 memset(level1, 0xFF, sizeof level1); 5364 memset(level2, 0xFF, sizeof level2); 5365 5366 /* If there isn't a one-to-one mapping of NULL to \0, 5367 or if there are non-BMP characters, we need to use 5368 a mapping dictionary. */ 5369 if (decode[0] != 0) 5370 need_dict = 1; 5371 for (i = 1; i < 256; i++) { 5372 int l1, l2; 5373 if (decode[i] == 0 5374#ifdef Py_UNICODE_WIDE 5375 || decode[i] > 0xFFFF 5376#endif 5377 ) { 5378 need_dict = 1; 5379 break; 5380 } 5381 if (decode[i] == 0xFFFE) 5382 /* unmapped character */ 5383 continue; 5384 l1 = decode[i] >> 11; 5385 l2 = decode[i] >> 7; 5386 if (level1[l1] == 0xFF) 5387 level1[l1] = count2++; 5388 if (level2[l2] == 0xFF) 5389 level2[l2] = count3++; 5390 } 5391 5392 if (count2 >= 0xFF || count3 >= 0xFF) 5393 need_dict = 1; 5394 5395 if (need_dict) { 5396 PyObject *result = PyDict_New(); 5397 PyObject *key, *value; 5398 if (!result) 5399 return NULL; 5400 for (i = 0; i < 256; i++) { 5401 key = value = NULL; 5402 key = PyLong_FromLong(decode[i]); 5403 value = PyLong_FromLong(i); 5404 if (!key || !value) 5405 goto failed1; 5406 if (PyDict_SetItem(result, key, value) == -1) 5407 goto failed1; 5408 Py_DECREF(key); 5409 Py_DECREF(value); 5410 } 5411 return result; 5412 failed1: 5413 Py_XDECREF(key); 5414 Py_XDECREF(value); 5415 Py_DECREF(result); 5416 return NULL; 5417 } 5418 5419 /* Create a three-level trie */ 5420 result = PyObject_MALLOC(sizeof(struct 
encoding_map) + 5421 16*count2 + 128*count3 - 1); 5422 if (!result) 5423 return PyErr_NoMemory(); 5424 PyObject_Init(result, &EncodingMapType); 5425 mresult = (struct encoding_map*)result; 5426 mresult->count2 = count2; 5427 mresult->count3 = count3; 5428 mlevel1 = mresult->level1; 5429 mlevel2 = mresult->level23; 5430 mlevel3 = mresult->level23 + 16*count2; 5431 memcpy(mlevel1, level1, 32); 5432 memset(mlevel2, 0xFF, 16*count2); 5433 memset(mlevel3, 0, 128*count3); 5434 count3 = 0; 5435 for (i = 1; i < 256; i++) { 5436 int o1, o2, o3, i2, i3; 5437 if (decode[i] == 0xFFFE) 5438 /* unmapped character */ 5439 continue; 5440 o1 = decode[i]>>11; 5441 o2 = (decode[i]>>7) & 0xF; 5442 i2 = 16*mlevel1[o1] + o2; 5443 if (mlevel2[i2] == 0xFF) 5444 mlevel2[i2] = count3++; 5445 o3 = decode[i] & 0x7F; 5446 i3 = 128*mlevel2[i2] + o3; 5447 mlevel3[i3] = i; 5448 } 5449 return result; 5450} 5451 5452static int 5453encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5454{ 5455 struct encoding_map *map = (struct encoding_map*)mapping; 5456 int l1 = c>>11; 5457 int l2 = (c>>7) & 0xF; 5458 int l3 = c & 0x7F; 5459 int i; 5460 5461#ifdef Py_UNICODE_WIDE 5462 if (c > 0xFFFF) { 5463 return -1; 5464 } 5465#endif 5466 if (c == 0) 5467 return 0; 5468 /* level 1*/ 5469 i = map->level1[l1]; 5470 if (i == 0xFF) { 5471 return -1; 5472 } 5473 /* level 2*/ 5474 i = map->level23[16*i+l2]; 5475 if (i == 0xFF) { 5476 return -1; 5477 } 5478 /* level 3 */ 5479 i = map->level23[16*map->count2 + 128*i + l3]; 5480 if (i == 0) { 5481 return -1; 5482 } 5483 return i; 5484} 5485 5486/* Lookup the character ch in the mapping. If the character 5487 can't be found, Py_None is returned (or NULL, if another 5488 error occurred). 
*/ 5489static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5490{ 5491 PyObject *w = PyLong_FromLong((long)c); 5492 PyObject *x; 5493 5494 if (w == NULL) 5495 return NULL; 5496 x = PyObject_GetItem(mapping, w); 5497 Py_DECREF(w); 5498 if (x == NULL) { 5499 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5500 /* No mapping found means: mapping is undefined. */ 5501 PyErr_Clear(); 5502 x = Py_None; 5503 Py_INCREF(x); 5504 return x; 5505 } else 5506 return NULL; 5507 } 5508 else if (x == Py_None) 5509 return x; 5510 else if (PyLong_Check(x)) { 5511 long value = PyLong_AS_LONG(x); 5512 if (value < 0 || value > 255) { 5513 PyErr_SetString(PyExc_TypeError, 5514 "character mapping must be in range(256)"); 5515 Py_DECREF(x); 5516 return NULL; 5517 } 5518 return x; 5519 } 5520 else if (PyBytes_Check(x)) 5521 return x; 5522 else { 5523 /* wrong return value */ 5524 PyErr_Format(PyExc_TypeError, 5525 "character mapping must return integer, bytes or None, not %.400s", 5526 x->ob_type->tp_name); 5527 Py_DECREF(x); 5528 return NULL; 5529 } 5530} 5531 5532static int 5533charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5534{ 5535 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5536 /* exponentially overallocate to minimize reallocations */ 5537 if (requiredsize < 2*outsize) 5538 requiredsize = 2*outsize; 5539 if (_PyBytes_Resize(outobj, requiredsize)) 5540 return -1; 5541 return 0; 5542} 5543 5544typedef enum charmapencode_result { 5545 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5546}charmapencode_result; 5547/* lookup the character, put the result in the output string and adjust 5548 various state variables. Resize the output bytes object if not enough 5549 space is available. Return a new reference to the object that 5550 was put in the output buffer, or Py_None, if the mapping was undefined 5551 (in which case no character was written) or NULL, if a 5552 reallocation error occurred. 
The caller must decref the result */ 5553static 5554charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 5555 PyObject **outobj, Py_ssize_t *outpos) 5556{ 5557 PyObject *rep; 5558 char *outstart; 5559 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5560 5561 if (Py_TYPE(mapping) == &EncodingMapType) { 5562 int res = encoding_map_lookup(c, mapping); 5563 Py_ssize_t requiredsize = *outpos+1; 5564 if (res == -1) 5565 return enc_FAILED; 5566 if (outsize<requiredsize) 5567 if (charmapencode_resize(outobj, outpos, requiredsize)) 5568 return enc_EXCEPTION; 5569 outstart = PyBytes_AS_STRING(*outobj); 5570 outstart[(*outpos)++] = (char)res; 5571 return enc_SUCCESS; 5572 } 5573 5574 rep = charmapencode_lookup(c, mapping); 5575 if (rep==NULL) 5576 return enc_EXCEPTION; 5577 else if (rep==Py_None) { 5578 Py_DECREF(rep); 5579 return enc_FAILED; 5580 } else { 5581 if (PyLong_Check(rep)) { 5582 Py_ssize_t requiredsize = *outpos+1; 5583 if (outsize<requiredsize) 5584 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5585 Py_DECREF(rep); 5586 return enc_EXCEPTION; 5587 } 5588 outstart = PyBytes_AS_STRING(*outobj); 5589 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5590 } 5591 else { 5592 const char *repchars = PyBytes_AS_STRING(rep); 5593 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5594 Py_ssize_t requiredsize = *outpos+repsize; 5595 if (outsize<requiredsize) 5596 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5597 Py_DECREF(rep); 5598 return enc_EXCEPTION; 5599 } 5600 outstart = PyBytes_AS_STRING(*outobj); 5601 memcpy(outstart + *outpos, repchars, repsize); 5602 *outpos += repsize; 5603 } 5604 } 5605 Py_DECREF(rep); 5606 return enc_SUCCESS; 5607} 5608 5609/* handle an error in PyUnicode_EncodeCharmap 5610 Return 0 on success, -1 on error */ 5611static 5612int charmap_encoding_error( 5613 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5614 PyObject **exceptionObject, 5615 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 5616 PyObject **res, Py_ssize_t *respos) 5617{ 5618 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5619 Py_ssize_t repsize; 5620 Py_ssize_t newpos; 5621 Py_UNICODE *uni2; 5622 /* startpos for collecting unencodable chars */ 5623 Py_ssize_t collstartpos = *inpos; 5624 Py_ssize_t collendpos = *inpos+1; 5625 Py_ssize_t collpos; 5626 char *encoding = "charmap"; 5627 char *reason = "character maps to <undefined>"; 5628 charmapencode_result x; 5629 5630 /* find all unencodable characters */ 5631 while (collendpos < size) { 5632 PyObject *rep; 5633 if (Py_TYPE(mapping) == &EncodingMapType) { 5634 int res = encoding_map_lookup(p[collendpos], mapping); 5635 if (res != -1) 5636 break; 5637 ++collendpos; 5638 continue; 5639 } 5640 5641 rep = charmapencode_lookup(p[collendpos], mapping); 5642 if (rep==NULL) 5643 return -1; 5644 else if (rep!=Py_None) { 5645 Py_DECREF(rep); 5646 break; 5647 } 5648 Py_DECREF(rep); 5649 ++collendpos; 5650 } 5651 /* cache callback name lookup 5652 * (if not done yet, i.e. 
it's the first error) */ 5653 if (*known_errorHandler==-1) { 5654 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5655 *known_errorHandler = 1; 5656 else if (!strcmp(errors, "replace")) 5657 *known_errorHandler = 2; 5658 else if (!strcmp(errors, "ignore")) 5659 *known_errorHandler = 3; 5660 else if (!strcmp(errors, "xmlcharrefreplace")) 5661 *known_errorHandler = 4; 5662 else 5663 *known_errorHandler = 0; 5664 } 5665 switch (*known_errorHandler) { 5666 case 1: /* strict */ 5667 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5668 return -1; 5669 case 2: /* replace */ 5670 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5671 x = charmapencode_output('?', mapping, res, respos); 5672 if (x==enc_EXCEPTION) { 5673 return -1; 5674 } 5675 else if (x==enc_FAILED) { 5676 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5677 return -1; 5678 } 5679 } 5680 /* fall through */ 5681 case 3: /* ignore */ 5682 *inpos = collendpos; 5683 break; 5684 case 4: /* xmlcharrefreplace */ 5685 /* generate replacement (temporarily (mis)uses p) */ 5686 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5687 char buffer[2+29+1+1]; 5688 char *cp; 5689 sprintf(buffer, "&#%d;", (int)p[collpos]); 5690 for (cp = buffer; *cp; ++cp) { 5691 x = charmapencode_output(*cp, mapping, res, respos); 5692 if (x==enc_EXCEPTION) 5693 return -1; 5694 else if (x==enc_FAILED) { 5695 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5696 return -1; 5697 } 5698 } 5699 } 5700 *inpos = collendpos; 5701 break; 5702 default: 5703 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5704 encoding, reason, p, size, exceptionObject, 5705 collstartpos, collendpos, &newpos); 5706 if (repunicode == NULL) 5707 return -1; 5708 if (PyBytes_Check(repunicode)) { 5709 /* Directly copy bytes result to output. 
*/ 5710 Py_ssize_t outsize = PyBytes_Size(*res); 5711 Py_ssize_t requiredsize; 5712 repsize = PyBytes_Size(repunicode); 5713 requiredsize = *respos + repsize; 5714 if (requiredsize > outsize) 5715 /* Make room for all additional bytes. */ 5716 if (charmapencode_resize(res, respos, requiredsize)) { 5717 Py_DECREF(repunicode); 5718 return -1; 5719 } 5720 memcpy(PyBytes_AsString(*res) + *respos, 5721 PyBytes_AsString(repunicode), repsize); 5722 *respos += repsize; 5723 *inpos = newpos; 5724 Py_DECREF(repunicode); 5725 break; 5726 } 5727 /* generate replacement */ 5728 repsize = PyUnicode_GET_SIZE(repunicode); 5729 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5730 x = charmapencode_output(*uni2, mapping, res, respos); 5731 if (x==enc_EXCEPTION) { 5732 return -1; 5733 } 5734 else if (x==enc_FAILED) { 5735 Py_DECREF(repunicode); 5736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5737 return -1; 5738 } 5739 } 5740 *inpos = newpos; 5741 Py_DECREF(repunicode); 5742 } 5743 return 0; 5744} 5745 5746PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5747 Py_ssize_t size, 5748 PyObject *mapping, 5749 const char *errors) 5750{ 5751 /* output object */ 5752 PyObject *res = NULL; 5753 /* current input position */ 5754 Py_ssize_t inpos = 0; 5755 /* current output position */ 5756 Py_ssize_t respos = 0; 5757 PyObject *errorHandler = NULL; 5758 PyObject *exc = NULL; 5759 /* the following variable is used for caching string comparisons 5760 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5761 * 3=ignore, 4=xmlcharrefreplace */ 5762 int known_errorHandler = -1; 5763 5764 /* Default to Latin-1 */ 5765 if (mapping == NULL) 5766 return PyUnicode_EncodeLatin1(p, size, errors); 5767 5768 /* allocate enough for a simple encoding without 5769 replacements, if we need more, we'll resize */ 5770 res = PyBytes_FromStringAndSize(NULL, size); 5771 if (res == NULL) 5772 goto onError; 5773 if (size == 0) 5774 return 
res; 5775 5776 while (inpos<size) { 5777 /* try to encode it */ 5778 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5779 if (x==enc_EXCEPTION) /* error */ 5780 goto onError; 5781 if (x==enc_FAILED) { /* unencodable character */ 5782 if (charmap_encoding_error(p, size, &inpos, mapping, 5783 &exc, 5784 &known_errorHandler, &errorHandler, errors, 5785 &res, &respos)) { 5786 goto onError; 5787 } 5788 } 5789 else 5790 /* done with this character => adjust input position */ 5791 ++inpos; 5792 } 5793 5794 /* Resize if we allocated to much */ 5795 if (respos<PyBytes_GET_SIZE(res)) 5796 if (_PyBytes_Resize(&res, respos) < 0) 5797 goto onError; 5798 5799 Py_XDECREF(exc); 5800 Py_XDECREF(errorHandler); 5801 return res; 5802 5803 onError: 5804 Py_XDECREF(res); 5805 Py_XDECREF(exc); 5806 Py_XDECREF(errorHandler); 5807 return NULL; 5808} 5809 5810PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 5811 PyObject *mapping) 5812{ 5813 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5814 PyErr_BadArgument(); 5815 return NULL; 5816 } 5817 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5818 PyUnicode_GET_SIZE(unicode), 5819 mapping, 5820 NULL); 5821} 5822 5823/* create or adjust a UnicodeTranslateError */ 5824static void make_translate_exception(PyObject **exceptionObject, 5825 const Py_UNICODE *unicode, Py_ssize_t size, 5826 Py_ssize_t startpos, Py_ssize_t endpos, 5827 const char *reason) 5828{ 5829 if (*exceptionObject == NULL) { 5830 *exceptionObject = PyUnicodeTranslateError_Create( 5831 unicode, size, startpos, endpos, reason); 5832 } 5833 else { 5834 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5835 goto onError; 5836 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5837 goto onError; 5838 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5839 goto onError; 5840 return; 5841 onError: 5842 Py_DECREF(*exceptionObject); 5843 *exceptionObject = NULL; 5844 } 5845} 5846 5847/* 
raises a UnicodeTranslateError */ 5848static void raise_translate_exception(PyObject **exceptionObject, 5849 const Py_UNICODE *unicode, Py_ssize_t size, 5850 Py_ssize_t startpos, Py_ssize_t endpos, 5851 const char *reason) 5852{ 5853 make_translate_exception(exceptionObject, 5854 unicode, size, startpos, endpos, reason); 5855 if (*exceptionObject != NULL) 5856 PyCodec_StrictErrors(*exceptionObject); 5857} 5858 5859/* error handling callback helper: 5860 build arguments, call the callback and check the arguments, 5861 put the result into newpos and return the replacement string, which 5862 has to be freed by the caller */ 5863static PyObject *unicode_translate_call_errorhandler(const char *errors, 5864 PyObject **errorHandler, 5865 const char *reason, 5866 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5867 Py_ssize_t startpos, Py_ssize_t endpos, 5868 Py_ssize_t *newpos) 5869{ 5870 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5871 5872 Py_ssize_t i_newpos; 5873 PyObject *restuple; 5874 PyObject *resunicode; 5875 5876 if (*errorHandler == NULL) { 5877 *errorHandler = PyCodec_LookupError(errors); 5878 if (*errorHandler == NULL) 5879 return NULL; 5880 } 5881 5882 make_translate_exception(exceptionObject, 5883 unicode, size, startpos, endpos, reason); 5884 if (*exceptionObject == NULL) 5885 return NULL; 5886 5887 restuple = PyObject_CallFunctionObjArgs( 5888 *errorHandler, *exceptionObject, NULL); 5889 if (restuple == NULL) 5890 return NULL; 5891 if (!PyTuple_Check(restuple)) { 5892 PyErr_SetString(PyExc_TypeError, &argparse[4]); 5893 Py_DECREF(restuple); 5894 return NULL; 5895 } 5896 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 5897 &resunicode, &i_newpos)) { 5898 Py_DECREF(restuple); 5899 return NULL; 5900 } 5901 if (i_newpos<0) 5902 *newpos = size+i_newpos; 5903 else 5904 *newpos = i_newpos; 5905 if (*newpos<0 || *newpos>size) { 5906 PyErr_Format(PyExc_IndexError, "position %zd from error 
handler out of bounds", *newpos); 5907 Py_DECREF(restuple); 5908 return NULL; 5909 } 5910 Py_INCREF(resunicode); 5911 Py_DECREF(restuple); 5912 return resunicode; 5913} 5914 5915/* Lookup the character ch in the mapping and put the result in result, 5916 which must be decrefed by the caller. 5917 Return 0 on success, -1 on error */ 5918static 5919int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5920{ 5921 PyObject *w = PyLong_FromLong((long)c); 5922 PyObject *x; 5923 5924 if (w == NULL) 5925 return -1; 5926 x = PyObject_GetItem(mapping, w); 5927 Py_DECREF(w); 5928 if (x == NULL) { 5929 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5930 /* No mapping found means: use 1:1 mapping. */ 5931 PyErr_Clear(); 5932 *result = NULL; 5933 return 0; 5934 } else 5935 return -1; 5936 } 5937 else if (x == Py_None) { 5938 *result = x; 5939 return 0; 5940 } 5941 else if (PyLong_Check(x)) { 5942 long value = PyLong_AS_LONG(x); 5943 long max = PyUnicode_GetMax(); 5944 if (value < 0 || value > max) { 5945 PyErr_Format(PyExc_TypeError, 5946 "character mapping must be in range(0x%x)", max+1); 5947 Py_DECREF(x); 5948 return -1; 5949 } 5950 *result = x; 5951 return 0; 5952 } 5953 else if (PyUnicode_Check(x)) { 5954 *result = x; 5955 return 0; 5956 } 5957 else { 5958 /* wrong return value */ 5959 PyErr_SetString(PyExc_TypeError, 5960 "character mapping must return integer, None or str"); 5961 Py_DECREF(x); 5962 return -1; 5963 } 5964} 5965/* ensure that *outobj is at least requiredsize characters long, 5966 if not reallocate and adjust various state variables. 
Return 0 on success, -1 on error */
static
int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
                               Py_ssize_t requiredsize)
{
    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
    if (requiredsize > oldsize) {
        /* remember old output position */
        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
        /* exponentially overallocate to minimize reallocations */
        if (requiredsize < 2 * oldsize)
            requiredsize = 2 * oldsize;
        if (PyUnicode_Resize(outobj, requiredsize) < 0)
            return -1;
        /* the buffer may have moved: re-derive the output pointer */
        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
    }
    return 0;
}
/* lookup the character, put the result in the output string and adjust
   various state variables. Return a new reference to the object that
   was put in the output buffer in *result, or Py_None, if the mapping was
   undefined (in which case no character was written).
   *res is NULL when the mapping had no entry and the character was copied
   unchanged.
   The caller must decref result.
   Return 0 on success, -1 on error. */
static
int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
                            PyObject **res)
{
    if (charmaptranslate_lookup(*curinp, mapping, res))
        return -1;
    if (*res==NULL) {
        /* not found => default to 1:1 mapping */
        *(*outp)++ = *curinp;
    }
    else if (*res==Py_None)
        ;
    else if (PyLong_Check(*res)) {
        /* no overflow check, because we know that the space is enough */
        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
    }
    else if (PyUnicode_Check(*res)) {
        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
        if (repsize==1) {
            /* no overflow check, because we know that the space is enough */
            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
        }
        else if (repsize!=0) {
            /* more than one character */
            /* required = characters written so far + input still to be
               processed (including this character) + the extra repsize-1
               characters this replacement adds */
            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
                (insize - (curinp-startinp)) +
                repsize - 1;
            if (charmaptranslate_makespace(outobj, outp, requiredsize))
                return -1;
            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
            *outp += repsize;
        }
    }
    else
        return -1;
    return 0;
}

PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        Py_XDECREF(x);
        /* x is only compared against Py_None below, never dereferenced */
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
6084 const Py_UNICODE *coll; 6085 6086 /* find all untranslatable characters */ 6087 while (collend < endp) { 6088 if (charmaptranslate_lookup(*collend, mapping, &x)) 6089 goto onError; 6090 Py_XDECREF(x); 6091 if (x!=Py_None) 6092 break; 6093 ++collend; 6094 } 6095 /* cache callback name lookup 6096 * (if not done yet, i.e. it's the first error) */ 6097 if (known_errorHandler==-1) { 6098 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6099 known_errorHandler = 1; 6100 else if (!strcmp(errors, "replace")) 6101 known_errorHandler = 2; 6102 else if (!strcmp(errors, "ignore")) 6103 known_errorHandler = 3; 6104 else if (!strcmp(errors, "xmlcharrefreplace")) 6105 known_errorHandler = 4; 6106 else 6107 known_errorHandler = 0; 6108 } 6109 switch (known_errorHandler) { 6110 case 1: /* strict */ 6111 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 6112 goto onError; 6113 case 2: /* replace */ 6114 /* No need to check for space, this is a 1:1 replacement */ 6115 for (coll = collstart; coll<collend; ++coll) 6116 *str++ = '?'; 6117 /* fall through */ 6118 case 3: /* ignore */ 6119 p = collend; 6120 break; 6121 case 4: /* xmlcharrefreplace */ 6122 /* generate replacement (temporarily (mis)uses p) */ 6123 for (p = collstart; p < collend; ++p) { 6124 char buffer[2+29+1+1]; 6125 char *cp; 6126 sprintf(buffer, "&#%d;", (int)*p); 6127 if (charmaptranslate_makespace(&res, &str, 6128 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 6129 goto onError; 6130 for (cp = buffer; *cp; ++cp) 6131 *str++ = *cp; 6132 } 6133 p = collend; 6134 break; 6135 default: 6136 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 6137 reason, startp, size, &exc, 6138 collstart-startp, collend-startp, &newpos); 6139 if (repunicode == NULL) 6140 goto onError; 6141 /* generate replacement */ 6142 repsize = PyUnicode_GET_SIZE(repunicode); 6143 if (charmaptranslate_makespace(&res, &str, 6144 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 6145 Py_DECREF(repunicode); 6146 goto onError; 6147 } 6148 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 6149 *str++ = *uni2; 6150 p = startp + newpos; 6151 Py_DECREF(repunicode); 6152 } 6153 } 6154 } 6155 /* Resize if we allocated to much */ 6156 respos = str-PyUnicode_AS_UNICODE(res); 6157 if (respos<PyUnicode_GET_SIZE(res)) { 6158 if (PyUnicode_Resize(&res, respos) < 0) 6159 goto onError; 6160 } 6161 Py_XDECREF(exc); 6162 Py_XDECREF(errorHandler); 6163 return res; 6164 6165 onError: 6166 Py_XDECREF(res); 6167 Py_XDECREF(exc); 6168 Py_XDECREF(errorHandler); 6169 return NULL; 6170} 6171 6172PyObject *PyUnicode_Translate(PyObject *str, 6173 PyObject *mapping, 6174 const char *errors) 6175{ 6176 PyObject *result; 6177 6178 str = PyUnicode_FromObject(str); 6179 if (str == NULL) 6180 goto onError; 6181 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 6182 PyUnicode_GET_SIZE(str), 6183 mapping, 6184 errors); 6185 Py_DECREF(str); 6186 return result; 6187 6188 onError: 6189 Py_XDECREF(str); 6190 return NULL; 6191} 6192 6193/* --- Decimal Encoder ---------------------------------------------------- */ 6194 6195int PyUnicode_EncodeDecimal(Py_UNICODE *s, 6196 Py_ssize_t length, 6197 char *output, 6198 const char *errors) 6199{ 6200 Py_UNICODE *p, *end; 6201 PyObject *errorHandler = NULL; 6202 PyObject *exc = NULL; 6203 const char *encoding = "decimal"; 6204 const char *reason = "invalid decimal Unicode string"; 6205 /* the following variable is used for caching string comparisons 6206 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6207 int known_errorHandler = -1; 6208 6209 if (output == NULL) { 6210 PyErr_BadArgument(); 6211 return -1; 6212 } 6213 6214 p = s; 6215 end = s + length; 6216 while (p < end) { 6217 register Py_UNICODE ch = *p; 6218 int decimal; 6219 PyObject *repunicode; 6220 Py_ssize_t repsize; 6221 Py_ssize_t newpos; 6222 
Py_UNICODE *uni2; 6223 Py_UNICODE *collstart; 6224 Py_UNICODE *collend; 6225 6226 if (Py_UNICODE_ISSPACE(ch)) { 6227 *output++ = ' '; 6228 ++p; 6229 continue; 6230 } 6231 decimal = Py_UNICODE_TODECIMAL(ch); 6232 if (decimal >= 0) { 6233 *output++ = '0' + decimal; 6234 ++p; 6235 continue; 6236 } 6237 if (0 < ch && ch < 256) { 6238 *output++ = (char)ch; 6239 ++p; 6240 continue; 6241 } 6242 /* All other characters are considered unencodable */ 6243 collstart = p; 6244 collend = p+1; 6245 while (collend < end) { 6246 if ((0 < *collend && *collend < 256) || 6247 !Py_UNICODE_ISSPACE(*collend) || 6248 Py_UNICODE_TODECIMAL(*collend)) 6249 break; 6250 } 6251 /* cache callback name lookup 6252 * (if not done yet, i.e. it's the first error) */ 6253 if (known_errorHandler==-1) { 6254 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6255 known_errorHandler = 1; 6256 else if (!strcmp(errors, "replace")) 6257 known_errorHandler = 2; 6258 else if (!strcmp(errors, "ignore")) 6259 known_errorHandler = 3; 6260 else if (!strcmp(errors, "xmlcharrefreplace")) 6261 known_errorHandler = 4; 6262 else 6263 known_errorHandler = 0; 6264 } 6265 switch (known_errorHandler) { 6266 case 1: /* strict */ 6267 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 6268 goto onError; 6269 case 2: /* replace */ 6270 for (p = collstart; p < collend; ++p) 6271 *output++ = '?'; 6272 /* fall through */ 6273 case 3: /* ignore */ 6274 p = collend; 6275 break; 6276 case 4: /* xmlcharrefreplace */ 6277 /* generate replacement (temporarily (mis)uses p) */ 6278 for (p = collstart; p < collend; ++p) 6279 output += sprintf(output, "&#%d;", (int)*p); 6280 p = collend; 6281 break; 6282 default: 6283 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6284 encoding, reason, s, length, &exc, 6285 collstart-s, collend-s, &newpos); 6286 if (repunicode == NULL) 6287 goto onError; 6288 if (!PyUnicode_Check(repunicode)) { 6289 /* Byte results not supported, since they have no 
decimal property. */ 6290 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 6291 Py_DECREF(repunicode); 6292 goto onError; 6293 } 6294 /* generate replacement */ 6295 repsize = PyUnicode_GET_SIZE(repunicode); 6296 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 6297 Py_UNICODE ch = *uni2; 6298 if (Py_UNICODE_ISSPACE(ch)) 6299 *output++ = ' '; 6300 else { 6301 decimal = Py_UNICODE_TODECIMAL(ch); 6302 if (decimal >= 0) 6303 *output++ = '0' + decimal; 6304 else if (0 < ch && ch < 256) 6305 *output++ = (char)ch; 6306 else { 6307 Py_DECREF(repunicode); 6308 raise_encode_exception(&exc, encoding, 6309 s, length, collstart-s, collend-s, reason); 6310 goto onError; 6311 } 6312 } 6313 } 6314 p = s + newpos; 6315 Py_DECREF(repunicode); 6316 } 6317 } 6318 /* 0-terminate the output string */ 6319 *output++ = '\0'; 6320 Py_XDECREF(exc); 6321 Py_XDECREF(errorHandler); 6322 return 0; 6323 6324 onError: 6325 Py_XDECREF(exc); 6326 Py_XDECREF(errorHandler); 6327 return -1; 6328} 6329 6330/* --- Helpers ------------------------------------------------------------ */ 6331 6332#include "stringlib/unicodedefs.h" 6333#include "stringlib/fastsearch.h" 6334 6335#include "stringlib/count.h" 6336#include "stringlib/find.h" 6337#include "stringlib/partition.h" 6338#include "stringlib/split.h" 6339 6340#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 6341#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale 6342#include "stringlib/localeutil.h" 6343 6344/* helper macro to fixup start/end slice values */ 6345#define ADJUST_INDICES(start, end, len) \ 6346 if (end > len) \ 6347 end = len; \ 6348 else if (end < 0) { \ 6349 end += len; \ 6350 if (end < 0) \ 6351 end = 0; \ 6352 } \ 6353 if (start < 0) { \ 6354 start += len; \ 6355 if (start < 0) \ 6356 start = 0; \ 6357 } 6358 6359Py_ssize_t PyUnicode_Count(PyObject *str, 6360 PyObject *substr, 6361 Py_ssize_t start, 6362 Py_ssize_t end) 6363{ 6364 
Py_ssize_t result; 6365 PyUnicodeObject* str_obj; 6366 PyUnicodeObject* sub_obj; 6367 6368 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6369 if (!str_obj) 6370 return -1; 6371 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6372 if (!sub_obj) { 6373 Py_DECREF(str_obj); 6374 return -1; 6375 } 6376 6377 ADJUST_INDICES(start, end, str_obj->length); 6378 result = stringlib_count( 6379 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6380 PY_SSIZE_T_MAX 6381 ); 6382 6383 Py_DECREF(sub_obj); 6384 Py_DECREF(str_obj); 6385 6386 return result; 6387} 6388 6389Py_ssize_t PyUnicode_Find(PyObject *str, 6390 PyObject *sub, 6391 Py_ssize_t start, 6392 Py_ssize_t end, 6393 int direction) 6394{ 6395 Py_ssize_t result; 6396 6397 str = PyUnicode_FromObject(str); 6398 if (!str) 6399 return -2; 6400 sub = PyUnicode_FromObject(sub); 6401 if (!sub) { 6402 Py_DECREF(str); 6403 return -2; 6404 } 6405 6406 if (direction > 0) 6407 result = stringlib_find_slice( 6408 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6409 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6410 start, end 6411 ); 6412 else 6413 result = stringlib_rfind_slice( 6414 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6415 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6416 start, end 6417 ); 6418 6419 Py_DECREF(str); 6420 Py_DECREF(sub); 6421 6422 return result; 6423} 6424 6425static 6426int tailmatch(PyUnicodeObject *self, 6427 PyUnicodeObject *substring, 6428 Py_ssize_t start, 6429 Py_ssize_t end, 6430 int direction) 6431{ 6432 if (substring->length == 0) 6433 return 1; 6434 6435 ADJUST_INDICES(start, end, self->length); 6436 end -= substring->length; 6437 if (end < start) 6438 return 0; 6439 6440 if (direction > 0) { 6441 if (Py_UNICODE_MATCH(self, end, substring)) 6442 return 1; 6443 } else { 6444 if (Py_UNICODE_MATCH(self, start, substring)) 6445 return 1; 6446 } 6447 6448 return 0; 6449} 6450 6451Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 6452 PyObject 
*substr, 6453 Py_ssize_t start, 6454 Py_ssize_t end, 6455 int direction) 6456{ 6457 Py_ssize_t result; 6458 6459 str = PyUnicode_FromObject(str); 6460 if (str == NULL) 6461 return -1; 6462 substr = PyUnicode_FromObject(substr); 6463 if (substr == NULL) { 6464 Py_DECREF(str); 6465 return -1; 6466 } 6467 6468 result = tailmatch((PyUnicodeObject *)str, 6469 (PyUnicodeObject *)substr, 6470 start, end, direction); 6471 Py_DECREF(str); 6472 Py_DECREF(substr); 6473 return result; 6474} 6475 6476/* Apply fixfct filter to the Unicode object self and return a 6477 reference to the modified object */ 6478 6479static 6480PyObject *fixup(PyUnicodeObject *self, 6481 int (*fixfct)(PyUnicodeObject *s)) 6482{ 6483 6484 PyUnicodeObject *u; 6485 6486 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6487 if (u == NULL) 6488 return NULL; 6489 6490 Py_UNICODE_COPY(u->str, self->str, self->length); 6491 6492 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6493 /* fixfct should return TRUE if it modified the buffer. 
If 6494 FALSE, return a reference to the original buffer instead 6495 (to save space, not time) */ 6496 Py_INCREF(self); 6497 Py_DECREF(u); 6498 return (PyObject*) self; 6499 } 6500 return (PyObject*) u; 6501} 6502 6503static 6504int fixupper(PyUnicodeObject *self) 6505{ 6506 Py_ssize_t len = self->length; 6507 Py_UNICODE *s = self->str; 6508 int status = 0; 6509 6510 while (len-- > 0) { 6511 register Py_UNICODE ch; 6512 6513 ch = Py_UNICODE_TOUPPER(*s); 6514 if (ch != *s) { 6515 status = 1; 6516 *s = ch; 6517 } 6518 s++; 6519 } 6520 6521 return status; 6522} 6523 6524static 6525int fixlower(PyUnicodeObject *self) 6526{ 6527 Py_ssize_t len = self->length; 6528 Py_UNICODE *s = self->str; 6529 int status = 0; 6530 6531 while (len-- > 0) { 6532 register Py_UNICODE ch; 6533 6534 ch = Py_UNICODE_TOLOWER(*s); 6535 if (ch != *s) { 6536 status = 1; 6537 *s = ch; 6538 } 6539 s++; 6540 } 6541 6542 return status; 6543} 6544 6545static 6546int fixswapcase(PyUnicodeObject *self) 6547{ 6548 Py_ssize_t len = self->length; 6549 Py_UNICODE *s = self->str; 6550 int status = 0; 6551 6552 while (len-- > 0) { 6553 if (Py_UNICODE_ISUPPER(*s)) { 6554 *s = Py_UNICODE_TOLOWER(*s); 6555 status = 1; 6556 } else if (Py_UNICODE_ISLOWER(*s)) { 6557 *s = Py_UNICODE_TOUPPER(*s); 6558 status = 1; 6559 } 6560 s++; 6561 } 6562 6563 return status; 6564} 6565 6566static 6567int fixcapitalize(PyUnicodeObject *self) 6568{ 6569 Py_ssize_t len = self->length; 6570 Py_UNICODE *s = self->str; 6571 int status = 0; 6572 6573 if (len == 0) 6574 return 0; 6575 if (Py_UNICODE_ISLOWER(*s)) { 6576 *s = Py_UNICODE_TOUPPER(*s); 6577 status = 1; 6578 } 6579 s++; 6580 while (--len > 0) { 6581 if (Py_UNICODE_ISUPPER(*s)) { 6582 *s = Py_UNICODE_TOLOWER(*s); 6583 status = 1; 6584 } 6585 s++; 6586 } 6587 return status; 6588} 6589 6590static 6591int fixtitle(PyUnicodeObject *self) 6592{ 6593 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6594 register Py_UNICODE *e; 6595 int previous_is_cased; 6596 6597 /* Shortcut 
for single character strings */ 6598 if (PyUnicode_GET_SIZE(self) == 1) { 6599 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6600 if (*p != ch) { 6601 *p = ch; 6602 return 1; 6603 } 6604 else 6605 return 0; 6606 } 6607 6608 e = p + PyUnicode_GET_SIZE(self); 6609 previous_is_cased = 0; 6610 for (; p < e; p++) { 6611 register const Py_UNICODE ch = *p; 6612 6613 if (previous_is_cased) 6614 *p = Py_UNICODE_TOLOWER(ch); 6615 else 6616 *p = Py_UNICODE_TOTITLE(ch); 6617 6618 if (Py_UNICODE_ISLOWER(ch) || 6619 Py_UNICODE_ISUPPER(ch) || 6620 Py_UNICODE_ISTITLE(ch)) 6621 previous_is_cased = 1; 6622 else 6623 previous_is_cased = 0; 6624 } 6625 return 1; 6626} 6627 6628PyObject * 6629PyUnicode_Join(PyObject *separator, PyObject *seq) 6630{ 6631 const Py_UNICODE blank = ' '; 6632 const Py_UNICODE *sep = ␣ 6633 Py_ssize_t seplen = 1; 6634 PyUnicodeObject *res = NULL; /* the result */ 6635 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6636 PyObject *fseq; /* PySequence_Fast(seq) */ 6637 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6638 PyObject **items; 6639 PyObject *item; 6640 Py_ssize_t sz, i; 6641 6642 fseq = PySequence_Fast(seq, ""); 6643 if (fseq == NULL) { 6644 return NULL; 6645 } 6646 6647 /* NOTE: the following code can't call back into Python code, 6648 * so we are sure that fseq won't be mutated. 6649 */ 6650 6651 seqlen = PySequence_Fast_GET_SIZE(fseq); 6652 /* If empty sequence, return u"". */ 6653 if (seqlen == 0) { 6654 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6655 goto Done; 6656 } 6657 items = PySequence_Fast_ITEMS(fseq); 6658 /* If singleton sequence with an exact Unicode, return that. 
*/ 6659 if (seqlen == 1) { 6660 item = items[0]; 6661 if (PyUnicode_CheckExact(item)) { 6662 Py_INCREF(item); 6663 res = (PyUnicodeObject *)item; 6664 goto Done; 6665 } 6666 } 6667 else { 6668 /* Set up sep and seplen */ 6669 if (separator == NULL) { 6670 sep = ␣ 6671 seplen = 1; 6672 } 6673 else { 6674 if (!PyUnicode_Check(separator)) { 6675 PyErr_Format(PyExc_TypeError, 6676 "separator: expected str instance," 6677 " %.80s found", 6678 Py_TYPE(separator)->tp_name); 6679 goto onError; 6680 } 6681 sep = PyUnicode_AS_UNICODE(separator); 6682 seplen = PyUnicode_GET_SIZE(separator); 6683 } 6684 } 6685 6686 /* There are at least two things to join, or else we have a subclass 6687 * of str in the sequence. 6688 * Do a pre-pass to figure out the total amount of space we'll 6689 * need (sz), and see whether all argument are strings. 6690 */ 6691 sz = 0; 6692 for (i = 0; i < seqlen; i++) { 6693 const Py_ssize_t old_sz = sz; 6694 item = items[i]; 6695 if (!PyUnicode_Check(item)) { 6696 PyErr_Format(PyExc_TypeError, 6697 "sequence item %zd: expected str instance," 6698 " %.80s found", 6699 i, Py_TYPE(item)->tp_name); 6700 goto onError; 6701 } 6702 sz += PyUnicode_GET_SIZE(item); 6703 if (i != 0) 6704 sz += seplen; 6705 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6706 PyErr_SetString(PyExc_OverflowError, 6707 "join() result is too long for a Python string"); 6708 goto onError; 6709 } 6710 } 6711 6712 res = _PyUnicode_New(sz); 6713 if (res == NULL) 6714 goto onError; 6715 6716 /* Catenate everything. */ 6717 res_p = PyUnicode_AS_UNICODE(res); 6718 for (i = 0; i < seqlen; ++i) { 6719 Py_ssize_t itemlen; 6720 item = items[i]; 6721 itemlen = PyUnicode_GET_SIZE(item); 6722 /* Copy item, and maybe the separator. 
*/ 6723 if (i) { 6724 Py_UNICODE_COPY(res_p, sep, seplen); 6725 res_p += seplen; 6726 } 6727 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6728 res_p += itemlen; 6729 } 6730 6731 Done: 6732 Py_DECREF(fseq); 6733 return (PyObject *)res; 6734 6735 onError: 6736 Py_DECREF(fseq); 6737 Py_XDECREF(res); 6738 return NULL; 6739} 6740 6741static 6742PyUnicodeObject *pad(PyUnicodeObject *self, 6743 Py_ssize_t left, 6744 Py_ssize_t right, 6745 Py_UNICODE fill) 6746{ 6747 PyUnicodeObject *u; 6748 6749 if (left < 0) 6750 left = 0; 6751 if (right < 0) 6752 right = 0; 6753 6754 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6755 Py_INCREF(self); 6756 return self; 6757 } 6758 6759 if (left > PY_SSIZE_T_MAX - self->length || 6760 right > PY_SSIZE_T_MAX - (left + self->length)) { 6761 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6762 return NULL; 6763 } 6764 u = _PyUnicode_New(left + self->length + right); 6765 if (u) { 6766 if (left) 6767 Py_UNICODE_FILL(u->str, fill, left); 6768 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6769 if (right) 6770 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6771 } 6772 6773 return u; 6774} 6775 6776PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 6777{ 6778 PyObject *list; 6779 6780 string = PyUnicode_FromObject(string); 6781 if (string == NULL) 6782 return NULL; 6783 6784 list = stringlib_splitlines( 6785 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6786 PyUnicode_GET_SIZE(string), keepends); 6787 6788 Py_DECREF(string); 6789 return list; 6790} 6791 6792static 6793PyObject *split(PyUnicodeObject *self, 6794 PyUnicodeObject *substring, 6795 Py_ssize_t maxcount) 6796{ 6797 if (maxcount < 0) 6798 maxcount = PY_SSIZE_T_MAX; 6799 6800 if (substring == NULL) 6801 return stringlib_split_whitespace( 6802 (PyObject*) self, self->str, self->length, maxcount 6803 ); 6804 6805 return stringlib_split( 6806 (PyObject*) self, self->str, self->length, 6807 
substring->str, substring->length, 6808 maxcount 6809 ); 6810} 6811 6812static 6813PyObject *rsplit(PyUnicodeObject *self, 6814 PyUnicodeObject *substring, 6815 Py_ssize_t maxcount) 6816{ 6817 if (maxcount < 0) 6818 maxcount = PY_SSIZE_T_MAX; 6819 6820 if (substring == NULL) 6821 return stringlib_rsplit_whitespace( 6822 (PyObject*) self, self->str, self->length, maxcount 6823 ); 6824 6825 return stringlib_rsplit( 6826 (PyObject*) self, self->str, self->length, 6827 substring->str, substring->length, 6828 maxcount 6829 ); 6830} 6831 6832static 6833PyObject *replace(PyUnicodeObject *self, 6834 PyUnicodeObject *str1, 6835 PyUnicodeObject *str2, 6836 Py_ssize_t maxcount) 6837{ 6838 PyUnicodeObject *u; 6839 6840 if (maxcount < 0) 6841 maxcount = PY_SSIZE_T_MAX; 6842 else if (maxcount == 0 || self->length == 0) 6843 goto nothing; 6844 6845 if (str1->length == str2->length) { 6846 Py_ssize_t i; 6847 /* same length */ 6848 if (str1->length == 0) 6849 goto nothing; 6850 if (str1->length == 1) { 6851 /* replace characters */ 6852 Py_UNICODE u1, u2; 6853 if (!findchar(self->str, self->length, str1->str[0])) 6854 goto nothing; 6855 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6856 if (!u) 6857 return NULL; 6858 Py_UNICODE_COPY(u->str, self->str, self->length); 6859 u1 = str1->str[0]; 6860 u2 = str2->str[0]; 6861 for (i = 0; i < u->length; i++) 6862 if (u->str[i] == u1) { 6863 if (--maxcount < 0) 6864 break; 6865 u->str[i] = u2; 6866 } 6867 } else { 6868 i = stringlib_find( 6869 self->str, self->length, str1->str, str1->length, 0 6870 ); 6871 if (i < 0) 6872 goto nothing; 6873 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6874 if (!u) 6875 return NULL; 6876 Py_UNICODE_COPY(u->str, self->str, self->length); 6877 6878 /* change everything in-place, starting with this one */ 6879 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6880 i += str1->length; 6881 6882 while ( --maxcount > 0) { 6883 i = stringlib_find(self->str+i, 
self->length-i, 6884 str1->str, str1->length, 6885 i); 6886 if (i == -1) 6887 break; 6888 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6889 i += str1->length; 6890 } 6891 } 6892 } else { 6893 6894 Py_ssize_t n, i, j, e; 6895 Py_ssize_t product, new_size, delta; 6896 Py_UNICODE *p; 6897 6898 /* replace strings */ 6899 n = stringlib_count(self->str, self->length, str1->str, str1->length, 6900 maxcount); 6901 if (n == 0) 6902 goto nothing; 6903 /* new_size = self->length + n * (str2->length - str1->length)); */ 6904 delta = (str2->length - str1->length); 6905 if (delta == 0) { 6906 new_size = self->length; 6907 } else { 6908 product = n * (str2->length - str1->length); 6909 if ((product / (str2->length - str1->length)) != n) { 6910 PyErr_SetString(PyExc_OverflowError, 6911 "replace string is too long"); 6912 return NULL; 6913 } 6914 new_size = self->length + product; 6915 if (new_size < 0) { 6916 PyErr_SetString(PyExc_OverflowError, 6917 "replace string is too long"); 6918 return NULL; 6919 } 6920 } 6921 u = _PyUnicode_New(new_size); 6922 if (!u) 6923 return NULL; 6924 i = 0; 6925 p = u->str; 6926 e = self->length - str1->length; 6927 if (str1->length > 0) { 6928 while (n-- > 0) { 6929 /* look for next match */ 6930 j = stringlib_find(self->str+i, self->length-i, 6931 str1->str, str1->length, 6932 i); 6933 if (j == -1) 6934 break; 6935 else if (j > i) { 6936 /* copy unchanged part [i:j] */ 6937 Py_UNICODE_COPY(p, self->str+i, j-i); 6938 p += j - i; 6939 } 6940 /* copy substitution string */ 6941 if (str2->length > 0) { 6942 Py_UNICODE_COPY(p, str2->str, str2->length); 6943 p += str2->length; 6944 } 6945 i = j + str1->length; 6946 } 6947 if (i < self->length) 6948 /* copy tail [i:] */ 6949 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6950 } else { 6951 /* interleave */ 6952 while (n > 0) { 6953 Py_UNICODE_COPY(p, str2->str, str2->length); 6954 p += str2->length; 6955 if (--n <= 0) 6956 break; 6957 *p++ = self->str[i++]; 6958 } 6959 Py_UNICODE_COPY(p, 
self->str+i, self->length-i); 6960 } 6961 } 6962 return (PyObject *) u; 6963 6964 nothing: 6965 /* nothing to replace; return original string (when possible) */ 6966 if (PyUnicode_CheckExact(self)) { 6967 Py_INCREF(self); 6968 return (PyObject *) self; 6969 } 6970 return PyUnicode_FromUnicode(self->str, self->length); 6971} 6972 6973/* --- Unicode Object Methods --------------------------------------------- */ 6974 6975PyDoc_STRVAR(title__doc__, 6976 "S.title() -> str\n\ 6977\n\ 6978Return a titlecased version of S, i.e. words start with title case\n\ 6979characters, all remaining cased characters have lower case."); 6980 6981static PyObject* 6982unicode_title(PyUnicodeObject *self) 6983{ 6984 return fixup(self, fixtitle); 6985} 6986 6987PyDoc_STRVAR(capitalize__doc__, 6988 "S.capitalize() -> str\n\ 6989\n\ 6990Return a capitalized version of S, i.e. make the first character\n\ 6991have upper case and the rest lower case."); 6992 6993static PyObject* 6994unicode_capitalize(PyUnicodeObject *self) 6995{ 6996 return fixup(self, fixcapitalize); 6997} 6998 6999#if 0 7000PyDoc_STRVAR(capwords__doc__, 7001 "S.capwords() -> str\n\ 7002\n\ 7003Apply .capitalize() to all words in S and return the result with\n\ 7004normalized whitespace (all whitespace strings are replaced by ' ')."); 7005 7006static PyObject* 7007unicode_capwords(PyUnicodeObject *self) 7008{ 7009 PyObject *list; 7010 PyObject *item; 7011 Py_ssize_t i; 7012 7013 /* Split into words */ 7014 list = split(self, NULL, -1); 7015 if (!list) 7016 return NULL; 7017 7018 /* Capitalize each word */ 7019 for (i = 0; i < PyList_GET_SIZE(list); i++) { 7020 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 7021 fixcapitalize); 7022 if (item == NULL) 7023 goto onError; 7024 Py_DECREF(PyList_GET_ITEM(list, i)); 7025 PyList_SET_ITEM(list, i, item); 7026 } 7027 7028 /* Join the words to form a new string */ 7029 item = PyUnicode_Join(NULL, list); 7030 7031 onError: 7032 Py_DECREF(list); 7033 return (PyObject *)item; 
7034} 7035#endif 7036 7037/* Argument converter. Coerces to a single unicode character */ 7038 7039static int 7040convert_uc(PyObject *obj, void *addr) 7041{ 7042 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 7043 PyObject *uniobj; 7044 Py_UNICODE *unistr; 7045 7046 uniobj = PyUnicode_FromObject(obj); 7047 if (uniobj == NULL) { 7048 PyErr_SetString(PyExc_TypeError, 7049 "The fill character cannot be converted to Unicode"); 7050 return 0; 7051 } 7052 if (PyUnicode_GET_SIZE(uniobj) != 1) { 7053 PyErr_SetString(PyExc_TypeError, 7054 "The fill character must be exactly one character long"); 7055 Py_DECREF(uniobj); 7056 return 0; 7057 } 7058 unistr = PyUnicode_AS_UNICODE(uniobj); 7059 *fillcharloc = unistr[0]; 7060 Py_DECREF(uniobj); 7061 return 1; 7062} 7063 7064PyDoc_STRVAR(center__doc__, 7065 "S.center(width[, fillchar]) -> str\n\ 7066\n\ 7067Return S centered in a string of length width. Padding is\n\ 7068done using the specified fill character (default is a space)"); 7069 7070static PyObject * 7071unicode_center(PyUnicodeObject *self, PyObject *args) 7072{ 7073 Py_ssize_t marg, left; 7074 Py_ssize_t width; 7075 Py_UNICODE fillchar = ' '; 7076 7077 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 7078 return NULL; 7079 7080 if (self->length >= width && PyUnicode_CheckExact(self)) { 7081 Py_INCREF(self); 7082 return (PyObject*) self; 7083 } 7084 7085 marg = width - self->length; 7086 left = marg / 2 + (marg & width & 1); 7087 7088 return (PyObject*) pad(self, left, marg - left, fillchar); 7089} 7090 7091#if 0 7092 7093/* This code should go into some future Unicode collation support 7094 module. The basic comparison should compare ordinals on a naive 7095 basis (this is what Java does and thus Jython too). 
*/ 7096 7097/* speedy UTF-16 code point order comparison */ 7098/* gleaned from: */ 7099/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 7100 7101static short utf16Fixup[32] = 7102{ 7103 0, 0, 0, 0, 0, 0, 0, 0, 7104 0, 0, 0, 0, 0, 0, 0, 0, 7105 0, 0, 0, 0, 0, 0, 0, 0, 7106 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 7107}; 7108 7109static int 7110unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7111{ 7112 Py_ssize_t len1, len2; 7113 7114 Py_UNICODE *s1 = str1->str; 7115 Py_UNICODE *s2 = str2->str; 7116 7117 len1 = str1->length; 7118 len2 = str2->length; 7119 7120 while (len1 > 0 && len2 > 0) { 7121 Py_UNICODE c1, c2; 7122 7123 c1 = *s1++; 7124 c2 = *s2++; 7125 7126 if (c1 > (1<<11) * 26) 7127 c1 += utf16Fixup[c1>>11]; 7128 if (c2 > (1<<11) * 26) 7129 c2 += utf16Fixup[c2>>11]; 7130 /* now c1 and c2 are in UTF-32-compatible order */ 7131 7132 if (c1 != c2) 7133 return (c1 < c2) ? -1 : 1; 7134 7135 len1--; len2--; 7136 } 7137 7138 return (len1 < len2) ? -1 : (len1 != len2); 7139} 7140 7141#else 7142 7143static int 7144unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 7145{ 7146 register Py_ssize_t len1, len2; 7147 7148 Py_UNICODE *s1 = str1->str; 7149 Py_UNICODE *s2 = str2->str; 7150 7151 len1 = str1->length; 7152 len2 = str2->length; 7153 7154 while (len1 > 0 && len2 > 0) { 7155 Py_UNICODE c1, c2; 7156 7157 c1 = *s1++; 7158 c2 = *s2++; 7159 7160 if (c1 != c2) 7161 return (c1 < c2) ? -1 : 1; 7162 7163 len1--; len2--; 7164 } 7165 7166 return (len1 < len2) ? 
-1 : (len1 != len2); 7167} 7168 7169#endif 7170 7171int PyUnicode_Compare(PyObject *left, 7172 PyObject *right) 7173{ 7174 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 7175 return unicode_compare((PyUnicodeObject *)left, 7176 (PyUnicodeObject *)right); 7177 PyErr_Format(PyExc_TypeError, 7178 "Can't compare %.100s and %.100s", 7179 left->ob_type->tp_name, 7180 right->ob_type->tp_name); 7181 return -1; 7182} 7183 7184int 7185PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 7186{ 7187 int i; 7188 Py_UNICODE *id; 7189 assert(PyUnicode_Check(uni)); 7190 id = PyUnicode_AS_UNICODE(uni); 7191 /* Compare Unicode string and source character set string */ 7192 for (i = 0; id[i] && str[i]; i++) 7193 if (id[i] != str[i]) 7194 return ((int)id[i] < (int)str[i]) ? -1 : 1; 7195 /* This check keeps Python strings that end in '\0' from comparing equal 7196 to C strings identical up to that point. */ 7197 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 7198 return 1; /* uni is longer */ 7199 if (str[i]) 7200 return -1; /* str is longer */ 7201 return 0; 7202} 7203 7204 7205#define TEST_COND(cond) \ 7206 ((cond) ? 
Py_True : Py_False) 7207 7208PyObject *PyUnicode_RichCompare(PyObject *left, 7209 PyObject *right, 7210 int op) 7211{ 7212 int result; 7213 7214 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 7215 PyObject *v; 7216 if (((PyUnicodeObject *) left)->length != 7217 ((PyUnicodeObject *) right)->length) { 7218 if (op == Py_EQ) { 7219 Py_INCREF(Py_False); 7220 return Py_False; 7221 } 7222 if (op == Py_NE) { 7223 Py_INCREF(Py_True); 7224 return Py_True; 7225 } 7226 } 7227 if (left == right) 7228 result = 0; 7229 else 7230 result = unicode_compare((PyUnicodeObject *)left, 7231 (PyUnicodeObject *)right); 7232 7233 /* Convert the return value to a Boolean */ 7234 switch (op) { 7235 case Py_EQ: 7236 v = TEST_COND(result == 0); 7237 break; 7238 case Py_NE: 7239 v = TEST_COND(result != 0); 7240 break; 7241 case Py_LE: 7242 v = TEST_COND(result <= 0); 7243 break; 7244 case Py_GE: 7245 v = TEST_COND(result >= 0); 7246 break; 7247 case Py_LT: 7248 v = TEST_COND(result == -1); 7249 break; 7250 case Py_GT: 7251 v = TEST_COND(result == 1); 7252 break; 7253 default: 7254 PyErr_BadArgument(); 7255 return NULL; 7256 } 7257 Py_INCREF(v); 7258 return v; 7259 } 7260 7261 Py_INCREF(Py_NotImplemented); 7262 return Py_NotImplemented; 7263} 7264 7265int PyUnicode_Contains(PyObject *container, 7266 PyObject *element) 7267{ 7268 PyObject *str, *sub; 7269 int result; 7270 7271 /* Coerce the two arguments */ 7272 sub = PyUnicode_FromObject(element); 7273 if (!sub) { 7274 PyErr_Format(PyExc_TypeError, 7275 "'in <string>' requires string as left operand, not %s", 7276 element->ob_type->tp_name); 7277 return -1; 7278 } 7279 7280 str = PyUnicode_FromObject(container); 7281 if (!str) { 7282 Py_DECREF(sub); 7283 return -1; 7284 } 7285 7286 result = stringlib_contains_obj(str, sub); 7287 7288 Py_DECREF(str); 7289 Py_DECREF(sub); 7290 7291 return result; 7292} 7293 7294/* Concat to string or Unicode object giving a new Unicode object. 
*/ 7295 7296PyObject *PyUnicode_Concat(PyObject *left, 7297 PyObject *right) 7298{ 7299 PyUnicodeObject *u = NULL, *v = NULL, *w; 7300 7301 /* Coerce the two arguments */ 7302 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7303 if (u == NULL) 7304 goto onError; 7305 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7306 if (v == NULL) 7307 goto onError; 7308 7309 /* Shortcuts */ 7310 if (v == unicode_empty) { 7311 Py_DECREF(v); 7312 return (PyObject *)u; 7313 } 7314 if (u == unicode_empty) { 7315 Py_DECREF(u); 7316 return (PyObject *)v; 7317 } 7318 7319 /* Concat the two Unicode strings */ 7320 w = _PyUnicode_New(u->length + v->length); 7321 if (w == NULL) 7322 goto onError; 7323 Py_UNICODE_COPY(w->str, u->str, u->length); 7324 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7325 7326 Py_DECREF(u); 7327 Py_DECREF(v); 7328 return (PyObject *)w; 7329 7330 onError: 7331 Py_XDECREF(u); 7332 Py_XDECREF(v); 7333 return NULL; 7334} 7335 7336void 7337PyUnicode_Append(PyObject **pleft, PyObject *right) 7338{ 7339 PyObject *new; 7340 if (*pleft == NULL) 7341 return; 7342 if (right == NULL || !PyUnicode_Check(*pleft)) { 7343 Py_DECREF(*pleft); 7344 *pleft = NULL; 7345 return; 7346 } 7347 new = PyUnicode_Concat(*pleft, right); 7348 Py_DECREF(*pleft); 7349 *pleft = new; 7350} 7351 7352void 7353PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7354{ 7355 PyUnicode_Append(pleft, right); 7356 Py_XDECREF(right); 7357} 7358 7359PyDoc_STRVAR(count__doc__, 7360 "S.count(sub[, start[, end]]) -> int\n\ 7361\n\ 7362Return the number of non-overlapping occurrences of substring sub in\n\ 7363string S[start:end]. 
Optional arguments start and end are\n\ 7364interpreted as in slice notation."); 7365 7366static PyObject * 7367unicode_count(PyUnicodeObject *self, PyObject *args) 7368{ 7369 PyUnicodeObject *substring; 7370 Py_ssize_t start = 0; 7371 Py_ssize_t end = PY_SSIZE_T_MAX; 7372 PyObject *result; 7373 7374 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 7375 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7376 return NULL; 7377 7378 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7379 (PyObject *)substring); 7380 if (substring == NULL) 7381 return NULL; 7382 7383 ADJUST_INDICES(start, end, self->length); 7384 result = PyLong_FromSsize_t( 7385 stringlib_count(self->str + start, end - start, 7386 substring->str, substring->length, 7387 PY_SSIZE_T_MAX) 7388 ); 7389 7390 Py_DECREF(substring); 7391 7392 return result; 7393} 7394 7395PyDoc_STRVAR(encode__doc__, 7396 "S.encode([encoding[, errors]]) -> bytes\n\ 7397\n\ 7398Encode S using the codec registered for encoding. encoding defaults\n\ 7399to the default encoding. errors may be given to set a different error\n\ 7400handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7401a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 7402'xmlcharrefreplace' as well as any other name registered with\n\ 7403codecs.register_error that can handle UnicodeEncodeErrors."); 7404 7405static PyObject * 7406unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7407{ 7408 static char *kwlist[] = {"encoding", "errors", 0}; 7409 char *encoding = NULL; 7410 char *errors = NULL; 7411 PyObject *v; 7412 7413 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7414 kwlist, &encoding, &errors)) 7415 return NULL; 7416 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7417 if (v == NULL) 7418 goto onError; 7419 if (!PyBytes_Check(v)) { 7420 PyErr_Format(PyExc_TypeError, 7421 "encoder did not return a bytes object " 7422 "(type=%.400s)", 7423 Py_TYPE(v)->tp_name); 7424 Py_DECREF(v); 7425 return NULL; 7426 } 7427 return v; 7428 7429 onError: 7430 return NULL; 7431} 7432 7433PyDoc_STRVAR(expandtabs__doc__, 7434 "S.expandtabs([tabsize]) -> str\n\ 7435\n\ 7436Return a copy of S where all tab characters are expanded using spaces.\n\ 7437If tabsize is not given, a tab size of 8 characters is assumed."); 7438 7439static PyObject* 7440unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7441{ 7442 Py_UNICODE *e; 7443 Py_UNICODE *p; 7444 Py_UNICODE *q; 7445 Py_UNICODE *qe; 7446 Py_ssize_t i, j, incr; 7447 PyUnicodeObject *u; 7448 int tabsize = 8; 7449 7450 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7451 return NULL; 7452 7453 /* First pass: determine size of output string */ 7454 i = 0; /* chars up to and including most recent \n or \r */ 7455 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7456 e = self->str + self->length; /* end of input */ 7457 for (p = self->str; p < e; p++) 7458 if (*p == '\t') { 7459 if (tabsize > 0) { 7460 incr = tabsize - (j % tabsize); /* cannot overflow */ 7461 if (j > PY_SSIZE_T_MAX - incr) 7462 goto overflow1; 7463 j += incr; 7464 } 7465 } 7466 else { 7467 if 
(j > PY_SSIZE_T_MAX - 1) 7468 goto overflow1; 7469 j++; 7470 if (*p == '\n' || *p == '\r') { 7471 if (i > PY_SSIZE_T_MAX - j) 7472 goto overflow1; 7473 i += j; 7474 j = 0; 7475 } 7476 } 7477 7478 if (i > PY_SSIZE_T_MAX - j) 7479 goto overflow1; 7480 7481 /* Second pass: create output string and fill it */ 7482 u = _PyUnicode_New(i + j); 7483 if (!u) 7484 return NULL; 7485 7486 j = 0; /* same as in first pass */ 7487 q = u->str; /* next output char */ 7488 qe = u->str + u->length; /* end of output */ 7489 7490 for (p = self->str; p < e; p++) 7491 if (*p == '\t') { 7492 if (tabsize > 0) { 7493 i = tabsize - (j % tabsize); 7494 j += i; 7495 while (i--) { 7496 if (q >= qe) 7497 goto overflow2; 7498 *q++ = ' '; 7499 } 7500 } 7501 } 7502 else { 7503 if (q >= qe) 7504 goto overflow2; 7505 *q++ = *p; 7506 j++; 7507 if (*p == '\n' || *p == '\r') 7508 j = 0; 7509 } 7510 7511 return (PyObject*) u; 7512 7513 overflow2: 7514 Py_DECREF(u); 7515 overflow1: 7516 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7517 return NULL; 7518} 7519 7520PyDoc_STRVAR(find__doc__, 7521 "S.find(sub[, start[, end]]) -> int\n\ 7522\n\ 7523Return the lowest index in S where substring sub is found,\n\ 7524such that sub is contained within s[start:end]. 
Optional\n\ 7525arguments start and end are interpreted as in slice notation.\n\ 7526\n\ 7527Return -1 on failure."); 7528 7529static PyObject * 7530unicode_find(PyUnicodeObject *self, PyObject *args) 7531{ 7532 PyObject *substring; 7533 Py_ssize_t start; 7534 Py_ssize_t end; 7535 Py_ssize_t result; 7536 7537 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7538 return NULL; 7539 7540 result = stringlib_find_slice( 7541 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7542 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7543 start, end 7544 ); 7545 7546 Py_DECREF(substring); 7547 7548 return PyLong_FromSsize_t(result); 7549} 7550 7551static PyObject * 7552unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7553{ 7554 if (index < 0 || index >= self->length) { 7555 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7556 return NULL; 7557 } 7558 7559 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7560} 7561 7562/* Believe it or not, this produces the same value for ASCII strings 7563 as string_hash(). 
*/ 7564static Py_hash_t 7565unicode_hash(PyUnicodeObject *self) 7566{ 7567 Py_ssize_t len; 7568 Py_UNICODE *p; 7569 Py_hash_t x; 7570 7571 if (self->hash != -1) 7572 return self->hash; 7573 len = Py_SIZE(self); 7574 p = self->str; 7575 x = *p << 7; 7576 while (--len >= 0) 7577 x = (1000003*x) ^ *p++; 7578 x ^= Py_SIZE(self); 7579 if (x == -1) 7580 x = -2; 7581 self->hash = x; 7582 return x; 7583} 7584 7585PyDoc_STRVAR(index__doc__, 7586 "S.index(sub[, start[, end]]) -> int\n\ 7587\n\ 7588Like S.find() but raise ValueError when the substring is not found."); 7589 7590static PyObject * 7591unicode_index(PyUnicodeObject *self, PyObject *args) 7592{ 7593 Py_ssize_t result; 7594 PyObject *substring; 7595 Py_ssize_t start; 7596 Py_ssize_t end; 7597 7598 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7599 return NULL; 7600 7601 result = stringlib_find_slice( 7602 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7603 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7604 start, end 7605 ); 7606 7607 Py_DECREF(substring); 7608 7609 if (result < 0) { 7610 PyErr_SetString(PyExc_ValueError, "substring not found"); 7611 return NULL; 7612 } 7613 7614 return PyLong_FromSsize_t(result); 7615} 7616 7617PyDoc_STRVAR(islower__doc__, 7618 "S.islower() -> bool\n\ 7619\n\ 7620Return True if all cased characters in S are lowercase and there is\n\ 7621at least one cased character in S, False otherwise."); 7622 7623static PyObject* 7624unicode_islower(PyUnicodeObject *self) 7625{ 7626 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7627 register const Py_UNICODE *e; 7628 int cased; 7629 7630 /* Shortcut for single character strings */ 7631 if (PyUnicode_GET_SIZE(self) == 1) 7632 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7633 7634 /* Special case for empty strings */ 7635 if (PyUnicode_GET_SIZE(self) == 0) 7636 return PyBool_FromLong(0); 7637 7638 e = p + PyUnicode_GET_SIZE(self); 7639 cased = 0; 7640 for (; p < e; p++) { 7641 register const 
Py_UNICODE ch = *p; 7642 7643 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7644 return PyBool_FromLong(0); 7645 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7646 cased = 1; 7647 } 7648 return PyBool_FromLong(cased); 7649} 7650 7651PyDoc_STRVAR(isupper__doc__, 7652 "S.isupper() -> bool\n\ 7653\n\ 7654Return True if all cased characters in S are uppercase and there is\n\ 7655at least one cased character in S, False otherwise."); 7656 7657static PyObject* 7658unicode_isupper(PyUnicodeObject *self) 7659{ 7660 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7661 register const Py_UNICODE *e; 7662 int cased; 7663 7664 /* Shortcut for single character strings */ 7665 if (PyUnicode_GET_SIZE(self) == 1) 7666 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7667 7668 /* Special case for empty strings */ 7669 if (PyUnicode_GET_SIZE(self) == 0) 7670 return PyBool_FromLong(0); 7671 7672 e = p + PyUnicode_GET_SIZE(self); 7673 cased = 0; 7674 for (; p < e; p++) { 7675 register const Py_UNICODE ch = *p; 7676 7677 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7678 return PyBool_FromLong(0); 7679 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7680 cased = 1; 7681 } 7682 return PyBool_FromLong(cased); 7683} 7684 7685PyDoc_STRVAR(istitle__doc__, 7686 "S.istitle() -> bool\n\ 7687\n\ 7688Return True if S is a titlecased string and there is at least one\n\ 7689character in S, i.e. 
upper- and titlecase characters may only\n\ 7690follow uncased characters and lowercase characters only cased ones.\n\ 7691Return False otherwise."); 7692 7693static PyObject* 7694unicode_istitle(PyUnicodeObject *self) 7695{ 7696 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7697 register const Py_UNICODE *e; 7698 int cased, previous_is_cased; 7699 7700 /* Shortcut for single character strings */ 7701 if (PyUnicode_GET_SIZE(self) == 1) 7702 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7703 (Py_UNICODE_ISUPPER(*p) != 0)); 7704 7705 /* Special case for empty strings */ 7706 if (PyUnicode_GET_SIZE(self) == 0) 7707 return PyBool_FromLong(0); 7708 7709 e = p + PyUnicode_GET_SIZE(self); 7710 cased = 0; 7711 previous_is_cased = 0; 7712 for (; p < e; p++) { 7713 register const Py_UNICODE ch = *p; 7714 7715 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7716 if (previous_is_cased) 7717 return PyBool_FromLong(0); 7718 previous_is_cased = 1; 7719 cased = 1; 7720 } 7721 else if (Py_UNICODE_ISLOWER(ch)) { 7722 if (!previous_is_cased) 7723 return PyBool_FromLong(0); 7724 previous_is_cased = 1; 7725 cased = 1; 7726 } 7727 else 7728 previous_is_cased = 0; 7729 } 7730 return PyBool_FromLong(cased); 7731} 7732 7733PyDoc_STRVAR(isspace__doc__, 7734 "S.isspace() -> bool\n\ 7735\n\ 7736Return True if all characters in S are whitespace\n\ 7737and there is at least one character in S, False otherwise."); 7738 7739static PyObject* 7740unicode_isspace(PyUnicodeObject *self) 7741{ 7742 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7743 register const Py_UNICODE *e; 7744 7745 /* Shortcut for single character strings */ 7746 if (PyUnicode_GET_SIZE(self) == 1 && 7747 Py_UNICODE_ISSPACE(*p)) 7748 return PyBool_FromLong(1); 7749 7750 /* Special case for empty strings */ 7751 if (PyUnicode_GET_SIZE(self) == 0) 7752 return PyBool_FromLong(0); 7753 7754 e = p + PyUnicode_GET_SIZE(self); 7755 for (; p < e; p++) { 7756 if (!Py_UNICODE_ISSPACE(*p)) 7757 
return PyBool_FromLong(0); 7758 } 7759 return PyBool_FromLong(1); 7760} 7761 7762PyDoc_STRVAR(isalpha__doc__, 7763 "S.isalpha() -> bool\n\ 7764\n\ 7765Return True if all characters in S are alphabetic\n\ 7766and there is at least one character in S, False otherwise."); 7767 7768static PyObject* 7769unicode_isalpha(PyUnicodeObject *self) 7770{ 7771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7772 register const Py_UNICODE *e; 7773 7774 /* Shortcut for single character strings */ 7775 if (PyUnicode_GET_SIZE(self) == 1 && 7776 Py_UNICODE_ISALPHA(*p)) 7777 return PyBool_FromLong(1); 7778 7779 /* Special case for empty strings */ 7780 if (PyUnicode_GET_SIZE(self) == 0) 7781 return PyBool_FromLong(0); 7782 7783 e = p + PyUnicode_GET_SIZE(self); 7784 for (; p < e; p++) { 7785 if (!Py_UNICODE_ISALPHA(*p)) 7786 return PyBool_FromLong(0); 7787 } 7788 return PyBool_FromLong(1); 7789} 7790 7791PyDoc_STRVAR(isalnum__doc__, 7792 "S.isalnum() -> bool\n\ 7793\n\ 7794Return True if all characters in S are alphanumeric\n\ 7795and there is at least one character in S, False otherwise."); 7796 7797static PyObject* 7798unicode_isalnum(PyUnicodeObject *self) 7799{ 7800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7801 register const Py_UNICODE *e; 7802 7803 /* Shortcut for single character strings */ 7804 if (PyUnicode_GET_SIZE(self) == 1 && 7805 Py_UNICODE_ISALNUM(*p)) 7806 return PyBool_FromLong(1); 7807 7808 /* Special case for empty strings */ 7809 if (PyUnicode_GET_SIZE(self) == 0) 7810 return PyBool_FromLong(0); 7811 7812 e = p + PyUnicode_GET_SIZE(self); 7813 for (; p < e; p++) { 7814 if (!Py_UNICODE_ISALNUM(*p)) 7815 return PyBool_FromLong(0); 7816 } 7817 return PyBool_FromLong(1); 7818} 7819 7820PyDoc_STRVAR(isdecimal__doc__, 7821 "S.isdecimal() -> bool\n\ 7822\n\ 7823Return True if there are only decimal characters in S,\n\ 7824False otherwise."); 7825 7826static PyObject* 7827unicode_isdecimal(PyUnicodeObject *self) 7828{ 7829 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7830 register const Py_UNICODE *e; 7831 7832 /* Shortcut for single character strings */ 7833 if (PyUnicode_GET_SIZE(self) == 1 && 7834 Py_UNICODE_ISDECIMAL(*p)) 7835 return PyBool_FromLong(1); 7836 7837 /* Special case for empty strings */ 7838 if (PyUnicode_GET_SIZE(self) == 0) 7839 return PyBool_FromLong(0); 7840 7841 e = p + PyUnicode_GET_SIZE(self); 7842 for (; p < e; p++) { 7843 if (!Py_UNICODE_ISDECIMAL(*p)) 7844 return PyBool_FromLong(0); 7845 } 7846 return PyBool_FromLong(1); 7847} 7848 7849PyDoc_STRVAR(isdigit__doc__, 7850 "S.isdigit() -> bool\n\ 7851\n\ 7852Return True if all characters in S are digits\n\ 7853and there is at least one character in S, False otherwise."); 7854 7855static PyObject* 7856unicode_isdigit(PyUnicodeObject *self) 7857{ 7858 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7859 register const Py_UNICODE *e; 7860 7861 /* Shortcut for single character strings */ 7862 if (PyUnicode_GET_SIZE(self) == 1 && 7863 Py_UNICODE_ISDIGIT(*p)) 7864 return PyBool_FromLong(1); 7865 7866 /* Special case for empty strings */ 7867 if (PyUnicode_GET_SIZE(self) == 0) 7868 return PyBool_FromLong(0); 7869 7870 e = p + PyUnicode_GET_SIZE(self); 7871 for (; p < e; p++) { 7872 if (!Py_UNICODE_ISDIGIT(*p)) 7873 return PyBool_FromLong(0); 7874 } 7875 return PyBool_FromLong(1); 7876} 7877 7878PyDoc_STRVAR(isnumeric__doc__, 7879 "S.isnumeric() -> bool\n\ 7880\n\ 7881Return True if there are only numeric characters in S,\n\ 7882False otherwise."); 7883 7884static PyObject* 7885unicode_isnumeric(PyUnicodeObject *self) 7886{ 7887 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7888 register const Py_UNICODE *e; 7889 7890 /* Shortcut for single character strings */ 7891 if (PyUnicode_GET_SIZE(self) == 1 && 7892 Py_UNICODE_ISNUMERIC(*p)) 7893 return PyBool_FromLong(1); 7894 7895 /* Special case for empty strings */ 7896 if (PyUnicode_GET_SIZE(self) == 0) 7897 return PyBool_FromLong(0); 7898 7899 e = p 
+ PyUnicode_GET_SIZE(self); 7900 for (; p < e; p++) { 7901 if (!Py_UNICODE_ISNUMERIC(*p)) 7902 return PyBool_FromLong(0); 7903 } 7904 return PyBool_FromLong(1); 7905} 7906 7907int 7908PyUnicode_IsIdentifier(PyObject *self) 7909{ 7910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7911 register const Py_UNICODE *e; 7912 7913 /* Special case for empty strings */ 7914 if (PyUnicode_GET_SIZE(self) == 0) 7915 return 0; 7916 7917 /* PEP 3131 says that the first character must be in 7918 XID_Start and subsequent characters in XID_Continue, 7919 and for the ASCII range, the 2.x rules apply (i.e 7920 start with letters and underscore, continue with 7921 letters, digits, underscore). However, given the current 7922 definition of XID_Start and XID_Continue, it is sufficient 7923 to check just for these, except that _ must be allowed 7924 as starting an identifier. */ 7925 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7926 return 0; 7927 7928 e = p + PyUnicode_GET_SIZE(self); 7929 for (p++; p < e; p++) { 7930 if (!_PyUnicode_IsXidContinue(*p)) 7931 return 0; 7932 } 7933 return 1; 7934} 7935 7936PyDoc_STRVAR(isidentifier__doc__, 7937 "S.isidentifier() -> bool\n\ 7938\n\ 7939Return True if S is a valid identifier according\n\ 7940to the language definition."); 7941 7942static PyObject* 7943unicode_isidentifier(PyObject *self) 7944{ 7945 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7946} 7947 7948PyDoc_STRVAR(isprintable__doc__, 7949 "S.isprintable() -> bool\n\ 7950\n\ 7951Return True if all characters in S are considered\n\ 7952printable in repr() or S is empty, False otherwise."); 7953 7954static PyObject* 7955unicode_isprintable(PyObject *self) 7956{ 7957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7958 register const Py_UNICODE *e; 7959 7960 /* Shortcut for single character strings */ 7961 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 7962 Py_RETURN_TRUE; 7963 } 7964 7965 e = p + 
PyUnicode_GET_SIZE(self); 7966 for (; p < e; p++) { 7967 if (!Py_UNICODE_ISPRINTABLE(*p)) { 7968 Py_RETURN_FALSE; 7969 } 7970 } 7971 Py_RETURN_TRUE; 7972} 7973 7974PyDoc_STRVAR(join__doc__, 7975 "S.join(iterable) -> str\n\ 7976\n\ 7977Return a string which is the concatenation of the strings in the\n\ 7978iterable. The separator between elements is S."); 7979 7980static PyObject* 7981unicode_join(PyObject *self, PyObject *data) 7982{ 7983 return PyUnicode_Join(self, data); 7984} 7985 7986static Py_ssize_t 7987unicode_length(PyUnicodeObject *self) 7988{ 7989 return self->length; 7990} 7991 7992PyDoc_STRVAR(ljust__doc__, 7993 "S.ljust(width[, fillchar]) -> str\n\ 7994\n\ 7995Return S left-justified in a Unicode string of length width. Padding is\n\ 7996done using the specified fill character (default is a space)."); 7997 7998static PyObject * 7999unicode_ljust(PyUnicodeObject *self, PyObject *args) 8000{ 8001 Py_ssize_t width; 8002 Py_UNICODE fillchar = ' '; 8003 8004 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 8005 return NULL; 8006 8007 if (self->length >= width && PyUnicode_CheckExact(self)) { 8008 Py_INCREF(self); 8009 return (PyObject*) self; 8010 } 8011 8012 return (PyObject*) pad(self, 0, width - self->length, fillchar); 8013} 8014 8015PyDoc_STRVAR(lower__doc__, 8016 "S.lower() -> str\n\ 8017\n\ 8018Return a copy of the string S converted to lowercase."); 8019 8020static PyObject* 8021unicode_lower(PyUnicodeObject *self) 8022{ 8023 return fixup(self, fixlower); 8024} 8025 8026#define LEFTSTRIP 0 8027#define RIGHTSTRIP 1 8028#define BOTHSTRIP 2 8029 8030/* Arrays indexed by above */ 8031static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 8032 8033#define STRIPNAME(i) (stripformat[i]+3) 8034 8035/* externally visible for str.strip(unicode) */ 8036PyObject * 8037_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 8038{ 8039 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8040 Py_ssize_t len = 
PyUnicode_GET_SIZE(self); 8041 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 8042 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 8043 Py_ssize_t i, j; 8044 8045 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 8046 8047 i = 0; 8048 if (striptype != RIGHTSTRIP) { 8049 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 8050 i++; 8051 } 8052 } 8053 8054 j = len; 8055 if (striptype != LEFTSTRIP) { 8056 do { 8057 j--; 8058 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 8059 j++; 8060 } 8061 8062 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8063 Py_INCREF(self); 8064 return (PyObject*)self; 8065 } 8066 else 8067 return PyUnicode_FromUnicode(s+i, j-i); 8068} 8069 8070 8071static PyObject * 8072do_strip(PyUnicodeObject *self, int striptype) 8073{ 8074 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 8075 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 8076 8077 i = 0; 8078 if (striptype != RIGHTSTRIP) { 8079 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 8080 i++; 8081 } 8082 } 8083 8084 j = len; 8085 if (striptype != LEFTSTRIP) { 8086 do { 8087 j--; 8088 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 8089 j++; 8090 } 8091 8092 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 8093 Py_INCREF(self); 8094 return (PyObject*)self; 8095 } 8096 else 8097 return PyUnicode_FromUnicode(s+i, j-i); 8098} 8099 8100 8101static PyObject * 8102do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 8103{ 8104 PyObject *sep = NULL; 8105 8106 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 8107 return NULL; 8108 8109 if (sep != NULL && sep != Py_None) { 8110 if (PyUnicode_Check(sep)) 8111 return _PyUnicode_XStrip(self, striptype, sep); 8112 else { 8113 PyErr_Format(PyExc_TypeError, 8114 "%s arg must be None or str", 8115 STRIPNAME(striptype)); 8116 return NULL; 8117 } 8118 } 8119 8120 return do_strip(self, striptype); 8121} 8122 8123 8124PyDoc_STRVAR(strip__doc__, 8125 "S.strip([chars]) -> str\n\ 8126\n\ 8127Return a 
copy of the string S with leading and trailing\n\ 8128whitespace removed.\n\ 8129If chars is given and not None, remove characters in chars instead."); 8130 8131static PyObject * 8132unicode_strip(PyUnicodeObject *self, PyObject *args) 8133{ 8134 if (PyTuple_GET_SIZE(args) == 0) 8135 return do_strip(self, BOTHSTRIP); /* Common case */ 8136 else 8137 return do_argstrip(self, BOTHSTRIP, args); 8138} 8139 8140 8141PyDoc_STRVAR(lstrip__doc__, 8142 "S.lstrip([chars]) -> str\n\ 8143\n\ 8144Return a copy of the string S with leading whitespace removed.\n\ 8145If chars is given and not None, remove characters in chars instead."); 8146 8147static PyObject * 8148unicode_lstrip(PyUnicodeObject *self, PyObject *args) 8149{ 8150 if (PyTuple_GET_SIZE(args) == 0) 8151 return do_strip(self, LEFTSTRIP); /* Common case */ 8152 else 8153 return do_argstrip(self, LEFTSTRIP, args); 8154} 8155 8156 8157PyDoc_STRVAR(rstrip__doc__, 8158 "S.rstrip([chars]) -> str\n\ 8159\n\ 8160Return a copy of the string S with trailing whitespace removed.\n\ 8161If chars is given and not None, remove characters in chars instead."); 8162 8163static PyObject * 8164unicode_rstrip(PyUnicodeObject *self, PyObject *args) 8165{ 8166 if (PyTuple_GET_SIZE(args) == 0) 8167 return do_strip(self, RIGHTSTRIP); /* Common case */ 8168 else 8169 return do_argstrip(self, RIGHTSTRIP, args); 8170} 8171 8172 8173static PyObject* 8174unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 8175{ 8176 PyUnicodeObject *u; 8177 Py_UNICODE *p; 8178 Py_ssize_t nchars; 8179 size_t nbytes; 8180 8181 if (len < 1) { 8182 Py_INCREF(unicode_empty); 8183 return (PyObject *)unicode_empty; 8184 } 8185 8186 if (len == 1 && PyUnicode_CheckExact(str)) { 8187 /* no repeat, return original string */ 8188 Py_INCREF(str); 8189 return (PyObject*) str; 8190 } 8191 8192 /* ensure # of chars needed doesn't overflow int and # of bytes 8193 * needed doesn't overflow size_t 8194 */ 8195 nchars = len * str->length; 8196 if (nchars / len != str->length) { 
8197 PyErr_SetString(PyExc_OverflowError, 8198 "repeated string is too long"); 8199 return NULL; 8200 } 8201 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 8202 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 8203 PyErr_SetString(PyExc_OverflowError, 8204 "repeated string is too long"); 8205 return NULL; 8206 } 8207 u = _PyUnicode_New(nchars); 8208 if (!u) 8209 return NULL; 8210 8211 p = u->str; 8212 8213 if (str->length == 1) { 8214 Py_UNICODE_FILL(p, str->str[0], len); 8215 } else { 8216 Py_ssize_t done = str->length; /* number of characters copied this far */ 8217 Py_UNICODE_COPY(p, str->str, str->length); 8218 while (done < nchars) { 8219 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 8220 Py_UNICODE_COPY(p+done, p, n); 8221 done += n; 8222 } 8223 } 8224 8225 return (PyObject*) u; 8226} 8227 8228PyObject *PyUnicode_Replace(PyObject *obj, 8229 PyObject *subobj, 8230 PyObject *replobj, 8231 Py_ssize_t maxcount) 8232{ 8233 PyObject *self; 8234 PyObject *str1; 8235 PyObject *str2; 8236 PyObject *result; 8237 8238 self = PyUnicode_FromObject(obj); 8239 if (self == NULL) 8240 return NULL; 8241 str1 = PyUnicode_FromObject(subobj); 8242 if (str1 == NULL) { 8243 Py_DECREF(self); 8244 return NULL; 8245 } 8246 str2 = PyUnicode_FromObject(replobj); 8247 if (str2 == NULL) { 8248 Py_DECREF(self); 8249 Py_DECREF(str1); 8250 return NULL; 8251 } 8252 result = replace((PyUnicodeObject *)self, 8253 (PyUnicodeObject *)str1, 8254 (PyUnicodeObject *)str2, 8255 maxcount); 8256 Py_DECREF(self); 8257 Py_DECREF(str1); 8258 Py_DECREF(str2); 8259 return result; 8260} 8261 8262PyDoc_STRVAR(replace__doc__, 8263 "S.replace(old, new[, count]) -> str\n\ 8264\n\ 8265Return a copy of S with all occurrences of substring\n\ 8266old replaced by new. 
If the optional argument count is\n\ 8267given, only the first count occurrences are replaced."); 8268 8269static PyObject* 8270unicode_replace(PyUnicodeObject *self, PyObject *args) 8271{ 8272 PyUnicodeObject *str1; 8273 PyUnicodeObject *str2; 8274 Py_ssize_t maxcount = -1; 8275 PyObject *result; 8276 8277 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8278 return NULL; 8279 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8280 if (str1 == NULL) 8281 return NULL; 8282 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8283 if (str2 == NULL) { 8284 Py_DECREF(str1); 8285 return NULL; 8286 } 8287 8288 result = replace(self, str1, str2, maxcount); 8289 8290 Py_DECREF(str1); 8291 Py_DECREF(str2); 8292 return result; 8293} 8294 8295static 8296PyObject *unicode_repr(PyObject *unicode) 8297{ 8298 PyObject *repr; 8299 Py_UNICODE *p; 8300 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8301 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8302 8303 /* XXX(nnorwitz): rather than over-allocating, it would be 8304 better to choose a different scheme. Perhaps scan the 8305 first N-chars of the string and allocate based on that size. 8306 */ 8307 /* Initial allocation is based on the longest-possible unichr 8308 escape. 8309 8310 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8311 unichr, so in this case it's the longest unichr escape. In 8312 narrow (UTF-16) builds this is five chars per source unichr 8313 since there are two unichrs in the surrogate pair, so in narrow 8314 (UTF-16) builds it's not the longest unichr escape. 8315 8316 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8317 so in the narrow (UTF-16) build case it's the longest unichr 8318 escape. 
8319 */ 8320 8321 repr = PyUnicode_FromUnicode(NULL, 8322 2 /* quotes */ 8323#ifdef Py_UNICODE_WIDE 8324 + 10*size 8325#else 8326 + 6*size 8327#endif 8328 + 1); 8329 if (repr == NULL) 8330 return NULL; 8331 8332 p = PyUnicode_AS_UNICODE(repr); 8333 8334 /* Add quote */ 8335 *p++ = (findchar(s, size, '\'') && 8336 !findchar(s, size, '"')) ? '"' : '\''; 8337 while (size-- > 0) { 8338 Py_UNICODE ch = *s++; 8339 8340 /* Escape quotes and backslashes */ 8341 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8342 *p++ = '\\'; 8343 *p++ = ch; 8344 continue; 8345 } 8346 8347 /* Map special whitespace to '\t', \n', '\r' */ 8348 if (ch == '\t') { 8349 *p++ = '\\'; 8350 *p++ = 't'; 8351 } 8352 else if (ch == '\n') { 8353 *p++ = '\\'; 8354 *p++ = 'n'; 8355 } 8356 else if (ch == '\r') { 8357 *p++ = '\\'; 8358 *p++ = 'r'; 8359 } 8360 8361 /* Map non-printable US ASCII to '\xhh' */ 8362 else if (ch < ' ' || ch == 0x7F) { 8363 *p++ = '\\'; 8364 *p++ = 'x'; 8365 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8366 *p++ = hexdigits[ch & 0x000F]; 8367 } 8368 8369 /* Copy ASCII characters as-is */ 8370 else if (ch < 0x7F) { 8371 *p++ = ch; 8372 } 8373 8374 /* Non-ASCII characters */ 8375 else { 8376 Py_UCS4 ucs = ch; 8377 8378#ifndef Py_UNICODE_WIDE 8379 Py_UNICODE ch2 = 0; 8380 /* Get code point from surrogate pair */ 8381 if (size > 0) { 8382 ch2 = *s; 8383 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8384 && ch2 <= 0xDFFF) { 8385 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8386 + 0x00010000; 8387 s++; 8388 size--; 8389 } 8390 } 8391#endif 8392 /* Map Unicode whitespace and control characters 8393 (categories Z* and C* except ASCII space) 8394 */ 8395 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8396 /* Map 8-bit characters to '\xhh' */ 8397 if (ucs <= 0xff) { 8398 *p++ = '\\'; 8399 *p++ = 'x'; 8400 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8401 *p++ = hexdigits[ch & 0x000F]; 8402 } 8403 /* Map 21-bit characters to '\U00xxxxxx' */ 8404 else if (ucs >= 0x10000) { 8405 *p++ = '\\'; 8406 
*p++ = 'U'; 8407 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8408 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8409 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8410 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8411 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8412 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8413 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8414 *p++ = hexdigits[ucs & 0x0000000F]; 8415 } 8416 /* Map 16-bit characters to '\uxxxx' */ 8417 else { 8418 *p++ = '\\'; 8419 *p++ = 'u'; 8420 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8421 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8422 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8423 *p++ = hexdigits[ucs & 0x000F]; 8424 } 8425 } 8426 /* Copy characters as-is */ 8427 else { 8428 *p++ = ch; 8429#ifndef Py_UNICODE_WIDE 8430 if (ucs >= 0x10000) 8431 *p++ = ch2; 8432#endif 8433 } 8434 } 8435 } 8436 /* Add quote */ 8437 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8438 8439 *p = '\0'; 8440 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8441 return repr; 8442} 8443 8444PyDoc_STRVAR(rfind__doc__, 8445 "S.rfind(sub[, start[, end]]) -> int\n\ 8446\n\ 8447Return the highest index in S where substring sub is found,\n\ 8448such that sub is contained within s[start:end]. 
Optional\n\ 8449arguments start and end are interpreted as in slice notation.\n\ 8450\n\ 8451Return -1 on failure."); 8452 8453static PyObject * 8454unicode_rfind(PyUnicodeObject *self, PyObject *args) 8455{ 8456 PyObject *substring; 8457 Py_ssize_t start; 8458 Py_ssize_t end; 8459 Py_ssize_t result; 8460 8461 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8462 return NULL; 8463 8464 result = stringlib_rfind_slice( 8465 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8466 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8467 start, end 8468 ); 8469 8470 Py_DECREF(substring); 8471 8472 return PyLong_FromSsize_t(result); 8473} 8474 8475PyDoc_STRVAR(rindex__doc__, 8476 "S.rindex(sub[, start[, end]]) -> int\n\ 8477\n\ 8478Like S.rfind() but raise ValueError when the substring is not found."); 8479 8480static PyObject * 8481unicode_rindex(PyUnicodeObject *self, PyObject *args) 8482{ 8483 PyObject *substring; 8484 Py_ssize_t start; 8485 Py_ssize_t end; 8486 Py_ssize_t result; 8487 8488 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8489 return NULL; 8490 8491 result = stringlib_rfind_slice( 8492 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8493 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8494 start, end 8495 ); 8496 8497 Py_DECREF(substring); 8498 8499 if (result < 0) { 8500 PyErr_SetString(PyExc_ValueError, "substring not found"); 8501 return NULL; 8502 } 8503 return PyLong_FromSsize_t(result); 8504} 8505 8506PyDoc_STRVAR(rjust__doc__, 8507 "S.rjust(width[, fillchar]) -> str\n\ 8508\n\ 8509Return S right-justified in a string of length width. 
Padding is\n\ 8510done using the specified fill character (default is a space)."); 8511 8512static PyObject * 8513unicode_rjust(PyUnicodeObject *self, PyObject *args) 8514{ 8515 Py_ssize_t width; 8516 Py_UNICODE fillchar = ' '; 8517 8518 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8519 return NULL; 8520 8521 if (self->length >= width && PyUnicode_CheckExact(self)) { 8522 Py_INCREF(self); 8523 return (PyObject*) self; 8524 } 8525 8526 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8527} 8528 8529PyObject *PyUnicode_Split(PyObject *s, 8530 PyObject *sep, 8531 Py_ssize_t maxsplit) 8532{ 8533 PyObject *result; 8534 8535 s = PyUnicode_FromObject(s); 8536 if (s == NULL) 8537 return NULL; 8538 if (sep != NULL) { 8539 sep = PyUnicode_FromObject(sep); 8540 if (sep == NULL) { 8541 Py_DECREF(s); 8542 return NULL; 8543 } 8544 } 8545 8546 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8547 8548 Py_DECREF(s); 8549 Py_XDECREF(sep); 8550 return result; 8551} 8552 8553PyDoc_STRVAR(split__doc__, 8554 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8555\n\ 8556Return a list of the words in S, using sep as the\n\ 8557delimiter string. If maxsplit is given, at most maxsplit\n\ 8558splits are done. 
If sep is not specified or is None, any\n\ 8559whitespace string is a separator and empty strings are\n\ 8560removed from the result."); 8561 8562static PyObject* 8563unicode_split(PyUnicodeObject *self, PyObject *args) 8564{ 8565 PyObject *substring = Py_None; 8566 Py_ssize_t maxcount = -1; 8567 8568 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8569 return NULL; 8570 8571 if (substring == Py_None) 8572 return split(self, NULL, maxcount); 8573 else if (PyUnicode_Check(substring)) 8574 return split(self, (PyUnicodeObject *)substring, maxcount); 8575 else 8576 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8577} 8578 8579PyObject * 8580PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8581{ 8582 PyObject* str_obj; 8583 PyObject* sep_obj; 8584 PyObject* out; 8585 8586 str_obj = PyUnicode_FromObject(str_in); 8587 if (!str_obj) 8588 return NULL; 8589 sep_obj = PyUnicode_FromObject(sep_in); 8590 if (!sep_obj) { 8591 Py_DECREF(str_obj); 8592 return NULL; 8593 } 8594 8595 out = stringlib_partition( 8596 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8597 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8598 ); 8599 8600 Py_DECREF(sep_obj); 8601 Py_DECREF(str_obj); 8602 8603 return out; 8604} 8605 8606 8607PyObject * 8608PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8609{ 8610 PyObject* str_obj; 8611 PyObject* sep_obj; 8612 PyObject* out; 8613 8614 str_obj = PyUnicode_FromObject(str_in); 8615 if (!str_obj) 8616 return NULL; 8617 sep_obj = PyUnicode_FromObject(sep_in); 8618 if (!sep_obj) { 8619 Py_DECREF(str_obj); 8620 return NULL; 8621 } 8622 8623 out = stringlib_rpartition( 8624 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8625 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8626 ); 8627 8628 Py_DECREF(sep_obj); 8629 Py_DECREF(str_obj); 8630 8631 return out; 8632} 8633 8634PyDoc_STRVAR(partition__doc__, 8635 "S.partition(sep) -> (head, 
sep, tail)\n\ 8636\n\ 8637Search for the separator sep in S, and return the part before it,\n\ 8638the separator itself, and the part after it. If the separator is not\n\ 8639found, return S and two empty strings."); 8640 8641static PyObject* 8642unicode_partition(PyUnicodeObject *self, PyObject *separator) 8643{ 8644 return PyUnicode_Partition((PyObject *)self, separator); 8645} 8646 8647PyDoc_STRVAR(rpartition__doc__, 8648 "S.rpartition(sep) -> (head, sep, tail)\n\ 8649\n\ 8650Search for the separator sep in S, starting at the end of S, and return\n\ 8651the part before it, the separator itself, and the part after it. If the\n\ 8652separator is not found, return two empty strings and S."); 8653 8654static PyObject* 8655unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8656{ 8657 return PyUnicode_RPartition((PyObject *)self, separator); 8658} 8659 8660PyObject *PyUnicode_RSplit(PyObject *s, 8661 PyObject *sep, 8662 Py_ssize_t maxsplit) 8663{ 8664 PyObject *result; 8665 8666 s = PyUnicode_FromObject(s); 8667 if (s == NULL) 8668 return NULL; 8669 if (sep != NULL) { 8670 sep = PyUnicode_FromObject(sep); 8671 if (sep == NULL) { 8672 Py_DECREF(s); 8673 return NULL; 8674 } 8675 } 8676 8677 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8678 8679 Py_DECREF(s); 8680 Py_XDECREF(sep); 8681 return result; 8682} 8683 8684PyDoc_STRVAR(rsplit__doc__, 8685 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8686\n\ 8687Return a list of the words in S, using sep as the\n\ 8688delimiter string, starting at the end of the string and\n\ 8689working to the front. If maxsplit is given, at most maxsplit\n\ 8690splits are done. 
If sep is not specified, any whitespace string\n\ 8691is a separator."); 8692 8693static PyObject* 8694unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8695{ 8696 PyObject *substring = Py_None; 8697 Py_ssize_t maxcount = -1; 8698 8699 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8700 return NULL; 8701 8702 if (substring == Py_None) 8703 return rsplit(self, NULL, maxcount); 8704 else if (PyUnicode_Check(substring)) 8705 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8706 else 8707 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8708} 8709 8710PyDoc_STRVAR(splitlines__doc__, 8711 "S.splitlines([keepends]) -> list of strings\n\ 8712\n\ 8713Return a list of the lines in S, breaking at line boundaries.\n\ 8714Line breaks are not included in the resulting list unless keepends\n\ 8715is given and true."); 8716 8717static PyObject* 8718unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8719{ 8720 int keepends = 0; 8721 8722 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8723 return NULL; 8724 8725 return PyUnicode_Splitlines((PyObject *)self, keepends); 8726} 8727 8728static 8729PyObject *unicode_str(PyObject *self) 8730{ 8731 if (PyUnicode_CheckExact(self)) { 8732 Py_INCREF(self); 8733 return self; 8734 } else 8735 /* Subtype -- return genuine unicode string with the same value. 
*/ 8736 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8737 PyUnicode_GET_SIZE(self)); 8738} 8739 8740PyDoc_STRVAR(swapcase__doc__, 8741 "S.swapcase() -> str\n\ 8742\n\ 8743Return a copy of S with uppercase characters converted to lowercase\n\ 8744and vice versa."); 8745 8746static PyObject* 8747unicode_swapcase(PyUnicodeObject *self) 8748{ 8749 return fixup(self, fixswapcase); 8750} 8751 8752PyDoc_STRVAR(maketrans__doc__, 8753 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8754\n\ 8755Return a translation table usable for str.translate().\n\ 8756If there is only one argument, it must be a dictionary mapping Unicode\n\ 8757ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8758Character keys will be then converted to ordinals.\n\ 8759If there are two arguments, they must be strings of equal length, and\n\ 8760in the resulting dictionary, each character in x will be mapped to the\n\ 8761character at the same position in y. If there is a third argument, it\n\ 8762must be a string, whose characters will be mapped to None in the result."); 8763 8764static PyObject* 8765unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8766{ 8767 PyObject *x, *y = NULL, *z = NULL; 8768 PyObject *new = NULL, *key, *value; 8769 Py_ssize_t i = 0; 8770 int res; 8771 8772 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8773 return NULL; 8774 new = PyDict_New(); 8775 if (!new) 8776 return NULL; 8777 if (y != NULL) { 8778 /* x must be a string too, of equal length */ 8779 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8780 if (!PyUnicode_Check(x)) { 8781 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8782 "be a string if there is a second argument"); 8783 goto err; 8784 } 8785 if (PyUnicode_GET_SIZE(x) != ylen) { 8786 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8787 "arguments must have equal length"); 8788 goto err; 8789 } 8790 /* create entries for translating chars in x to those in y */ 8791 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 8792 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8793 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8794 if (!key || !value) 8795 goto err; 8796 res = PyDict_SetItem(new, key, value); 8797 Py_DECREF(key); 8798 Py_DECREF(value); 8799 if (res < 0) 8800 goto err; 8801 } 8802 /* create entries for deleting chars in z */ 8803 if (z != NULL) { 8804 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8805 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8806 if (!key) 8807 goto err; 8808 res = PyDict_SetItem(new, key, Py_None); 8809 Py_DECREF(key); 8810 if (res < 0) 8811 goto err; 8812 } 8813 } 8814 } else { 8815 /* x must be a dict */ 8816 if (!PyDict_CheckExact(x)) { 8817 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8818 "to maketrans it must be a dict"); 8819 goto err; 8820 } 8821 /* copy entries into the new dict, converting string keys to int keys */ 8822 while (PyDict_Next(x, &i, &key, &value)) { 8823 if (PyUnicode_Check(key)) { 8824 /* convert string keys to integer keys */ 8825 PyObject *newkey; 8826 if (PyUnicode_GET_SIZE(key) != 1) { 8827 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8828 "table must be of length 1"); 8829 goto err; 8830 } 8831 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8832 if (!newkey) 8833 goto err; 8834 res = PyDict_SetItem(new, newkey, value); 8835 Py_DECREF(newkey); 8836 if (res < 0) 8837 goto err; 8838 } else if (PyLong_Check(key)) { 8839 /* just keep integer keys */ 8840 if (PyDict_SetItem(new, key, value) < 0) 8841 goto err; 8842 } else { 8843 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8844 "be strings or integers"); 8845 goto err; 8846 } 8847 } 8848 } 8849 return new; 8850 err: 8851 Py_DECREF(new); 8852 return NULL; 8853} 8854 8855PyDoc_STRVAR(translate__doc__, 8856 "S.translate(table) -> str\n\ 8857\n\ 8858Return a copy of the string S, where all characters have been mapped\n\ 8859through the given translation table, 
which must be a mapping of\n\ 8860Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8861Unmapped characters are left untouched. Characters mapped to None\n\ 8862are deleted."); 8863 8864static PyObject* 8865unicode_translate(PyUnicodeObject *self, PyObject *table) 8866{ 8867 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8868} 8869 8870PyDoc_STRVAR(upper__doc__, 8871 "S.upper() -> str\n\ 8872\n\ 8873Return a copy of S converted to uppercase."); 8874 8875static PyObject* 8876unicode_upper(PyUnicodeObject *self) 8877{ 8878 return fixup(self, fixupper); 8879} 8880 8881PyDoc_STRVAR(zfill__doc__, 8882 "S.zfill(width) -> str\n\ 8883\n\ 8884Pad a numeric string S with zeros on the left, to fill a field\n\ 8885of the specified width. The string S is never truncated."); 8886 8887static PyObject * 8888unicode_zfill(PyUnicodeObject *self, PyObject *args) 8889{ 8890 Py_ssize_t fill; 8891 PyUnicodeObject *u; 8892 8893 Py_ssize_t width; 8894 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8895 return NULL; 8896 8897 if (self->length >= width) { 8898 if (PyUnicode_CheckExact(self)) { 8899 Py_INCREF(self); 8900 return (PyObject*) self; 8901 } 8902 else 8903 return PyUnicode_FromUnicode( 8904 PyUnicode_AS_UNICODE(self), 8905 PyUnicode_GET_SIZE(self) 8906 ); 8907 } 8908 8909 fill = width - self->length; 8910 8911 u = pad(self, fill, 0, '0'); 8912 8913 if (u == NULL) 8914 return NULL; 8915 8916 if (u->str[fill] == '+' || u->str[fill] == '-') { 8917 /* move sign to beginning of string */ 8918 u->str[0] = u->str[fill]; 8919 u->str[fill] = '0'; 8920 } 8921 8922 return (PyObject*) u; 8923} 8924 8925#if 0 8926static PyObject* 8927unicode_freelistsize(PyUnicodeObject *self) 8928{ 8929 return PyLong_FromLong(numfree); 8930} 8931#endif 8932 8933PyDoc_STRVAR(startswith__doc__, 8934 "S.startswith(prefix[, start[, end]]) -> bool\n\ 8935\n\ 8936Return True if S starts with the specified prefix, False otherwise.\n\ 8937With optional start, test S beginning 
at that position.\n\ 8938With optional end, stop comparing S at that position.\n\ 8939prefix can also be a tuple of strings to try."); 8940 8941static PyObject * 8942unicode_startswith(PyUnicodeObject *self, 8943 PyObject *args) 8944{ 8945 PyObject *subobj; 8946 PyUnicodeObject *substring; 8947 Py_ssize_t start = 0; 8948 Py_ssize_t end = PY_SSIZE_T_MAX; 8949 int result; 8950 8951 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8952 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8953 return NULL; 8954 if (PyTuple_Check(subobj)) { 8955 Py_ssize_t i; 8956 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8957 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8958 PyTuple_GET_ITEM(subobj, i)); 8959 if (substring == NULL) 8960 return NULL; 8961 result = tailmatch(self, substring, start, end, -1); 8962 Py_DECREF(substring); 8963 if (result) { 8964 Py_RETURN_TRUE; 8965 } 8966 } 8967 /* nothing matched */ 8968 Py_RETURN_FALSE; 8969 } 8970 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8971 if (substring == NULL) 8972 return NULL; 8973 result = tailmatch(self, substring, start, end, -1); 8974 Py_DECREF(substring); 8975 return PyBool_FromLong(result); 8976} 8977 8978 8979PyDoc_STRVAR(endswith__doc__, 8980 "S.endswith(suffix[, start[, end]]) -> bool\n\ 8981\n\ 8982Return True if S ends with the specified suffix, False otherwise.\n\ 8983With optional start, test S beginning at that position.\n\ 8984With optional end, stop comparing S at that position.\n\ 8985suffix can also be a tuple of strings to try."); 8986 8987static PyObject * 8988unicode_endswith(PyUnicodeObject *self, 8989 PyObject *args) 8990{ 8991 PyObject *subobj; 8992 PyUnicodeObject *substring; 8993 Py_ssize_t start = 0; 8994 Py_ssize_t end = PY_SSIZE_T_MAX; 8995 int result; 8996 8997 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8998 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8999 return NULL; 9000 if (PyTuple_Check(subobj)) { 9001 Py_ssize_t i; 9002 for (i 
= 0; i < PyTuple_GET_SIZE(subobj); i++) { 9003 substring = (PyUnicodeObject *)PyUnicode_FromObject( 9004 PyTuple_GET_ITEM(subobj, i)); 9005 if (substring == NULL) 9006 return NULL; 9007 result = tailmatch(self, substring, start, end, +1); 9008 Py_DECREF(substring); 9009 if (result) { 9010 Py_RETURN_TRUE; 9011 } 9012 } 9013 Py_RETURN_FALSE; 9014 } 9015 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 9016 if (substring == NULL) 9017 return NULL; 9018 9019 result = tailmatch(self, substring, start, end, +1); 9020 Py_DECREF(substring); 9021 return PyBool_FromLong(result); 9022} 9023 9024#include "stringlib/string_format.h" 9025 9026PyDoc_STRVAR(format__doc__, 9027 "S.format(*args, **kwargs) -> str\n\ 9028\n\ 9029Return a formatted version of S, using substitutions from args and kwargs.\n\ 9030The substitutions are identified by braces ('{' and '}')."); 9031 9032PyDoc_STRVAR(format_map__doc__, 9033 "S.format_map(mapping) -> str\n\ 9034\n\ 9035Return a formatted version of S, using substitutions from mapping.\n\ 9036The substitutions are identified by braces ('{' and '}')."); 9037 9038static PyObject * 9039unicode__format__(PyObject* self, PyObject* args) 9040{ 9041 PyObject *format_spec; 9042 9043 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 9044 return NULL; 9045 9046 return _PyUnicode_FormatAdvanced(self, 9047 PyUnicode_AS_UNICODE(format_spec), 9048 PyUnicode_GET_SIZE(format_spec)); 9049} 9050 9051PyDoc_STRVAR(p_format__doc__, 9052 "S.__format__(format_spec) -> str\n\ 9053\n\ 9054Return a formatted version of S as described by format_spec."); 9055 9056static PyObject * 9057unicode__sizeof__(PyUnicodeObject *v) 9058{ 9059 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 9060 sizeof(Py_UNICODE) * (v->length + 1)); 9061} 9062 9063PyDoc_STRVAR(sizeof__doc__, 9064 "S.__sizeof__() -> size of S in memory, in bytes"); 9065 9066static PyObject * 9067unicode_getnewargs(PyUnicodeObject *v) 9068{ 9069 return Py_BuildValue("(u#)", v->str, v->length); 
9070} 9071 9072 9073static PyMethodDef unicode_methods[] = { 9074 9075 /* Order is according to common usage: often used methods should 9076 appear first, since lookup is done sequentially. */ 9077 9078 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 9079 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 9080 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 9081 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 9082 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 9083 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 9084 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 9085 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 9086 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 9087 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 9088 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 9089 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 9090 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 9091 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 9092 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 9093 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 9094 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 9095 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 9096 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 9097 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 9098 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 9099 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 9100 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 9101 {"swapcase", (PyCFunction) 
unicode_swapcase, METH_NOARGS, swapcase__doc__}, 9102 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 9103 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 9104 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 9105 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 9106 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 9107 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 9108 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 9109 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 9110 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 9111 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 9112 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 9113 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 9114 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 9115 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 9116 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 9117 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 9118 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 9119 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 9120 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 9121 {"maketrans", (PyCFunction) unicode_maketrans, 9122 METH_VARARGS | METH_STATIC, maketrans__doc__}, 9123 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 9124#if 0 9125 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 9126#endif 9127 9128#if 0 9129 /* This one is just used for debugging the implementation. 
*/ 9130 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 9131#endif 9132 9133 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 9134 {NULL, NULL} 9135}; 9136 9137static PyObject * 9138unicode_mod(PyObject *v, PyObject *w) 9139{ 9140 if (!PyUnicode_Check(v)) { 9141 Py_INCREF(Py_NotImplemented); 9142 return Py_NotImplemented; 9143 } 9144 return PyUnicode_Format(v, w); 9145} 9146 9147static PyNumberMethods unicode_as_number = { 9148 0, /*nb_add*/ 9149 0, /*nb_subtract*/ 9150 0, /*nb_multiply*/ 9151 unicode_mod, /*nb_remainder*/ 9152}; 9153 9154static PySequenceMethods unicode_as_sequence = { 9155 (lenfunc) unicode_length, /* sq_length */ 9156 PyUnicode_Concat, /* sq_concat */ 9157 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 9158 (ssizeargfunc) unicode_getitem, /* sq_item */ 9159 0, /* sq_slice */ 9160 0, /* sq_ass_item */ 9161 0, /* sq_ass_slice */ 9162 PyUnicode_Contains, /* sq_contains */ 9163}; 9164 9165static PyObject* 9166unicode_subscript(PyUnicodeObject* self, PyObject* item) 9167{ 9168 if (PyIndex_Check(item)) { 9169 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 9170 if (i == -1 && PyErr_Occurred()) 9171 return NULL; 9172 if (i < 0) 9173 i += PyUnicode_GET_SIZE(self); 9174 return unicode_getitem(self, i); 9175 } else if (PySlice_Check(item)) { 9176 Py_ssize_t start, stop, step, slicelength, cur, i; 9177 Py_UNICODE* source_buf; 9178 Py_UNICODE* result_buf; 9179 PyObject* result; 9180 9181 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 9182 &start, &stop, &step, &slicelength) < 0) { 9183 return NULL; 9184 } 9185 9186 if (slicelength <= 0) { 9187 return PyUnicode_FromUnicode(NULL, 0); 9188 } else if (start == 0 && step == 1 && slicelength == self->length && 9189 PyUnicode_CheckExact(self)) { 9190 Py_INCREF(self); 9191 return (PyObject *)self; 9192 } else if (step == 1) { 9193 return PyUnicode_FromUnicode(self->str + start, slicelength); 9194 } else { 9195 source_buf = 
PyUnicode_AS_UNICODE((PyObject*)self); 9196 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 9197 sizeof(Py_UNICODE)); 9198 9199 if (result_buf == NULL) 9200 return PyErr_NoMemory(); 9201 9202 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 9203 result_buf[i] = source_buf[cur]; 9204 } 9205 9206 result = PyUnicode_FromUnicode(result_buf, slicelength); 9207 PyObject_FREE(result_buf); 9208 return result; 9209 } 9210 } else { 9211 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 9212 return NULL; 9213 } 9214} 9215 9216static PyMappingMethods unicode_as_mapping = { 9217 (lenfunc)unicode_length, /* mp_length */ 9218 (binaryfunc)unicode_subscript, /* mp_subscript */ 9219 (objobjargproc)0, /* mp_ass_subscript */ 9220}; 9221 9222 9223/* Helpers for PyUnicode_Format() */ 9224 9225static PyObject * 9226getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 9227{ 9228 Py_ssize_t argidx = *p_argidx; 9229 if (argidx < arglen) { 9230 (*p_argidx)++; 9231 if (arglen < 0) 9232 return args; 9233 else 9234 return PyTuple_GetItem(args, argidx); 9235 } 9236 PyErr_SetString(PyExc_TypeError, 9237 "not enough arguments for format string"); 9238 return NULL; 9239} 9240 9241/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 9242 9243static PyObject * 9244formatfloat(PyObject *v, int flags, int prec, int type) 9245{ 9246 char *p; 9247 PyObject *result; 9248 double x; 9249 9250 x = PyFloat_AsDouble(v); 9251 if (x == -1.0 && PyErr_Occurred()) 9252 return NULL; 9253 9254 if (prec < 0) 9255 prec = 6; 9256 9257 p = PyOS_double_to_string(x, type, prec, 9258 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 9259 if (p == NULL) 9260 return NULL; 9261 result = PyUnicode_FromStringAndSize(p, strlen(p)); 9262 PyMem_Free(p); 9263 return result; 9264} 9265 9266static PyObject* 9267formatlong(PyObject *val, int flags, int prec, int type) 9268{ 9269 char *buf; 9270 int len; 9271 PyObject *str; /* temporary string object. 
*/ 9272 PyObject *result; 9273 9274 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 9275 if (!str) 9276 return NULL; 9277 result = PyUnicode_FromStringAndSize(buf, len); 9278 Py_DECREF(str); 9279 return result; 9280} 9281 9282static int 9283formatchar(Py_UNICODE *buf, 9284 size_t buflen, 9285 PyObject *v) 9286{ 9287 /* presume that the buffer is at least 3 characters long */ 9288 if (PyUnicode_Check(v)) { 9289 if (PyUnicode_GET_SIZE(v) == 1) { 9290 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9291 buf[1] = '\0'; 9292 return 1; 9293 } 9294#ifndef Py_UNICODE_WIDE 9295 if (PyUnicode_GET_SIZE(v) == 2) { 9296 /* Decode a valid surrogate pair */ 9297 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9298 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9299 if (0xD800 <= c0 && c0 <= 0xDBFF && 9300 0xDC00 <= c1 && c1 <= 0xDFFF) { 9301 buf[0] = c0; 9302 buf[1] = c1; 9303 buf[2] = '\0'; 9304 return 2; 9305 } 9306 } 9307#endif 9308 goto onError; 9309 } 9310 else { 9311 /* Integer input truncated to a character */ 9312 long x; 9313 x = PyLong_AsLong(v); 9314 if (x == -1 && PyErr_Occurred()) 9315 goto onError; 9316 9317 if (x < 0 || x > 0x10ffff) { 9318 PyErr_SetString(PyExc_OverflowError, 9319 "%c arg not in range(0x110000)"); 9320 return -1; 9321 } 9322 9323#ifndef Py_UNICODE_WIDE 9324 if (x > 0xffff) { 9325 x -= 0x10000; 9326 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9327 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9328 return 2; 9329 } 9330#endif 9331 buf[0] = (Py_UNICODE) x; 9332 buf[1] = '\0'; 9333 return 1; 9334 } 9335 9336 onError: 9337 PyErr_SetString(PyExc_TypeError, 9338 "%c requires int or char"); 9339 return -1; 9340} 9341 9342/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 9343 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
9344*/ 9345#define FORMATBUFLEN (size_t)10 9346 9347PyObject *PyUnicode_Format(PyObject *format, 9348 PyObject *args) 9349{ 9350 Py_UNICODE *fmt, *res; 9351 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9352 int args_owned = 0; 9353 PyUnicodeObject *result = NULL; 9354 PyObject *dict = NULL; 9355 PyObject *uformat; 9356 9357 if (format == NULL || args == NULL) { 9358 PyErr_BadInternalCall(); 9359 return NULL; 9360 } 9361 uformat = PyUnicode_FromObject(format); 9362 if (uformat == NULL) 9363 return NULL; 9364 fmt = PyUnicode_AS_UNICODE(uformat); 9365 fmtcnt = PyUnicode_GET_SIZE(uformat); 9366 9367 reslen = rescnt = fmtcnt + 100; 9368 result = _PyUnicode_New(reslen); 9369 if (result == NULL) 9370 goto onError; 9371 res = PyUnicode_AS_UNICODE(result); 9372 9373 if (PyTuple_Check(args)) { 9374 arglen = PyTuple_Size(args); 9375 argidx = 0; 9376 } 9377 else { 9378 arglen = -1; 9379 argidx = -2; 9380 } 9381 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9382 !PyUnicode_Check(args)) 9383 dict = args; 9384 9385 while (--fmtcnt >= 0) { 9386 if (*fmt != '%') { 9387 if (--rescnt < 0) { 9388 rescnt = fmtcnt + 100; 9389 reslen += rescnt; 9390 if (_PyUnicode_Resize(&result, reslen) < 0) 9391 goto onError; 9392 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9393 --rescnt; 9394 } 9395 *res++ = *fmt++; 9396 } 9397 else { 9398 /* Got a format specifier */ 9399 int flags = 0; 9400 Py_ssize_t width = -1; 9401 int prec = -1; 9402 Py_UNICODE c = '\0'; 9403 Py_UNICODE fill; 9404 int isnumok; 9405 PyObject *v = NULL; 9406 PyObject *temp = NULL; 9407 Py_UNICODE *pbuf; 9408 Py_UNICODE sign; 9409 Py_ssize_t len; 9410 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9411 9412 fmt++; 9413 if (*fmt == '(') { 9414 Py_UNICODE *keystart; 9415 Py_ssize_t keylen; 9416 PyObject *key; 9417 int pcount = 1; 9418 9419 if (dict == NULL) { 9420 PyErr_SetString(PyExc_TypeError, 9421 "format requires a mapping"); 9422 goto onError; 9423 } 9424 ++fmt; 9425 --fmtcnt; 9426 
keystart = fmt; 9427 /* Skip over balanced parentheses */ 9428 while (pcount > 0 && --fmtcnt >= 0) { 9429 if (*fmt == ')') 9430 --pcount; 9431 else if (*fmt == '(') 9432 ++pcount; 9433 fmt++; 9434 } 9435 keylen = fmt - keystart - 1; 9436 if (fmtcnt < 0 || pcount > 0) { 9437 PyErr_SetString(PyExc_ValueError, 9438 "incomplete format key"); 9439 goto onError; 9440 } 9441#if 0 9442 /* keys are converted to strings using UTF-8 and 9443 then looked up since Python uses strings to hold 9444 variables names etc. in its namespaces and we 9445 wouldn't want to break common idioms. */ 9446 key = PyUnicode_EncodeUTF8(keystart, 9447 keylen, 9448 NULL); 9449#else 9450 key = PyUnicode_FromUnicode(keystart, keylen); 9451#endif 9452 if (key == NULL) 9453 goto onError; 9454 if (args_owned) { 9455 Py_DECREF(args); 9456 args_owned = 0; 9457 } 9458 args = PyObject_GetItem(dict, key); 9459 Py_DECREF(key); 9460 if (args == NULL) { 9461 goto onError; 9462 } 9463 args_owned = 1; 9464 arglen = -1; 9465 argidx = -2; 9466 } 9467 while (--fmtcnt >= 0) { 9468 switch (c = *fmt++) { 9469 case '-': flags |= F_LJUST; continue; 9470 case '+': flags |= F_SIGN; continue; 9471 case ' ': flags |= F_BLANK; continue; 9472 case '#': flags |= F_ALT; continue; 9473 case '0': flags |= F_ZERO; continue; 9474 } 9475 break; 9476 } 9477 if (c == '*') { 9478 v = getnextarg(args, arglen, &argidx); 9479 if (v == NULL) 9480 goto onError; 9481 if (!PyLong_Check(v)) { 9482 PyErr_SetString(PyExc_TypeError, 9483 "* wants int"); 9484 goto onError; 9485 } 9486 width = PyLong_AsLong(v); 9487 if (width == -1 && PyErr_Occurred()) 9488 goto onError; 9489 if (width < 0) { 9490 flags |= F_LJUST; 9491 width = -width; 9492 } 9493 if (--fmtcnt >= 0) 9494 c = *fmt++; 9495 } 9496 else if (c >= '0' && c <= '9') { 9497 width = c - '0'; 9498 while (--fmtcnt >= 0) { 9499 c = *fmt++; 9500 if (c < '0' || c > '9') 9501 break; 9502 if ((width*10) / 10 != width) { 9503 PyErr_SetString(PyExc_ValueError, 9504 "width too big"); 9505 goto 
onError; 9506 } 9507 width = width*10 + (c - '0'); 9508 } 9509 } 9510 if (c == '.') { 9511 prec = 0; 9512 if (--fmtcnt >= 0) 9513 c = *fmt++; 9514 if (c == '*') { 9515 v = getnextarg(args, arglen, &argidx); 9516 if (v == NULL) 9517 goto onError; 9518 if (!PyLong_Check(v)) { 9519 PyErr_SetString(PyExc_TypeError, 9520 "* wants int"); 9521 goto onError; 9522 } 9523 prec = PyLong_AsLong(v); 9524 if (prec == -1 && PyErr_Occurred()) 9525 goto onError; 9526 if (prec < 0) 9527 prec = 0; 9528 if (--fmtcnt >= 0) 9529 c = *fmt++; 9530 } 9531 else if (c >= '0' && c <= '9') { 9532 prec = c - '0'; 9533 while (--fmtcnt >= 0) { 9534 c = *fmt++; 9535 if (c < '0' || c > '9') 9536 break; 9537 if ((prec*10) / 10 != prec) { 9538 PyErr_SetString(PyExc_ValueError, 9539 "prec too big"); 9540 goto onError; 9541 } 9542 prec = prec*10 + (c - '0'); 9543 } 9544 } 9545 } /* prec */ 9546 if (fmtcnt >= 0) { 9547 if (c == 'h' || c == 'l' || c == 'L') { 9548 if (--fmtcnt >= 0) 9549 c = *fmt++; 9550 } 9551 } 9552 if (fmtcnt < 0) { 9553 PyErr_SetString(PyExc_ValueError, 9554 "incomplete format"); 9555 goto onError; 9556 } 9557 if (c != '%') { 9558 v = getnextarg(args, arglen, &argidx); 9559 if (v == NULL) 9560 goto onError; 9561 } 9562 sign = 0; 9563 fill = ' '; 9564 switch (c) { 9565 9566 case '%': 9567 pbuf = formatbuf; 9568 /* presume that buffer length is at least 1 */ 9569 pbuf[0] = '%'; 9570 len = 1; 9571 break; 9572 9573 case 's': 9574 case 'r': 9575 case 'a': 9576 if (PyUnicode_CheckExact(v) && c == 's') { 9577 temp = v; 9578 Py_INCREF(temp); 9579 } 9580 else { 9581 if (c == 's') 9582 temp = PyObject_Str(v); 9583 else if (c == 'r') 9584 temp = PyObject_Repr(v); 9585 else 9586 temp = PyObject_ASCII(v); 9587 if (temp == NULL) 9588 goto onError; 9589 if (PyUnicode_Check(temp)) 9590 /* nothing to do */; 9591 else { 9592 Py_DECREF(temp); 9593 PyErr_SetString(PyExc_TypeError, 9594 "%s argument has non-string str()"); 9595 goto onError; 9596 } 9597 } 9598 pbuf = PyUnicode_AS_UNICODE(temp); 9599 len 
= PyUnicode_GET_SIZE(temp); 9600 if (prec >= 0 && len > prec) 9601 len = prec; 9602 break; 9603 9604 case 'i': 9605 case 'd': 9606 case 'u': 9607 case 'o': 9608 case 'x': 9609 case 'X': 9610 if (c == 'i') 9611 c = 'd'; 9612 isnumok = 0; 9613 if (PyNumber_Check(v)) { 9614 PyObject *iobj=NULL; 9615 9616 if (PyLong_Check(v)) { 9617 iobj = v; 9618 Py_INCREF(iobj); 9619 } 9620 else { 9621 iobj = PyNumber_Long(v); 9622 } 9623 if (iobj!=NULL) { 9624 if (PyLong_Check(iobj)) { 9625 isnumok = 1; 9626 temp = formatlong(iobj, flags, prec, c); 9627 Py_DECREF(iobj); 9628 if (!temp) 9629 goto onError; 9630 pbuf = PyUnicode_AS_UNICODE(temp); 9631 len = PyUnicode_GET_SIZE(temp); 9632 sign = 1; 9633 } 9634 else { 9635 Py_DECREF(iobj); 9636 } 9637 } 9638 } 9639 if (!isnumok) { 9640 PyErr_Format(PyExc_TypeError, 9641 "%%%c format: a number is required, " 9642 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9643 goto onError; 9644 } 9645 if (flags & F_ZERO) 9646 fill = '0'; 9647 break; 9648 9649 case 'e': 9650 case 'E': 9651 case 'f': 9652 case 'F': 9653 case 'g': 9654 case 'G': 9655 temp = formatfloat(v, flags, prec, c); 9656 if (!temp) 9657 goto onError; 9658 pbuf = PyUnicode_AS_UNICODE(temp); 9659 len = PyUnicode_GET_SIZE(temp); 9660 sign = 1; 9661 if (flags & F_ZERO) 9662 fill = '0'; 9663 break; 9664 9665 case 'c': 9666 pbuf = formatbuf; 9667 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9668 if (len < 0) 9669 goto onError; 9670 break; 9671 9672 default: 9673 PyErr_Format(PyExc_ValueError, 9674 "unsupported format character '%c' (0x%x) " 9675 "at index %zd", 9676 (31<=c && c<=126) ? 
(char)c : '?', 9677 (int)c, 9678 (Py_ssize_t)(fmt - 1 - 9679 PyUnicode_AS_UNICODE(uformat))); 9680 goto onError; 9681 } 9682 if (sign) { 9683 if (*pbuf == '-' || *pbuf == '+') { 9684 sign = *pbuf++; 9685 len--; 9686 } 9687 else if (flags & F_SIGN) 9688 sign = '+'; 9689 else if (flags & F_BLANK) 9690 sign = ' '; 9691 else 9692 sign = 0; 9693 } 9694 if (width < len) 9695 width = len; 9696 if (rescnt - (sign != 0) < width) { 9697 reslen -= rescnt; 9698 rescnt = width + fmtcnt + 100; 9699 reslen += rescnt; 9700 if (reslen < 0) { 9701 Py_XDECREF(temp); 9702 PyErr_NoMemory(); 9703 goto onError; 9704 } 9705 if (_PyUnicode_Resize(&result, reslen) < 0) { 9706 Py_XDECREF(temp); 9707 goto onError; 9708 } 9709 res = PyUnicode_AS_UNICODE(result) 9710 + reslen - rescnt; 9711 } 9712 if (sign) { 9713 if (fill != ' ') 9714 *res++ = sign; 9715 rescnt--; 9716 if (width > len) 9717 width--; 9718 } 9719 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9720 assert(pbuf[0] == '0'); 9721 assert(pbuf[1] == c); 9722 if (fill != ' ') { 9723 *res++ = *pbuf++; 9724 *res++ = *pbuf++; 9725 } 9726 rescnt -= 2; 9727 width -= 2; 9728 if (width < 0) 9729 width = 0; 9730 len -= 2; 9731 } 9732 if (width > len && !(flags & F_LJUST)) { 9733 do { 9734 --rescnt; 9735 *res++ = fill; 9736 } while (--width > len); 9737 } 9738 if (fill == ' ') { 9739 if (sign) 9740 *res++ = sign; 9741 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9742 assert(pbuf[0] == '0'); 9743 assert(pbuf[1] == c); 9744 *res++ = *pbuf++; 9745 *res++ = *pbuf++; 9746 } 9747 } 9748 Py_UNICODE_COPY(res, pbuf, len); 9749 res += len; 9750 rescnt -= len; 9751 while (--width >= len) { 9752 --rescnt; 9753 *res++ = ' '; 9754 } 9755 if (dict && (argidx < arglen) && c != '%') { 9756 PyErr_SetString(PyExc_TypeError, 9757 "not all arguments converted during string formatting"); 9758 Py_XDECREF(temp); 9759 goto onError; 9760 } 9761 Py_XDECREF(temp); 9762 } /* '%' */ 9763 } /* until end */ 9764 if (argidx < arglen && !dict) { 
9765 PyErr_SetString(PyExc_TypeError, 9766 "not all arguments converted during string formatting"); 9767 goto onError; 9768 } 9769 9770 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9771 goto onError; 9772 if (args_owned) { 9773 Py_DECREF(args); 9774 } 9775 Py_DECREF(uformat); 9776 return (PyObject *)result; 9777 9778 onError: 9779 Py_XDECREF(result); 9780 Py_DECREF(uformat); 9781 if (args_owned) { 9782 Py_DECREF(args); 9783 } 9784 return NULL; 9785} 9786 9787static PyObject * 9788unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9789 9790static PyObject * 9791unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9792{ 9793 PyObject *x = NULL; 9794 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9795 char *encoding = NULL; 9796 char *errors = NULL; 9797 9798 if (type != &PyUnicode_Type) 9799 return unicode_subtype_new(type, args, kwds); 9800 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9801 kwlist, &x, &encoding, &errors)) 9802 return NULL; 9803 if (x == NULL) 9804 return (PyObject *)_PyUnicode_New(0); 9805 if (encoding == NULL && errors == NULL) 9806 return PyObject_Str(x); 9807 else 9808 return PyUnicode_FromEncodedObject(x, encoding, errors); 9809} 9810 9811static PyObject * 9812unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9813{ 9814 PyUnicodeObject *tmp, *pnew; 9815 Py_ssize_t n; 9816 9817 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9818 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9819 if (tmp == NULL) 9820 return NULL; 9821 assert(PyUnicode_Check(tmp)); 9822 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9823 if (pnew == NULL) { 9824 Py_DECREF(tmp); 9825 return NULL; 9826 } 9827 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9828 if (pnew->str == NULL) { 9829 _Py_ForgetReference((PyObject *)pnew); 9830 PyObject_Del(pnew); 9831 Py_DECREF(tmp); 9832 return PyErr_NoMemory(); 9833 } 9834 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9835 pnew->length = n; 9836 pnew->hash = tmp->hash; 9837 Py_DECREF(tmp); 9838 return (PyObject *)pnew; 9839} 9840 9841PyDoc_STRVAR(unicode_doc, 9842 "str(string[, encoding[, errors]]) -> str\n\ 9843\n\ 9844Create a new string object from the given encoded string.\n\ 9845encoding defaults to the current default string encoding.\n\ 9846errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9847 9848static PyObject *unicode_iter(PyObject *seq); 9849 9850PyTypeObject PyUnicode_Type = { 9851 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9852 "str", /* tp_name */ 9853 sizeof(PyUnicodeObject), /* tp_size */ 9854 0, /* tp_itemsize */ 9855 /* Slots */ 9856 (destructor)unicode_dealloc, /* tp_dealloc */ 9857 0, /* tp_print */ 9858 0, /* tp_getattr */ 9859 0, /* tp_setattr */ 9860 0, /* tp_reserved */ 9861 unicode_repr, /* tp_repr */ 9862 &unicode_as_number, /* tp_as_number */ 9863 &unicode_as_sequence, /* tp_as_sequence */ 9864 &unicode_as_mapping, /* tp_as_mapping */ 9865 (hashfunc) unicode_hash, /* tp_hash*/ 9866 0, /* tp_call*/ 9867 (reprfunc) unicode_str, /* tp_str */ 9868 PyObject_GenericGetAttr, /* tp_getattro */ 9869 0, /* tp_setattro */ 9870 0, /* tp_as_buffer */ 9871 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9872 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9873 unicode_doc, /* tp_doc */ 9874 0, /* tp_traverse */ 9875 0, /* tp_clear */ 9876 PyUnicode_RichCompare, /* tp_richcompare */ 9877 0, /* tp_weaklistoffset */ 9878 unicode_iter, /* tp_iter */ 9879 0, /* tp_iternext */ 9880 unicode_methods, /* tp_methods */ 9881 0, /* tp_members */ 9882 0, /* tp_getset */ 9883 &PyBaseObject_Type, /* tp_base */ 9884 0, /* tp_dict */ 9885 0, /* tp_descr_get */ 9886 0, /* tp_descr_set */ 9887 0, /* tp_dictoffset */ 9888 0, /* tp_init */ 9889 0, /* tp_alloc */ 9890 unicode_new, /* tp_new */ 9891 PyObject_Del, /* tp_free */ 9892}; 9893 9894/* Initialize the Unicode implementation */ 9895 9896void _PyUnicode_Init(void) 9897{ 9898 
int i; 9899 9900 /* XXX - move this array to unicodectype.c ? */ 9901 Py_UNICODE linebreak[] = { 9902 0x000A, /* LINE FEED */ 9903 0x000D, /* CARRIAGE RETURN */ 9904 0x001C, /* FILE SEPARATOR */ 9905 0x001D, /* GROUP SEPARATOR */ 9906 0x001E, /* RECORD SEPARATOR */ 9907 0x0085, /* NEXT LINE */ 9908 0x2028, /* LINE SEPARATOR */ 9909 0x2029, /* PARAGRAPH SEPARATOR */ 9910 }; 9911 9912 /* Init the implementation */ 9913 free_list = NULL; 9914 numfree = 0; 9915 unicode_empty = _PyUnicode_New(0); 9916 if (!unicode_empty) 9917 return; 9918 9919 for (i = 0; i < 256; i++) 9920 unicode_latin1[i] = NULL; 9921 if (PyType_Ready(&PyUnicode_Type) < 0) 9922 Py_FatalError("Can't initialize 'unicode'"); 9923 9924 /* initialize the linebreak bloom filter */ 9925 bloom_linebreak = make_bloom_mask( 9926 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9927 ); 9928 9929 PyType_Ready(&EncodingMapType); 9930} 9931 9932/* Finalize the Unicode implementation */ 9933 9934int 9935PyUnicode_ClearFreeList(void) 9936{ 9937 int freelist_size = numfree; 9938 PyUnicodeObject *u; 9939 9940 for (u = free_list; u != NULL;) { 9941 PyUnicodeObject *v = u; 9942 u = *(PyUnicodeObject **)u; 9943 if (v->str) 9944 PyObject_DEL(v->str); 9945 Py_XDECREF(v->defenc); 9946 PyObject_Del(v); 9947 numfree--; 9948 } 9949 free_list = NULL; 9950 assert(numfree == 0); 9951 return freelist_size; 9952} 9953 9954void 9955_PyUnicode_Fini(void) 9956{ 9957 int i; 9958 9959 Py_XDECREF(unicode_empty); 9960 unicode_empty = NULL; 9961 9962 for (i = 0; i < 256; i++) { 9963 if (unicode_latin1[i]) { 9964 Py_DECREF(unicode_latin1[i]); 9965 unicode_latin1[i] = NULL; 9966 } 9967 } 9968 (void)PyUnicode_ClearFreeList(); 9969} 9970 9971void 9972PyUnicode_InternInPlace(PyObject **p) 9973{ 9974 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9975 PyObject *t; 9976 if (s == NULL || !PyUnicode_Check(s)) 9977 Py_FatalError( 9978 "PyUnicode_InternInPlace: unicode strings only please!"); 9979 /* If it's a subclass, we don't really 
know what putting 9980 it in the interned dict might do. */ 9981 if (!PyUnicode_CheckExact(s)) 9982 return; 9983 if (PyUnicode_CHECK_INTERNED(s)) 9984 return; 9985 if (interned == NULL) { 9986 interned = PyDict_New(); 9987 if (interned == NULL) { 9988 PyErr_Clear(); /* Don't leave an exception */ 9989 return; 9990 } 9991 } 9992 /* It might be that the GetItem call fails even 9993 though the key is present in the dictionary, 9994 namely when this happens during a stack overflow. */ 9995 Py_ALLOW_RECURSION 9996 t = PyDict_GetItem(interned, (PyObject *)s); 9997 Py_END_ALLOW_RECURSION 9998 9999 if (t) { 10000 Py_INCREF(t); 10001 Py_DECREF(*p); 10002 *p = t; 10003 return; 10004 } 10005 10006 PyThreadState_GET()->recursion_critical = 1; 10007 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 10008 PyErr_Clear(); 10009 PyThreadState_GET()->recursion_critical = 0; 10010 return; 10011 } 10012 PyThreadState_GET()->recursion_critical = 0; 10013 /* The two references in interned are not counted by refcnt. 
10014 The deallocator will take care of this */ 10015 Py_REFCNT(s) -= 2; 10016 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 10017} 10018 10019void 10020PyUnicode_InternImmortal(PyObject **p) 10021{ 10022 PyUnicode_InternInPlace(p); 10023 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 10024 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 10025 Py_INCREF(*p); 10026 } 10027} 10028 10029PyObject * 10030PyUnicode_InternFromString(const char *cp) 10031{ 10032 PyObject *s = PyUnicode_FromString(cp); 10033 if (s == NULL) 10034 return NULL; 10035 PyUnicode_InternInPlace(&s); 10036 return s; 10037} 10038 10039void _Py_ReleaseInternedUnicodeStrings(void) 10040{ 10041 PyObject *keys; 10042 PyUnicodeObject *s; 10043 Py_ssize_t i, n; 10044 Py_ssize_t immortal_size = 0, mortal_size = 0; 10045 10046 if (interned == NULL || !PyDict_Check(interned)) 10047 return; 10048 keys = PyDict_Keys(interned); 10049 if (keys == NULL || !PyList_Check(keys)) { 10050 PyErr_Clear(); 10051 return; 10052 } 10053 10054 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 10055 detector, interned unicode strings are not forcibly deallocated; 10056 rather, we give them their stolen references back, and then clear 10057 and DECREF the interned dict. 
*/ 10058 10059 n = PyList_GET_SIZE(keys); 10060 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 10061 n); 10062 for (i = 0; i < n; i++) { 10063 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 10064 switch (s->state) { 10065 case SSTATE_NOT_INTERNED: 10066 /* XXX Shouldn't happen */ 10067 break; 10068 case SSTATE_INTERNED_IMMORTAL: 10069 Py_REFCNT(s) += 1; 10070 immortal_size += s->length; 10071 break; 10072 case SSTATE_INTERNED_MORTAL: 10073 Py_REFCNT(s) += 2; 10074 mortal_size += s->length; 10075 break; 10076 default: 10077 Py_FatalError("Inconsistent interned string state."); 10078 } 10079 s->state = SSTATE_NOT_INTERNED; 10080 } 10081 fprintf(stderr, "total size of all interned strings: " 10082 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 10083 "mortal/immortal\n", mortal_size, immortal_size); 10084 Py_DECREF(keys); 10085 PyDict_Clear(interned); 10086 Py_DECREF(interned); 10087 interned = NULL; 10088} 10089 10090 10091/********************* Unicode Iterator **************************/ 10092 10093typedef struct { 10094 PyObject_HEAD 10095 Py_ssize_t it_index; 10096 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 10097} unicodeiterobject; 10098 10099static void 10100unicodeiter_dealloc(unicodeiterobject *it) 10101{ 10102 _PyObject_GC_UNTRACK(it); 10103 Py_XDECREF(it->it_seq); 10104 PyObject_GC_Del(it); 10105} 10106 10107static int 10108unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 10109{ 10110 Py_VISIT(it->it_seq); 10111 return 0; 10112} 10113 10114static PyObject * 10115unicodeiter_next(unicodeiterobject *it) 10116{ 10117 PyUnicodeObject *seq; 10118 PyObject *item; 10119 10120 assert(it != NULL); 10121 seq = it->it_seq; 10122 if (seq == NULL) 10123 return NULL; 10124 assert(PyUnicode_Check(seq)); 10125 10126 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 10127 item = PyUnicode_FromUnicode( 10128 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 10129 if (item != NULL) 10130 ++it->it_index; 10131 
return item; 10132 } 10133 10134 Py_DECREF(seq); 10135 it->it_seq = NULL; 10136 return NULL; 10137} 10138 10139static PyObject * 10140unicodeiter_len(unicodeiterobject *it) 10141{ 10142 Py_ssize_t len = 0; 10143 if (it->it_seq) 10144 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 10145 return PyLong_FromSsize_t(len); 10146} 10147 10148PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 10149 10150static PyMethodDef unicodeiter_methods[] = { 10151 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 10152 length_hint_doc}, 10153 {NULL, NULL} /* sentinel */ 10154}; 10155 10156PyTypeObject PyUnicodeIter_Type = { 10157 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10158 "str_iterator", /* tp_name */ 10159 sizeof(unicodeiterobject), /* tp_basicsize */ 10160 0, /* tp_itemsize */ 10161 /* methods */ 10162 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 10163 0, /* tp_print */ 10164 0, /* tp_getattr */ 10165 0, /* tp_setattr */ 10166 0, /* tp_reserved */ 10167 0, /* tp_repr */ 10168 0, /* tp_as_number */ 10169 0, /* tp_as_sequence */ 10170 0, /* tp_as_mapping */ 10171 0, /* tp_hash */ 10172 0, /* tp_call */ 10173 0, /* tp_str */ 10174 PyObject_GenericGetAttr, /* tp_getattro */ 10175 0, /* tp_setattro */ 10176 0, /* tp_as_buffer */ 10177 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 10178 0, /* tp_doc */ 10179 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 10180 0, /* tp_clear */ 10181 0, /* tp_richcompare */ 10182 0, /* tp_weaklistoffset */ 10183 PyObject_SelfIter, /* tp_iter */ 10184 (iternextfunc)unicodeiter_next, /* tp_iternext */ 10185 unicodeiter_methods, /* tp_methods */ 10186 0, 10187}; 10188 10189static PyObject * 10190unicode_iter(PyObject *seq) 10191{ 10192 unicodeiterobject *it; 10193 10194 if (!PyUnicode_Check(seq)) { 10195 PyErr_BadInternalCall(); 10196 return NULL; 10197 } 10198 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 10199 if (it == NULL) 10200 return NULL; 10201 
it->it_index = 0; 10202 Py_INCREF(seq); 10203 it->it_seq = (PyUnicodeObject *)seq; 10204 _PyObject_GC_TRACK(it); 10205 return (PyObject *)it; 10206} 10207 10208size_t 10209Py_UNICODE_strlen(const Py_UNICODE *u) 10210{ 10211 int res = 0; 10212 while(*u++) 10213 res++; 10214 return res; 10215} 10216 10217Py_UNICODE* 10218Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 10219{ 10220 Py_UNICODE *u = s1; 10221 while ((*u++ = *s2++)); 10222 return s1; 10223} 10224 10225Py_UNICODE* 10226Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10227{ 10228 Py_UNICODE *u = s1; 10229 while ((*u++ = *s2++)) 10230 if (n-- == 0) 10231 break; 10232 return s1; 10233} 10234 10235Py_UNICODE* 10236Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 10237{ 10238 Py_UNICODE *u1 = s1; 10239 u1 += Py_UNICODE_strlen(u1); 10240 Py_UNICODE_strcpy(u1, s2); 10241 return s1; 10242} 10243 10244int 10245Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 10246{ 10247 while (*s1 && *s2 && *s1 == *s2) 10248 s1++, s2++; 10249 if (*s1 && *s2) 10250 return (*s1 < *s2) ? -1 : +1; 10251 if (*s1) 10252 return 1; 10253 if (*s2) 10254 return -1; 10255 return 0; 10256} 10257 10258int 10259Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10260{ 10261 register Py_UNICODE u1, u2; 10262 for (; n != 0; n--) { 10263 u1 = *s1; 10264 u2 = *s2; 10265 if (u1 != u2) 10266 return (u1 < u2) ? 
-1 : +1; 10267 if (u1 == '\0') 10268 return 0; 10269 s1++; 10270 s2++; 10271 } 10272 return 0; 10273} 10274 10275Py_UNICODE* 10276Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 10277{ 10278 const Py_UNICODE *p; 10279 for (p = s; *p; p++) 10280 if (*p == c) 10281 return (Py_UNICODE*)p; 10282 return NULL; 10283} 10284 10285Py_UNICODE* 10286Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10287{ 10288 const Py_UNICODE *p; 10289 p = s + Py_UNICODE_strlen(s); 10290 while (p != s) { 10291 p--; 10292 if (*p == c) 10293 return (Py_UNICODE*)p; 10294 } 10295 return NULL; 10296} 10297 10298Py_UNICODE* 10299PyUnicode_AsUnicodeCopy(PyObject *object) 10300{ 10301 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10302 Py_UNICODE *copy; 10303 Py_ssize_t size; 10304 10305 /* Ensure we won't overflow the size. */ 10306 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10307 PyErr_NoMemory(); 10308 return NULL; 10309 } 10310 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10311 size *= sizeof(Py_UNICODE); 10312 copy = PyMem_Malloc(size); 10313 if (copy == NULL) { 10314 PyErr_NoMemory(); 10315 return NULL; 10316 } 10317 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10318 return copy; 10319} 10320 10321/* A _string module, to export formatter_parser and formatter_field_name_split 10322 to the string.Formatter class implemented in Python. 
*/ 10323 10324static PyMethodDef _string_methods[] = { 10325 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 10326 METH_O, PyDoc_STR("split the argument as a field name")}, 10327 {"formatter_parser", (PyCFunction) formatter_parser, 10328 METH_O, PyDoc_STR("parse the argument as a format string")}, 10329 {NULL, NULL} 10330}; 10331 10332static struct PyModuleDef _string_module = { 10333 PyModuleDef_HEAD_INIT, 10334 "_string", 10335 PyDoc_STR("string helper module"), 10336 0, 10337 _string_methods, 10338 NULL, 10339 NULL, 10340 NULL, 10341 NULL 10342}; 10343 10344PyMODINIT_FUNC 10345PyInit__string(void) 10346{ 10347 return PyModule_Create(&_string_module); 10348} 10349 10350 10351#ifdef __cplusplus 10352} 10353#endif 10354