unicodeobject.c revision 168e117e0a8825bc3ae0c08f0b08a33ac351a14f
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44#include "bytes_methods.h" 45 46#include "unicodeobject.h" 47#include "ucnhash.h" 48 49#ifdef MS_WINDOWS 50#include <windows.h> 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* This dictionary holds all interned unicode strings. Note that references 97 to strings in this dictionary are *not* counted in the string's ob_refcnt. 98 When the interned string reaches a refcnt of 0 the string deallocation 99 function will delete the reference from this dictionary. 100 101 Another way to look at this is that to say that the actual reference 102 count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) 103*/ 104static PyObject *interned; 105 106/* Free list for Unicode objects */ 107static PyUnicodeObject *free_list; 108static int numfree; 109 110/* The empty Unicode object is shared to improve performance. */ 111static PyUnicodeObject *unicode_empty; 112 113/* Single character Unicode strings in the Latin-1 range are being 114 shared as well. */ 115static PyUnicodeObject *unicode_latin1[256]; 116 117/* Fast detection of the most frequent whitespace characters */ 118const unsigned char _Py_ascii_whitespace[] = { 119 0, 0, 0, 0, 0, 0, 0, 0, 120/* case 0x0009: * CHARACTER TABULATION */ 121/* case 0x000A: * LINE FEED */ 122/* case 0x000B: * LINE TABULATION */ 123/* case 0x000C: * FORM FEED */ 124/* case 0x000D: * CARRIAGE RETURN */ 125 0, 1, 1, 1, 1, 1, 0, 0, 126 0, 0, 0, 0, 0, 0, 0, 0, 127/* case 0x001C: * FILE SEPARATOR */ 128/* case 0x001D: * GROUP SEPARATOR */ 129/* case 0x001E: * RECORD SEPARATOR */ 130/* case 0x001F: * UNIT SEPARATOR */ 131 0, 0, 0, 0, 1, 1, 1, 1, 132/* case 0x0020: * SPACE */ 133 1, 0, 0, 0, 0, 0, 0, 0, 134 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 138 0, 0, 0, 0, 0, 0, 0, 0, 139 0, 0, 0, 0, 0, 0, 0, 0, 140 0, 0, 0, 0, 0, 0, 0, 0, 141 0, 0, 0, 0, 0, 0, 0, 0, 142 0, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0, 144 0, 0, 0, 0, 0, 0, 0, 0, 145 0, 0, 0, 0, 0, 0, 0, 0 146}; 147 148static PyObject *unicode_encode_call_errorhandler(const char *errors, 149 PyObject **errorHandler,const char *encoding, const char *reason, 150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 152 153static void raise_encode_exception(PyObject **exceptionObject, 154 const char *encoding, 155 const Py_UNICODE *unicode, Py_ssize_t size, 156 Py_ssize_t startpos, Py_ssize_t endpos, 157 const char *reason); 158 159/* Same for linebreaks */ 160static unsigned char ascii_linebreak[] = { 161 0, 0, 0, 0, 0, 0, 0, 0, 162/* 0x000A, * LINE FEED 
*/ 163/* 0x000B, * LINE TABULATION */ 164/* 0x000C, * FORM FEED */ 165/* 0x000D, * CARRIAGE RETURN */ 166 0, 0, 1, 1, 1, 1, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168/* 0x001C, * FILE SEPARATOR */ 169/* 0x001D, * GROUP SEPARATOR */ 170/* 0x001E, * RECORD SEPARATOR */ 171 0, 0, 0, 0, 1, 1, 1, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0, 0, 0, 0, 0, 183 0, 0, 0, 0, 0, 0, 0, 0, 184 0, 0, 0, 0, 0, 0, 0, 0 185}; 186 187 188Py_UNICODE 189PyUnicode_GetMax(void) 190{ 191#ifdef Py_UNICODE_WIDE 192 return 0x10FFFF; 193#else 194 /* This is actually an illegal character, so it should 195 not be passed to unichr. */ 196 return 0xFFFF; 197#endif 198} 199 200/* --- Bloom Filters ----------------------------------------------------- */ 201 202/* stuff to implement simple "bloom filters" for Unicode characters. 203 to keep things simple, we use a single bitmask, using the least 5 204 bits from each unicode characters as the bit index. */ 205 206/* the linebreak mask is set up by Unicode_Init below */ 207 208#if LONG_BIT >= 128 209#define BLOOM_WIDTH 128 210#elif LONG_BIT >= 64 211#define BLOOM_WIDTH 64 212#elif LONG_BIT >= 32 213#define BLOOM_WIDTH 32 214#else 215#error "LONG_BIT is smaller than 32" 216#endif 217 218#define BLOOM_MASK unsigned long 219 220static BLOOM_MASK bloom_linebreak; 221 222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 224 225#define BLOOM_LINEBREAK(ch) \ 226 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 228 229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 230{ 231 /* calculate simple bloom-style bitmask for a given unicode string */ 232 233 BLOOM_MASK mask; 234 Py_ssize_t i; 235 236 mask = 0; 237 for (i = 0; i < len; i++) 238 BLOOM_ADD(mask, ptr[i]); 239 240 return mask; 241} 242 243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 244{ 245 Py_ssize_t i; 246 247 for (i = 0; i < setlen; i++) 248 if (set[i] == chr) 249 return 1; 250 251 return 0; 252} 253 254#define BLOOM_MEMBER(mask, chr, set, setlen) \ 255 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 256 257/* --- Unicode Object ----------------------------------------------------- */ 258 259static 260int unicode_resize(register PyUnicodeObject *unicode, 261 Py_ssize_t length) 262{ 263 void *oldstr; 264 265 /* Shortcut if there's nothing much to do. */ 266 if (unicode->length == length) 267 goto reset; 268 269 /* Resizing shared object (unicode_empty or single character 270 objects) in-place is not allowed. Use PyUnicode_Resize() 271 instead ! */ 272 273 if (unicode == unicode_empty || 274 (unicode->length == 1 && 275 unicode->str[0] < 256U && 276 unicode_latin1[unicode->str[0]] == unicode)) { 277 PyErr_SetString(PyExc_SystemError, 278 "can't resize shared str objects"); 279 return -1; 280 } 281 282 /* We allocate one more byte to make sure the string is Ux0000 terminated. 283 The overallocation is also used by fastsearch, which assumes that it's 284 safe to look at str[length] (without making any assumptions about what 285 it contains). 
*/ 286 287 oldstr = unicode->str; 288 unicode->str = PyObject_REALLOC(unicode->str, 289 sizeof(Py_UNICODE) * (length + 1)); 290 if (!unicode->str) { 291 unicode->str = (Py_UNICODE *)oldstr; 292 PyErr_NoMemory(); 293 return -1; 294 } 295 unicode->str[length] = 0; 296 unicode->length = length; 297 298 reset: 299 /* Reset the object caches */ 300 if (unicode->defenc) { 301 Py_CLEAR(unicode->defenc); 302 } 303 unicode->hash = -1; 304 305 return 0; 306} 307 308/* We allocate one more byte to make sure the string is 309 Ux0000 terminated; some code (e.g. new_identifier) 310 relies on that. 311 312 XXX This allocator could further be enhanced by assuring that the 313 free list never reduces its size below 1. 314 315*/ 316 317static 318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 319{ 320 register PyUnicodeObject *unicode; 321 322 /* Optimization for empty strings */ 323 if (length == 0 && unicode_empty != NULL) { 324 Py_INCREF(unicode_empty); 325 return unicode_empty; 326 } 327 328 /* Ensure we won't overflow the size. */ 329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 330 return (PyUnicodeObject *)PyErr_NoMemory(); 331 } 332 333 /* Unicode freelist & memory allocation */ 334 if (free_list) { 335 unicode = free_list; 336 free_list = *(PyUnicodeObject **)unicode; 337 numfree--; 338 if (unicode->str) { 339 /* Keep-Alive optimization: we only upsize the buffer, 340 never downsize it. 
*/ 341 if ((unicode->length < length) && 342 unicode_resize(unicode, length) < 0) { 343 PyObject_DEL(unicode->str); 344 unicode->str = NULL; 345 } 346 } 347 else { 348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 350 } 351 PyObject_INIT(unicode, &PyUnicode_Type); 352 } 353 else { 354 size_t new_size; 355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 356 if (unicode == NULL) 357 return NULL; 358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); 360 } 361 362 if (!unicode->str) { 363 PyErr_NoMemory(); 364 goto onError; 365 } 366 /* Initialize the first element to guard against cases where 367 * the caller fails before initializing str -- unicode_resize() 368 * reads str[0], and the Keep-Alive optimization can keep memory 369 * allocated for str alive across a call to unicode_dealloc(unicode). 370 * We don't want unicode_resize to read uninitialized memory in 371 * that case. 
372 */ 373 unicode->str[0] = 0; 374 unicode->str[length] = 0; 375 unicode->length = length; 376 unicode->hash = -1; 377 unicode->state = 0; 378 unicode->defenc = NULL; 379 return unicode; 380 381 onError: 382 /* XXX UNREF/NEWREF interface should be more symmetrical */ 383 _Py_DEC_REFTOTAL; 384 _Py_ForgetReference((PyObject *)unicode); 385 PyObject_Del(unicode); 386 return NULL; 387} 388 389static 390void unicode_dealloc(register PyUnicodeObject *unicode) 391{ 392 switch (PyUnicode_CHECK_INTERNED(unicode)) { 393 case SSTATE_NOT_INTERNED: 394 break; 395 396 case SSTATE_INTERNED_MORTAL: 397 /* revive dead object temporarily for DelItem */ 398 Py_REFCNT(unicode) = 3; 399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 400 Py_FatalError( 401 "deletion of interned string failed"); 402 break; 403 404 case SSTATE_INTERNED_IMMORTAL: 405 Py_FatalError("Immortal interned string died."); 406 407 default: 408 Py_FatalError("Inconsistent interned string state."); 409 } 410 411 if (PyUnicode_CheckExact(unicode) && 412 numfree < PyUnicode_MAXFREELIST) { 413 /* Keep-Alive optimization */ 414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 415 PyObject_DEL(unicode->str); 416 unicode->str = NULL; 417 unicode->length = 0; 418 } 419 if (unicode->defenc) { 420 Py_CLEAR(unicode->defenc); 421 } 422 /* Add to free list */ 423 *(PyUnicodeObject **)unicode = free_list; 424 free_list = unicode; 425 numfree++; 426 } 427 else { 428 PyObject_DEL(unicode->str); 429 Py_XDECREF(unicode->defenc); 430 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 431 } 432} 433 434static 435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) 436{ 437 register PyUnicodeObject *v; 438 439 /* Argument checks */ 440 if (unicode == NULL) { 441 PyErr_BadInternalCall(); 442 return -1; 443 } 444 v = *unicode; 445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { 446 PyErr_BadInternalCall(); 447 return -1; 448 } 449 450 /* Resizing unicode_empty and single character objects is 
not 451 possible since these are being shared. We simply return a fresh 452 copy with the same Unicode content. */ 453 if (v->length != length && 454 (v == unicode_empty || v->length == 1)) { 455 PyUnicodeObject *w = _PyUnicode_New(length); 456 if (w == NULL) 457 return -1; 458 Py_UNICODE_COPY(w->str, v->str, 459 length < v->length ? length : v->length); 460 Py_DECREF(*unicode); 461 *unicode = w; 462 return 0; 463 } 464 465 /* Note that we don't have to modify *unicode for unshared Unicode 466 objects, since we can modify them in-place. */ 467 return unicode_resize(v, length); 468} 469 470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 471{ 472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); 473} 474 475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 476 Py_ssize_t size) 477{ 478 PyUnicodeObject *unicode; 479 480 /* If the Unicode data is known at construction time, we can apply 481 some optimizations which share commonly used objects. */ 482 if (u != NULL) { 483 484 /* Optimization for empty strings */ 485 if (size == 0 && unicode_empty != NULL) { 486 Py_INCREF(unicode_empty); 487 return (PyObject *)unicode_empty; 488 } 489 490 /* Single character Unicode objects in the Latin-1 range are 491 shared when using this constructor */ 492 if (size == 1 && *u < 256) { 493 unicode = unicode_latin1[*u]; 494 if (!unicode) { 495 unicode = _PyUnicode_New(1); 496 if (!unicode) 497 return NULL; 498 unicode->str[0] = *u; 499 unicode_latin1[*u] = unicode; 500 } 501 Py_INCREF(unicode); 502 return (PyObject *)unicode; 503 } 504 } 505 506 unicode = _PyUnicode_New(size); 507 if (!unicode) 508 return NULL; 509 510 /* Copy the Unicode data into the new object */ 511 if (u != NULL) 512 Py_UNICODE_COPY(unicode->str, u, size); 513 514 return (PyObject *)unicode; 515} 516 517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 518{ 519 PyUnicodeObject *unicode; 520 521 if (size < 0) { 522 PyErr_SetString(PyExc_SystemError, 523 "Negative size 
passed to PyUnicode_FromStringAndSize"); 524 return NULL; 525 } 526 527 /* If the Unicode data is known at construction time, we can apply 528 some optimizations which share commonly used objects. 529 Also, this means the input must be UTF-8, so fall back to the 530 UTF-8 decoder at the end. */ 531 if (u != NULL) { 532 533 /* Optimization for empty strings */ 534 if (size == 0 && unicode_empty != NULL) { 535 Py_INCREF(unicode_empty); 536 return (PyObject *)unicode_empty; 537 } 538 539 /* Single characters are shared when using this constructor. 540 Restrict to ASCII, since the input must be UTF-8. */ 541 if (size == 1 && Py_CHARMASK(*u) < 128) { 542 unicode = unicode_latin1[Py_CHARMASK(*u)]; 543 if (!unicode) { 544 unicode = _PyUnicode_New(1); 545 if (!unicode) 546 return NULL; 547 unicode->str[0] = Py_CHARMASK(*u); 548 unicode_latin1[Py_CHARMASK(*u)] = unicode; 549 } 550 Py_INCREF(unicode); 551 return (PyObject *)unicode; 552 } 553 554 return PyUnicode_DecodeUTF8(u, size, NULL); 555 } 556 557 unicode = _PyUnicode_New(size); 558 if (!unicode) 559 return NULL; 560 561 return (PyObject *)unicode; 562} 563 564PyObject *PyUnicode_FromString(const char *u) 565{ 566 size_t size = strlen(u); 567 if (size > PY_SSIZE_T_MAX) { 568 PyErr_SetString(PyExc_OverflowError, "input too long"); 569 return NULL; 570 } 571 572 return PyUnicode_FromStringAndSize(u, size); 573} 574 575#ifdef HAVE_WCHAR_H 576 577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 578# define CONVERT_WCHAR_TO_SURROGATES 579#endif 580 581#ifdef CONVERT_WCHAR_TO_SURROGATES 582 583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 584 to convert from UTF32 to UTF16. 
*/ 585 586PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 587 Py_ssize_t size) 588{ 589 PyUnicodeObject *unicode; 590 register Py_ssize_t i; 591 Py_ssize_t alloc; 592 const wchar_t *orig_w; 593 594 if (w == NULL) { 595 if (size == 0) 596 return PyUnicode_FromStringAndSize(NULL, 0); 597 PyErr_BadInternalCall(); 598 return NULL; 599 } 600 601 if (size == -1) { 602 size = wcslen(w); 603 } 604 605 alloc = size; 606 orig_w = w; 607 for (i = size; i > 0; i--) { 608 if (*w > 0xFFFF) 609 alloc++; 610 w++; 611 } 612 w = orig_w; 613 unicode = _PyUnicode_New(alloc); 614 if (!unicode) 615 return NULL; 616 617 /* Copy the wchar_t data into the new object */ 618 { 619 register Py_UNICODE *u; 620 u = PyUnicode_AS_UNICODE(unicode); 621 for (i = size; i > 0; i--) { 622 if (*w > 0xFFFF) { 623 wchar_t ordinal = *w++; 624 ordinal -= 0x10000; 625 *u++ = 0xD800 | (ordinal >> 10); 626 *u++ = 0xDC00 | (ordinal & 0x3FF); 627 } 628 else 629 *u++ = *w++; 630 } 631 } 632 return (PyObject *)unicode; 633} 634 635#else 636 637PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 638 Py_ssize_t size) 639{ 640 PyUnicodeObject *unicode; 641 642 if (w == NULL) { 643 if (size == 0) 644 return PyUnicode_FromStringAndSize(NULL, 0); 645 PyErr_BadInternalCall(); 646 return NULL; 647 } 648 649 if (size == -1) { 650 size = wcslen(w); 651 } 652 653 unicode = _PyUnicode_New(size); 654 if (!unicode) 655 return NULL; 656 657 /* Copy the wchar_t data into the new object */ 658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 659 memcpy(unicode->str, w, size * sizeof(wchar_t)); 660#else 661 { 662 register Py_UNICODE *u; 663 register Py_ssize_t i; 664 u = PyUnicode_AS_UNICODE(unicode); 665 for (i = size; i > 0; i--) 666 *u++ = *w++; 667 } 668#endif 669 670 return (PyObject *)unicode; 671} 672 673#endif /* CONVERT_WCHAR_TO_SURROGATES */ 674 675#undef CONVERT_WCHAR_TO_SURROGATES 676 677static void 678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 679 int zeropad, int width, int precision, 
char c) 680{ 681 *fmt++ = '%'; 682 if (width) { 683 if (zeropad) 684 *fmt++ = '0'; 685 fmt += sprintf(fmt, "%d", width); 686 } 687 if (precision) 688 fmt += sprintf(fmt, ".%d", precision); 689 if (longflag) 690 *fmt++ = 'l'; 691 else if (longlongflag) { 692 /* longlongflag should only ever be nonzero on machines with 693 HAVE_LONG_LONG defined */ 694#ifdef HAVE_LONG_LONG 695 char *f = PY_FORMAT_LONG_LONG; 696 while (*f) 697 *fmt++ = *f++; 698#else 699 /* we shouldn't ever get here */ 700 assert(0); 701 *fmt++ = 'l'; 702#endif 703 } 704 else if (size_tflag) { 705 char *f = PY_FORMAT_SIZE_T; 706 while (*f) 707 *fmt++ = *f++; 708 } 709 *fmt++ = c; 710 *fmt = '\0'; 711} 712 713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 714 715/* size of fixed-size buffer for formatting single arguments */ 716#define ITEM_BUFFER_LEN 21 717/* maximum number of characters required for output of %ld. 21 characters 718 allows for 64-bit integers (in decimal) and an optional sign. */ 719#define MAX_LONG_CHARS 21 720/* maximum number of characters required for output of %lld. 721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 724 725PyObject * 726PyUnicode_FromFormatV(const char *format, va_list vargs) 727{ 728 va_list count; 729 Py_ssize_t callcount = 0; 730 PyObject **callresults = NULL; 731 PyObject **callresult = NULL; 732 Py_ssize_t n = 0; 733 int width = 0; 734 int precision = 0; 735 int zeropad; 736 const char* f; 737 Py_UNICODE *s; 738 PyObject *string; 739 /* used by sprintf */ 740 char buffer[ITEM_BUFFER_LEN+1]; 741 /* use abuffer instead of buffer, if we need more space 742 * (which can happen if there's a format specifier with width). 
*/ 743 char *abuffer = NULL; 744 char *realbuffer; 745 Py_ssize_t abuffersize = 0; 746 char fmt[61]; /* should be enough for %0width.precisionlld */ 747 const char *copy; 748 749 Py_VA_COPY(count, vargs); 750 /* step 1: count the number of %S/%R/%A/%s format specifications 751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 753 * result in an array) */ 754 for (f = format; *f; f++) { 755 if (*f == '%') { 756 if (*(f+1)=='%') 757 continue; 758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') 759 ++callcount; 760 while (ISDIGIT((unsigned)*f)) 761 width = (width*10) + *f++ - '0'; 762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 763 ; 764 if (*f == 's') 765 ++callcount; 766 } 767 else if (128 <= (unsigned char)*f) { 768 PyErr_Format(PyExc_ValueError, 769 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 770 "string, got a non-ASCII byte: 0x%02x", 771 (unsigned char)*f); 772 return NULL; 773 } 774 } 775 /* step 2: allocate memory for the results of 776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 777 if (callcount) { 778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); 779 if (!callresults) { 780 PyErr_NoMemory(); 781 return NULL; 782 } 783 callresult = callresults; 784 } 785 /* step 3: figure out how large a buffer we need */ 786 for (f = format; *f; f++) { 787 if (*f == '%') { 788#ifdef HAVE_LONG_LONG 789 int longlongflag = 0; 790#endif 791 const char* p = f; 792 width = 0; 793 while (ISDIGIT((unsigned)*f)) 794 width = (width*10) + *f++ - '0'; 795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) 796 ; 797 798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 799 * they don't affect the amount of space we reserve. 
800 */ 801 if (*f == 'l') { 802 if (f[1] == 'd' || f[1] == 'u') { 803 ++f; 804 } 805#ifdef HAVE_LONG_LONG 806 else if (f[1] == 'l' && 807 (f[2] == 'd' || f[2] == 'u')) { 808 longlongflag = 1; 809 f += 2; 810 } 811#endif 812 } 813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 814 ++f; 815 } 816 817 switch (*f) { 818 case 'c': 819 (void)va_arg(count, int); 820 /* fall through... */ 821 case '%': 822 n++; 823 break; 824 case 'd': case 'u': case 'i': case 'x': 825 (void) va_arg(count, int); 826#ifdef HAVE_LONG_LONG 827 if (longlongflag) { 828 if (width < MAX_LONG_LONG_CHARS) 829 width = MAX_LONG_LONG_CHARS; 830 } 831 else 832#endif 833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 834 including sign. Decimal takes the most space. This 835 isn't enough for octal. If a width is specified we 836 need more (which we allocate later). */ 837 if (width < MAX_LONG_CHARS) 838 width = MAX_LONG_CHARS; 839 n += width; 840 /* XXX should allow for large precision here too. */ 841 if (abuffersize < width) 842 abuffersize = width; 843 break; 844 case 's': 845 { 846 /* UTF-8 */ 847 const char *s = va_arg(count, const char*); 848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 849 if (!str) 850 goto fail; 851 n += PyUnicode_GET_SIZE(str); 852 /* Remember the str and switch to the next slot */ 853 *callresult++ = str; 854 break; 855 } 856 case 'U': 857 { 858 PyObject *obj = va_arg(count, PyObject *); 859 assert(obj && PyUnicode_Check(obj)); 860 n += PyUnicode_GET_SIZE(obj); 861 break; 862 } 863 case 'V': 864 { 865 PyObject *obj = va_arg(count, PyObject *); 866 const char *str = va_arg(count, const char *); 867 assert(obj || str); 868 assert(!obj || PyUnicode_Check(obj)); 869 if (obj) 870 n += PyUnicode_GET_SIZE(obj); 871 else 872 n += strlen(str); 873 break; 874 } 875 case 'S': 876 { 877 PyObject *obj = va_arg(count, PyObject *); 878 PyObject *str; 879 assert(obj); 880 str = PyObject_Str(obj); 881 if (!str) 882 goto fail; 883 n += PyUnicode_GET_SIZE(str); 
884 /* Remember the str and switch to the next slot */ 885 *callresult++ = str; 886 break; 887 } 888 case 'R': 889 { 890 PyObject *obj = va_arg(count, PyObject *); 891 PyObject *repr; 892 assert(obj); 893 repr = PyObject_Repr(obj); 894 if (!repr) 895 goto fail; 896 n += PyUnicode_GET_SIZE(repr); 897 /* Remember the repr and switch to the next slot */ 898 *callresult++ = repr; 899 break; 900 } 901 case 'A': 902 { 903 PyObject *obj = va_arg(count, PyObject *); 904 PyObject *ascii; 905 assert(obj); 906 ascii = PyObject_ASCII(obj); 907 if (!ascii) 908 goto fail; 909 n += PyUnicode_GET_SIZE(ascii); 910 /* Remember the repr and switch to the next slot */ 911 *callresult++ = ascii; 912 break; 913 } 914 case 'p': 915 (void) va_arg(count, int); 916 /* maximum 64-bit pointer representation: 917 * 0xffffffffffffffff 918 * so 19 characters is enough. 919 * XXX I count 18 -- what's the extra for? 920 */ 921 n += 19; 922 break; 923 default: 924 /* if we stumble upon an unknown 925 formatting code, copy the rest of 926 the format string to the output 927 string. (we cannot just skip the 928 code, since there's no way to know 929 what's in the argument list) */ 930 n += strlen(p); 931 goto expand; 932 } 933 } else 934 n++; 935 } 936 expand: 937 if (abuffersize > ITEM_BUFFER_LEN) { 938 /* add 1 for sprintf's trailing null byte */ 939 abuffer = PyObject_Malloc(abuffersize + 1); 940 if (!abuffer) { 941 PyErr_NoMemory(); 942 goto fail; 943 } 944 realbuffer = abuffer; 945 } 946 else 947 realbuffer = buffer; 948 /* step 4: fill the buffer */ 949 /* Since we've analyzed how much space we need for the worst case, 950 we don't have to resize the string. 951 There can be no errors beyond this point. 
*/ 952 string = PyUnicode_FromUnicode(NULL, n); 953 if (!string) 954 goto fail; 955 956 s = PyUnicode_AS_UNICODE(string); 957 callresult = callresults; 958 959 for (f = format; *f; f++) { 960 if (*f == '%') { 961 const char* p = f++; 962 int longflag = 0; 963 int longlongflag = 0; 964 int size_tflag = 0; 965 zeropad = (*f == '0'); 966 /* parse the width.precision part */ 967 width = 0; 968 while (ISDIGIT((unsigned)*f)) 969 width = (width*10) + *f++ - '0'; 970 precision = 0; 971 if (*f == '.') { 972 f++; 973 while (ISDIGIT((unsigned)*f)) 974 precision = (precision*10) + *f++ - '0'; 975 } 976 /* Handle %ld, %lu, %lld and %llu. */ 977 if (*f == 'l') { 978 if (f[1] == 'd' || f[1] == 'u') { 979 longflag = 1; 980 ++f; 981 } 982#ifdef HAVE_LONG_LONG 983 else if (f[1] == 'l' && 984 (f[2] == 'd' || f[2] == 'u')) { 985 longlongflag = 1; 986 f += 2; 987 } 988#endif 989 } 990 /* handle the size_t flag. */ 991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 992 size_tflag = 1; 993 ++f; 994 } 995 996 switch (*f) { 997 case 'c': 998 *s++ = va_arg(vargs, int); 999 break; 1000 case 'd': 1001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1002 width, precision, 'd'); 1003 if (longflag) 1004 sprintf(realbuffer, fmt, va_arg(vargs, long)); 1005#ifdef HAVE_LONG_LONG 1006 else if (longlongflag) 1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); 1008#endif 1009 else if (size_tflag) 1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 1011 else 1012 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1013 appendstring(realbuffer); 1014 break; 1015 case 'u': 1016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1017 width, precision, 'u'); 1018 if (longflag) 1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 1020#ifdef HAVE_LONG_LONG 1021 else if (longlongflag) 1022 sprintf(realbuffer, fmt, va_arg(vargs, 1023 unsigned PY_LONG_LONG)); 1024#endif 1025 else if (size_tflag) 1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 1027 else 1028 
sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 1029 appendstring(realbuffer); 1030 break; 1031 case 'i': 1032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i'); 1033 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1034 appendstring(realbuffer); 1035 break; 1036 case 'x': 1037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1038 sprintf(realbuffer, fmt, va_arg(vargs, int)); 1039 appendstring(realbuffer); 1040 break; 1041 case 's': 1042 { 1043 /* unused, since we already have the result */ 1044 (void) va_arg(vargs, char *); 1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), 1046 PyUnicode_GET_SIZE(*callresult)); 1047 s += PyUnicode_GET_SIZE(*callresult); 1048 /* We're done with the unicode()/repr() => forget it */ 1049 Py_DECREF(*callresult); 1050 /* switch to next unicode()/repr() result */ 1051 ++callresult; 1052 break; 1053 } 1054 case 'U': 1055 { 1056 PyObject *obj = va_arg(vargs, PyObject *); 1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1059 s += size; 1060 break; 1061 } 1062 case 'V': 1063 { 1064 PyObject *obj = va_arg(vargs, PyObject *); 1065 const char *str = va_arg(vargs, const char *); 1066 if (obj) { 1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 1069 s += size; 1070 } else { 1071 appendstring(str); 1072 } 1073 break; 1074 } 1075 case 'S': 1076 case 'R': 1077 { 1078 Py_UNICODE *ucopy; 1079 Py_ssize_t usize; 1080 Py_ssize_t upos; 1081 /* unused, since we already have the result */ 1082 (void) va_arg(vargs, PyObject *); 1083 ucopy = PyUnicode_AS_UNICODE(*callresult); 1084 usize = PyUnicode_GET_SIZE(*callresult); 1085 for (upos = 0; upos<usize;) 1086 *s++ = ucopy[upos++]; 1087 /* We're done with the unicode()/repr() => forget it */ 1088 Py_DECREF(*callresult); 1089 /* switch to next unicode()/repr() result */ 1090 ++callresult; 1091 break; 1092 } 1093 case 'p': 1094 sprintf(buffer, "%p", va_arg(vargs, void*)); 1095 
/* %p is ill-defined: ensure leading 0x. */ 1096 if (buffer[1] == 'X') 1097 buffer[1] = 'x'; 1098 else if (buffer[1] != 'x') { 1099 memmove(buffer+2, buffer, strlen(buffer)+1); 1100 buffer[0] = '0'; 1101 buffer[1] = 'x'; 1102 } 1103 appendstring(buffer); 1104 break; 1105 case '%': 1106 *s++ = '%'; 1107 break; 1108 default: 1109 appendstring(p); 1110 goto end; 1111 } 1112 } 1113 else 1114 *s++ = *f; 1115 } 1116 1117 end: 1118 if (callresults) 1119 PyObject_Free(callresults); 1120 if (abuffer) 1121 PyObject_Free(abuffer); 1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 1123 return string; 1124 fail: 1125 if (callresults) { 1126 PyObject **callresult2 = callresults; 1127 while (callresult2 < callresult) { 1128 Py_DECREF(*callresult2); 1129 ++callresult2; 1130 } 1131 PyObject_Free(callresults); 1132 } 1133 if (abuffer) 1134 PyObject_Free(abuffer); 1135 return NULL; 1136} 1137 1138#undef appendstring 1139 1140PyObject * 1141PyUnicode_FromFormat(const char *format, ...) 1142{ 1143 PyObject* ret; 1144 va_list vargs; 1145 1146#ifdef HAVE_STDARG_PROTOTYPES 1147 va_start(vargs, format); 1148#else 1149 va_start(vargs); 1150#endif 1151 ret = PyUnicode_FromFormatV(format, vargs); 1152 va_end(vargs); 1153 return ret; 1154} 1155 1156/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 1157 convert a Unicode object to a wide character string. 1158 1159 - If w is NULL: return the number of wide characters (including the nul 1160 character) required to convert the unicode object. Ignore size argument. 1161 1162 - Otherwise: return the number of wide characters (excluding the nul 1163 character) written into w. Write at most size wide characters (including 1164 the nul character). 
*/ 1165static Py_ssize_t 1166unicode_aswidechar(PyUnicodeObject *unicode, 1167 wchar_t *w, 1168 Py_ssize_t size) 1169{ 1170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T 1171 Py_ssize_t res; 1172 if (w != NULL) { 1173 res = PyUnicode_GET_SIZE(unicode); 1174 if (size > res) 1175 size = res + 1; 1176 else 1177 res = size; 1178 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1179 return res; 1180 } 1181 else 1182 return PyUnicode_GET_SIZE(unicode) + 1; 1183#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 1184 register const Py_UNICODE *u; 1185 const Py_UNICODE *uend; 1186 const wchar_t *worig, *wend; 1187 Py_ssize_t nchar; 1188 1189 u = PyUnicode_AS_UNICODE(unicode); 1190 uend = u + PyUnicode_GET_SIZE(unicode); 1191 if (w != NULL) { 1192 worig = w; 1193 wend = w + size; 1194 while (u != uend && w != wend) { 1195 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1196 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1197 { 1198 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; 1199 u += 2; 1200 } 1201 else { 1202 *w = *u; 1203 u++; 1204 } 1205 w++; 1206 } 1207 if (w != wend) 1208 *w = L'\0'; 1209 return w - worig; 1210 } 1211 else { 1212 nchar = 1; /* nul character at the end */ 1213 while (u != uend) { 1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF 1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF) 1216 u += 2; 1217 else 1218 u++; 1219 nchar++; 1220 } 1221 } 1222 return nchar; 1223#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 1224 register Py_UNICODE *u, *uend, ordinal; 1225 register Py_ssize_t i; 1226 wchar_t *worig, *wend; 1227 Py_ssize_t nchar; 1228 1229 u = PyUnicode_AS_UNICODE(unicode); 1230 uend = u + PyUnicode_GET_SIZE(u); 1231 if (w != NULL) { 1232 worig = w; 1233 wend = w + size; 1234 while (u != uend && w != wend) { 1235 ordinal = *u; 1236 if (ordinal > 0xffff) { 1237 ordinal -= 0x10000; 1238 *w++ = 0xD800 | (ordinal >> 10); 1239 *w++ = 0xDC00 | (ordinal & 0x3FF); 1240 } 1241 else 1242 *w++ = ordinal; 1243 u++; 1244 } 1245 if (w != wend) 1246 *w = 0; 1247 return w - worig; 1248 } 1249 else { 
1250 nchar = 1; /* nul character */ 1251 while (u != uend) { 1252 if (*u > 0xffff) 1253 nchar += 2; 1254 else 1255 nchar++; 1256 u++; 1257 } 1258 return nchar; 1259 } 1260#else 1261# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" 1262#endif 1263} 1264 1265Py_ssize_t 1266PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1267 wchar_t *w, 1268 Py_ssize_t size) 1269{ 1270 if (unicode == NULL) { 1271 PyErr_BadInternalCall(); 1272 return -1; 1273 } 1274 return unicode_aswidechar(unicode, w, size); 1275} 1276 1277wchar_t* 1278PyUnicode_AsWideCharString(PyObject *unicode, 1279 Py_ssize_t *size) 1280{ 1281 wchar_t* buffer; 1282 Py_ssize_t buflen; 1283 1284 if (unicode == NULL) { 1285 PyErr_BadInternalCall(); 1286 return NULL; 1287 } 1288 1289 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 1290 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 1291 PyErr_NoMemory(); 1292 return NULL; 1293 } 1294 1295 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 1296 if (buffer == NULL) { 1297 PyErr_NoMemory(); 1298 return NULL; 1299 } 1300 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 1301 if (size != NULL) 1302 *size = buflen; 1303 return buffer; 1304} 1305 1306#endif 1307 1308PyObject *PyUnicode_FromOrdinal(int ordinal) 1309{ 1310 Py_UNICODE s[2]; 1311 1312 if (ordinal < 0 || ordinal > 0x10ffff) { 1313 PyErr_SetString(PyExc_ValueError, 1314 "chr() arg not in range(0x110000)"); 1315 return NULL; 1316 } 1317 1318#ifndef Py_UNICODE_WIDE 1319 if (ordinal > 0xffff) { 1320 ordinal -= 0x10000; 1321 s[0] = 0xD800 | (ordinal >> 10); 1322 s[1] = 0xDC00 | (ordinal & 0x3FF); 1323 return PyUnicode_FromUnicode(s, 2); 1324 } 1325#endif 1326 1327 s[0] = (Py_UNICODE)ordinal; 1328 return PyUnicode_FromUnicode(s, 1); 1329} 1330 1331PyObject *PyUnicode_FromObject(register PyObject *obj) 1332{ 1333 /* XXX Perhaps we should make this API an alias of 1334 PyObject_Str() instead ?! 
*/ 1335 if (PyUnicode_CheckExact(obj)) { 1336 Py_INCREF(obj); 1337 return obj; 1338 } 1339 if (PyUnicode_Check(obj)) { 1340 /* For a Unicode subtype that's not a Unicode object, 1341 return a true Unicode object with the same data. */ 1342 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1343 PyUnicode_GET_SIZE(obj)); 1344 } 1345 PyErr_Format(PyExc_TypeError, 1346 "Can't convert '%.100s' object to str implicitly", 1347 Py_TYPE(obj)->tp_name); 1348 return NULL; 1349} 1350 1351PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1352 const char *encoding, 1353 const char *errors) 1354{ 1355 Py_buffer buffer; 1356 PyObject *v; 1357 1358 if (obj == NULL) { 1359 PyErr_BadInternalCall(); 1360 return NULL; 1361 } 1362 1363 /* Decoding bytes objects is the most common case and should be fast */ 1364 if (PyBytes_Check(obj)) { 1365 if (PyBytes_GET_SIZE(obj) == 0) { 1366 Py_INCREF(unicode_empty); 1367 v = (PyObject *) unicode_empty; 1368 } 1369 else { 1370 v = PyUnicode_Decode( 1371 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 1372 encoding, errors); 1373 } 1374 return v; 1375 } 1376 1377 if (PyUnicode_Check(obj)) { 1378 PyErr_SetString(PyExc_TypeError, 1379 "decoding str is not supported"); 1380 return NULL; 1381 } 1382 1383 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 1384 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 1385 PyErr_Format(PyExc_TypeError, 1386 "coercing to str: need bytes, bytearray " 1387 "or buffer-like object, %.80s found", 1388 Py_TYPE(obj)->tp_name); 1389 return NULL; 1390 } 1391 1392 if (buffer.len == 0) { 1393 Py_INCREF(unicode_empty); 1394 v = (PyObject *) unicode_empty; 1395 } 1396 else 1397 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 1398 1399 PyBuffer_Release(&buffer); 1400 return v; 1401} 1402 1403/* Convert encoding to lower case and replace '_' with '-' in order to 1404 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 1405 1 on success. 
*/ 1406static int 1407normalize_encoding(const char *encoding, 1408 char *lower, 1409 size_t lower_len) 1410{ 1411 const char *e; 1412 char *l; 1413 char *l_end; 1414 1415 e = encoding; 1416 l = lower; 1417 l_end = &lower[lower_len - 1]; 1418 while (*e) { 1419 if (l == l_end) 1420 return 0; 1421 if (ISUPPER(*e)) { 1422 *l++ = TOLOWER(*e++); 1423 } 1424 else if (*e == '_') { 1425 *l++ = '-'; 1426 e++; 1427 } 1428 else { 1429 *l++ = *e++; 1430 } 1431 } 1432 *l = '\0'; 1433 return 1; 1434} 1435 1436PyObject *PyUnicode_Decode(const char *s, 1437 Py_ssize_t size, 1438 const char *encoding, 1439 const char *errors) 1440{ 1441 PyObject *buffer = NULL, *unicode; 1442 Py_buffer info; 1443 char lower[11]; /* Enough for any encoding shortcut */ 1444 1445 if (encoding == NULL) 1446 encoding = PyUnicode_GetDefaultEncoding(); 1447 1448 /* Shortcuts for common default encodings */ 1449 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1450 if (strcmp(lower, "utf-8") == 0) 1451 return PyUnicode_DecodeUTF8(s, size, errors); 1452 else if ((strcmp(lower, "latin-1") == 0) || 1453 (strcmp(lower, "iso-8859-1") == 0)) 1454 return PyUnicode_DecodeLatin1(s, size, errors); 1455#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1456 else if (strcmp(lower, "mbcs") == 0) 1457 return PyUnicode_DecodeMBCS(s, size, errors); 1458#endif 1459 else if (strcmp(lower, "ascii") == 0) 1460 return PyUnicode_DecodeASCII(s, size, errors); 1461 else if (strcmp(lower, "utf-16") == 0) 1462 return PyUnicode_DecodeUTF16(s, size, errors, 0); 1463 else if (strcmp(lower, "utf-32") == 0) 1464 return PyUnicode_DecodeUTF32(s, size, errors, 0); 1465 } 1466 1467 /* Decode via the codec registry */ 1468 buffer = NULL; 1469 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 1470 goto onError; 1471 buffer = PyMemoryView_FromBuffer(&info); 1472 if (buffer == NULL) 1473 goto onError; 1474 unicode = PyCodec_Decode(buffer, encoding, errors); 1475 if (unicode == NULL) 1476 goto onError; 1477 
if (!PyUnicode_Check(unicode)) { 1478 PyErr_Format(PyExc_TypeError, 1479 "decoder did not return a str object (type=%.400s)", 1480 Py_TYPE(unicode)->tp_name); 1481 Py_DECREF(unicode); 1482 goto onError; 1483 } 1484 Py_DECREF(buffer); 1485 return unicode; 1486 1487 onError: 1488 Py_XDECREF(buffer); 1489 return NULL; 1490} 1491 1492PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1493 const char *encoding, 1494 const char *errors) 1495{ 1496 PyObject *v; 1497 1498 if (!PyUnicode_Check(unicode)) { 1499 PyErr_BadArgument(); 1500 goto onError; 1501 } 1502 1503 if (encoding == NULL) 1504 encoding = PyUnicode_GetDefaultEncoding(); 1505 1506 /* Decode via the codec registry */ 1507 v = PyCodec_Decode(unicode, encoding, errors); 1508 if (v == NULL) 1509 goto onError; 1510 return v; 1511 1512 onError: 1513 return NULL; 1514} 1515 1516PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, 1517 const char *encoding, 1518 const char *errors) 1519{ 1520 PyObject *v; 1521 1522 if (!PyUnicode_Check(unicode)) { 1523 PyErr_BadArgument(); 1524 goto onError; 1525 } 1526 1527 if (encoding == NULL) 1528 encoding = PyUnicode_GetDefaultEncoding(); 1529 1530 /* Decode via the codec registry */ 1531 v = PyCodec_Decode(unicode, encoding, errors); 1532 if (v == NULL) 1533 goto onError; 1534 if (!PyUnicode_Check(v)) { 1535 PyErr_Format(PyExc_TypeError, 1536 "decoder did not return a str object (type=%.400s)", 1537 Py_TYPE(v)->tp_name); 1538 Py_DECREF(v); 1539 goto onError; 1540 } 1541 return v; 1542 1543 onError: 1544 return NULL; 1545} 1546 1547PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1548 Py_ssize_t size, 1549 const char *encoding, 1550 const char *errors) 1551{ 1552 PyObject *v, *unicode; 1553 1554 unicode = PyUnicode_FromUnicode(s, size); 1555 if (unicode == NULL) 1556 return NULL; 1557 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1558 Py_DECREF(unicode); 1559 return v; 1560} 1561 1562PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1563 const char 
*encoding, 1564 const char *errors) 1565{ 1566 PyObject *v; 1567 1568 if (!PyUnicode_Check(unicode)) { 1569 PyErr_BadArgument(); 1570 goto onError; 1571 } 1572 1573 if (encoding == NULL) 1574 encoding = PyUnicode_GetDefaultEncoding(); 1575 1576 /* Encode via the codec registry */ 1577 v = PyCodec_Encode(unicode, encoding, errors); 1578 if (v == NULL) 1579 goto onError; 1580 return v; 1581 1582 onError: 1583 return NULL; 1584} 1585 1586PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode) 1587{ 1588 if (Py_FileSystemDefaultEncoding) { 1589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1590 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) 1591 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1592 PyUnicode_GET_SIZE(unicode), 1593 NULL); 1594#endif 1595 return PyUnicode_AsEncodedString(unicode, 1596 Py_FileSystemDefaultEncoding, 1597 "surrogateescape"); 1598 } 1599 else { 1600 /* locale encoding with surrogateescape */ 1601 wchar_t *wchar; 1602 char *bytes; 1603 PyObject *bytes_obj; 1604 1605 wchar = PyUnicode_AsWideCharString(unicode, NULL); 1606 if (wchar == NULL) 1607 return NULL; 1608 bytes = _Py_wchar2char(wchar); 1609 PyMem_Free(wchar); 1610 if (bytes == NULL) 1611 return NULL; 1612 1613 bytes_obj = PyBytes_FromString(bytes); 1614 PyMem_Free(bytes); 1615 return bytes_obj; 1616 } 1617} 1618 1619PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1620 const char *encoding, 1621 const char *errors) 1622{ 1623 PyObject *v; 1624 char lower[11]; /* Enough for any encoding shortcut */ 1625 1626 if (!PyUnicode_Check(unicode)) { 1627 PyErr_BadArgument(); 1628 return NULL; 1629 } 1630 1631 if (encoding == NULL) 1632 encoding = PyUnicode_GetDefaultEncoding(); 1633 1634 /* Shortcuts for common default encodings */ 1635 if (normalize_encoding(encoding, lower, sizeof(lower))) { 1636 if (strcmp(lower, "utf-8") == 0) 1637 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1638 PyUnicode_GET_SIZE(unicode), 1639 errors); 1640 else if 
((strcmp(lower, "latin-1") == 0) || 1641 (strcmp(lower, "iso-8859-1") == 0)) 1642 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1643 PyUnicode_GET_SIZE(unicode), 1644 errors); 1645#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1646 else if (strcmp(lower, "mbcs") == 0) 1647 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 1648 PyUnicode_GET_SIZE(unicode), 1649 errors); 1650#endif 1651 else if (strcmp(lower, "ascii") == 0) 1652 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1653 PyUnicode_GET_SIZE(unicode), 1654 errors); 1655 } 1656 /* During bootstrap, we may need to find the encodings 1657 package, to load the file system encoding, and require the 1658 file system encoding in order to load the encodings 1659 package. 1660 1661 Break out of this dependency by assuming that the path to 1662 the encodings module is ASCII-only. XXX could try wcstombs 1663 instead, if the file system encoding is the locale's 1664 encoding. */ 1665 if (Py_FileSystemDefaultEncoding && 1666 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && 1667 !PyThreadState_GET()->interp->codecs_initialized) 1668 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1669 PyUnicode_GET_SIZE(unicode), 1670 errors); 1671 1672 /* Encode via the codec registry */ 1673 v = PyCodec_Encode(unicode, encoding, errors); 1674 if (v == NULL) 1675 return NULL; 1676 1677 /* The normal path */ 1678 if (PyBytes_Check(v)) 1679 return v; 1680 1681 /* If the codec returns a buffer, raise a warning and convert to bytes */ 1682 if (PyByteArray_Check(v)) { 1683 int error; 1684 PyObject *b; 1685 1686 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 1687 "encoder %s returned bytearray instead of bytes", 1688 encoding); 1689 if (error) { 1690 Py_DECREF(v); 1691 return NULL; 1692 } 1693 1694 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 1695 Py_DECREF(v); 1696 return b; 1697 } 1698 1699 PyErr_Format(PyExc_TypeError, 1700 "encoder did not return a 
bytes object (type=%.400s)", 1701 Py_TYPE(v)->tp_name); 1702 Py_DECREF(v); 1703 return NULL; 1704} 1705 1706PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, 1707 const char *encoding, 1708 const char *errors) 1709{ 1710 PyObject *v; 1711 1712 if (!PyUnicode_Check(unicode)) { 1713 PyErr_BadArgument(); 1714 goto onError; 1715 } 1716 1717 if (encoding == NULL) 1718 encoding = PyUnicode_GetDefaultEncoding(); 1719 1720 /* Encode via the codec registry */ 1721 v = PyCodec_Encode(unicode, encoding, errors); 1722 if (v == NULL) 1723 goto onError; 1724 if (!PyUnicode_Check(v)) { 1725 PyErr_Format(PyExc_TypeError, 1726 "encoder did not return an str object (type=%.400s)", 1727 Py_TYPE(v)->tp_name); 1728 Py_DECREF(v); 1729 goto onError; 1730 } 1731 return v; 1732 1733 onError: 1734 return NULL; 1735} 1736 1737PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1738 const char *errors) 1739{ 1740 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1741 if (v) 1742 return v; 1743 if (errors != NULL) 1744 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1745 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1746 PyUnicode_GET_SIZE(unicode), 1747 NULL); 1748 if (!v) 1749 return NULL; 1750 ((PyUnicodeObject *)unicode)->defenc = v; 1751 return v; 1752} 1753 1754PyObject* 1755PyUnicode_DecodeFSDefault(const char *s) { 1756 Py_ssize_t size = (Py_ssize_t)strlen(s); 1757 return PyUnicode_DecodeFSDefaultAndSize(s, size); 1758} 1759 1760PyObject* 1761PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 1762{ 1763 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding 1764 can be undefined. If it is case, decode using UTF-8. The following assumes 1765 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the 1766 bootstrapping process where the codecs aren't ready yet. 
1767 */ 1768 if (Py_FileSystemDefaultEncoding) { 1769#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1770 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { 1771 return PyUnicode_DecodeMBCS(s, size, NULL); 1772 } 1773#elif defined(__APPLE__) 1774 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { 1775 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 1776 } 1777#endif 1778 return PyUnicode_Decode(s, size, 1779 Py_FileSystemDefaultEncoding, 1780 "surrogateescape"); 1781 } 1782 else { 1783 /* locale encoding with surrogateescape */ 1784 wchar_t *wchar; 1785 PyObject *unicode; 1786 size_t len; 1787 1788 if (s[size] != '\0' || size != strlen(s)) { 1789 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1790 return NULL; 1791 } 1792 1793 wchar = _Py_char2wchar(s, &len); 1794 if (wchar == NULL) 1795 return NULL; 1796 1797 unicode = PyUnicode_FromWideChar(wchar, len); 1798 PyMem_Free(wchar); 1799 return unicode; 1800 } 1801} 1802 1803 1804int 1805PyUnicode_FSConverter(PyObject* arg, void* addr) 1806{ 1807 PyObject *output = NULL; 1808 Py_ssize_t size; 1809 void *data; 1810 if (arg == NULL) { 1811 Py_DECREF(*(PyObject**)addr); 1812 return 1; 1813 } 1814 if (PyBytes_Check(arg)) { 1815 output = arg; 1816 Py_INCREF(output); 1817 } 1818 else { 1819 arg = PyUnicode_FromObject(arg); 1820 if (!arg) 1821 return 0; 1822 output = PyUnicode_EncodeFSDefault(arg); 1823 Py_DECREF(arg); 1824 if (!output) 1825 return 0; 1826 if (!PyBytes_Check(output)) { 1827 Py_DECREF(output); 1828 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 1829 return 0; 1830 } 1831 } 1832 size = PyBytes_GET_SIZE(output); 1833 data = PyBytes_AS_STRING(output); 1834 if (size != strlen(data)) { 1835 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1836 Py_DECREF(output); 1837 return 0; 1838 } 1839 *(PyObject**)addr = output; 1840 return Py_CLEANUP_SUPPORTED; 1841} 1842 1843 1844int 1845PyUnicode_FSDecoder(PyObject* arg, void* addr) 1846{ 1847 
PyObject *output = NULL; 1848 Py_ssize_t size; 1849 void *data; 1850 if (arg == NULL) { 1851 Py_DECREF(*(PyObject**)addr); 1852 return 1; 1853 } 1854 if (PyUnicode_Check(arg)) { 1855 output = arg; 1856 Py_INCREF(output); 1857 } 1858 else { 1859 arg = PyBytes_FromObject(arg); 1860 if (!arg) 1861 return 0; 1862 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 1863 PyBytes_GET_SIZE(arg)); 1864 Py_DECREF(arg); 1865 if (!output) 1866 return 0; 1867 if (!PyUnicode_Check(output)) { 1868 Py_DECREF(output); 1869 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 1870 return 0; 1871 } 1872 } 1873 size = PyUnicode_GET_SIZE(output); 1874 data = PyUnicode_AS_UNICODE(output); 1875 if (size != Py_UNICODE_strlen(data)) { 1876 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 1877 Py_DECREF(output); 1878 return 0; 1879 } 1880 *(PyObject**)addr = output; 1881 return Py_CLEANUP_SUPPORTED; 1882} 1883 1884 1885char* 1886_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) 1887{ 1888 PyObject *bytes; 1889 if (!PyUnicode_Check(unicode)) { 1890 PyErr_BadArgument(); 1891 return NULL; 1892 } 1893 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1894 if (bytes == NULL) 1895 return NULL; 1896 if (psize != NULL) 1897 *psize = PyBytes_GET_SIZE(bytes); 1898 return PyBytes_AS_STRING(bytes); 1899} 1900 1901char* 1902_PyUnicode_AsString(PyObject *unicode) 1903{ 1904 return _PyUnicode_AsStringAndSize(unicode, NULL); 1905} 1906 1907Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1908{ 1909 if (!PyUnicode_Check(unicode)) { 1910 PyErr_BadArgument(); 1911 goto onError; 1912 } 1913 return PyUnicode_AS_UNICODE(unicode); 1914 1915 onError: 1916 return NULL; 1917} 1918 1919Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1920{ 1921 if (!PyUnicode_Check(unicode)) { 1922 PyErr_BadArgument(); 1923 goto onError; 1924 } 1925 return PyUnicode_GET_SIZE(unicode); 1926 1927 onError: 1928 return -1; 1929} 1930 1931const char 
*PyUnicode_GetDefaultEncoding(void) 1932{ 1933 return "utf-8"; 1934} 1935 1936/* create or adjust a UnicodeDecodeError */ 1937static void 1938make_decode_exception(PyObject **exceptionObject, 1939 const char *encoding, 1940 const char *input, Py_ssize_t length, 1941 Py_ssize_t startpos, Py_ssize_t endpos, 1942 const char *reason) 1943{ 1944 if (*exceptionObject == NULL) { 1945 *exceptionObject = PyUnicodeDecodeError_Create( 1946 encoding, input, length, startpos, endpos, reason); 1947 } 1948 else { 1949 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 1950 goto onError; 1951 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 1952 goto onError; 1953 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1954 goto onError; 1955 } 1956 return; 1957 1958onError: 1959 Py_DECREF(*exceptionObject); 1960 *exceptionObject = NULL; 1961} 1962 1963/* error handling callback helper: 1964 build arguments, call the callback and check the arguments, 1965 if no exception occurred, copy the replacement to the output 1966 and adjust various state variables. 
1967 return 0 on success, -1 on error 1968*/ 1969 1970static 1971int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1972 const char *encoding, const char *reason, 1973 const char **input, const char **inend, Py_ssize_t *startinpos, 1974 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1975 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1976{ 1977 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 1978 1979 PyObject *restuple = NULL; 1980 PyObject *repunicode = NULL; 1981 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1982 Py_ssize_t insize; 1983 Py_ssize_t requiredsize; 1984 Py_ssize_t newpos; 1985 Py_UNICODE *repptr; 1986 PyObject *inputobj = NULL; 1987 Py_ssize_t repsize; 1988 int res = -1; 1989 1990 if (*errorHandler == NULL) { 1991 *errorHandler = PyCodec_LookupError(errors); 1992 if (*errorHandler == NULL) 1993 goto onError; 1994 } 1995 1996 make_decode_exception(exceptionObject, 1997 encoding, 1998 *input, *inend - *input, 1999 *startinpos, *endinpos, 2000 reason); 2001 if (*exceptionObject == NULL) 2002 goto onError; 2003 2004 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 2005 if (restuple == NULL) 2006 goto onError; 2007 if (!PyTuple_Check(restuple)) { 2008 PyErr_SetString(PyExc_TypeError, &argparse[4]); 2009 goto onError; 2010 } 2011 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 2012 goto onError; 2013 2014 /* Copy back the bytes variables, which might have been modified by the 2015 callback */ 2016 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 2017 if (!inputobj) 2018 goto onError; 2019 if (!PyBytes_Check(inputobj)) { 2020 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 2021 } 2022 *input = PyBytes_AS_STRING(inputobj); 2023 insize = PyBytes_GET_SIZE(inputobj); 2024 *inend = *input + insize; 2025 /* we can DECREF safely, as the exception has 
another reference, 2026 so the object won't go away. */ 2027 Py_DECREF(inputobj); 2028 2029 if (newpos<0) 2030 newpos = insize+newpos; 2031 if (newpos<0 || newpos>insize) { 2032 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 2033 goto onError; 2034 } 2035 2036 /* need more space? (at least enough for what we 2037 have+the replacement+the rest of the string (starting 2038 at the new input position), so we won't have to check space 2039 when there are no errors in the rest of the string) */ 2040 repptr = PyUnicode_AS_UNICODE(repunicode); 2041 repsize = PyUnicode_GET_SIZE(repunicode); 2042 requiredsize = *outpos + repsize + insize-newpos; 2043 if (requiredsize > outsize) { 2044 if (requiredsize<2*outsize) 2045 requiredsize = 2*outsize; 2046 if (_PyUnicode_Resize(output, requiredsize) < 0) 2047 goto onError; 2048 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 2049 } 2050 *endinpos = newpos; 2051 *inptr = *input + newpos; 2052 Py_UNICODE_COPY(*outptr, repptr, repsize); 2053 *outptr += repsize; 2054 *outpos += repsize; 2055 2056 /* we made it! */ 2057 res = 0; 2058 2059 onError: 2060 Py_XDECREF(restuple); 2061 return res; 2062} 2063 2064/* --- UTF-7 Codec -------------------------------------------------------- */ 2065 2066/* See RFC2152 for details. We encode conservatively and decode liberally. */ 2067 2068/* Three simple macros defining base-64. */ 2069 2070/* Is c a base-64 character? */ 2071 2072#define IS_BASE64(c) \ 2073 (((c) >= 'A' && (c) <= 'Z') || \ 2074 ((c) >= 'a' && (c) <= 'z') || \ 2075 ((c) >= '0' && (c) <= '9') || \ 2076 (c) == '+' || (c) == '/') 2077 2078/* given that c is a base-64 character, what is its base-64 value? */ 2079 2080#define FROM_BASE64(c) \ 2081 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 2082 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 2083 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 2084 (c) == '+' ? 62 : 63) 2085 2086/* What is the base-64 character of the bottom 6 bits of n? 
*/ 2087 2088#define TO_BASE64(n) \ 2089 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 2090 2091/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 2092 * decoded as itself. We are permissive on decoding; the only ASCII 2093 * byte not decoding to itself is the + which begins a base64 2094 * string. */ 2095 2096#define DECODE_DIRECT(c) \ 2097 ((c) <= 127 && (c) != '+') 2098 2099/* The UTF-7 encoder treats ASCII characters differently according to 2100 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 2101 * the above). See RFC2152. This array identifies these different 2102 * sets: 2103 * 0 : "Set D" 2104 * alphanumeric and '(),-./:? 2105 * 1 : "Set O" 2106 * !"#$%&*;<=>@[]^_`{|} 2107 * 2 : "whitespace" 2108 * ht nl cr sp 2109 * 3 : special (must be base64 encoded) 2110 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 2111 */ 2112 2113static 2114char utf7_category[128] = { 2115/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 2116 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2117/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 2118 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2119/* sp ! " # $ % & ' ( ) * + , - . / */ 2120 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 2121/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 2122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2123/* @ A B C D E F G H I J K L M N O */ 2124 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2125/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 2126 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 2127/* ` a b c d e f g h i j k l m n o */ 2128 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2129/* p q r s t u v w x y z { | } ~ del */ 2130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 2131}; 2132 2133/* ENCODE_DIRECT: this character should be encoded as itself. 
The 2134 * answer depends on whether we are encoding set O as itself, and also 2135 * on whether we are encoding whitespace as itself. RFC2152 makes it 2136 * clear that the answers to these questions vary between 2137 * applications, so this code needs to be flexible. */ 2138 2139#define ENCODE_DIRECT(c, directO, directWS) \ 2140 ((c) < 128 && (c) > 0 && \ 2141 ((utf7_category[(c)] == 0) || \ 2142 (directWS && (utf7_category[(c)] == 2)) || \ 2143 (directO && (utf7_category[(c)] == 1)))) 2144 2145PyObject *PyUnicode_DecodeUTF7(const char *s, 2146 Py_ssize_t size, 2147 const char *errors) 2148{ 2149 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 2150} 2151 2152/* The decoder. The only state we preserve is our read position, 2153 * i.e. how many characters we have consumed. So if we end in the 2154 * middle of a shift sequence we have to back off the read position 2155 * and the output to the beginning of the sequence, otherwise we lose 2156 * all the shift state (seen bits, number of bits seen, high 2157 * surrogate). 
*/ 2158 2159PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 2160 Py_ssize_t size, 2161 const char *errors, 2162 Py_ssize_t *consumed) 2163{ 2164 const char *starts = s; 2165 Py_ssize_t startinpos; 2166 Py_ssize_t endinpos; 2167 Py_ssize_t outpos; 2168 const char *e; 2169 PyUnicodeObject *unicode; 2170 Py_UNICODE *p; 2171 const char *errmsg = ""; 2172 int inShift = 0; 2173 Py_UNICODE *shiftOutStart; 2174 unsigned int base64bits = 0; 2175 unsigned long base64buffer = 0; 2176 Py_UNICODE surrogate = 0; 2177 PyObject *errorHandler = NULL; 2178 PyObject *exc = NULL; 2179 2180 unicode = _PyUnicode_New(size); 2181 if (!unicode) 2182 return NULL; 2183 if (size == 0) { 2184 if (consumed) 2185 *consumed = 0; 2186 return (PyObject *)unicode; 2187 } 2188 2189 p = unicode->str; 2190 shiftOutStart = p; 2191 e = s + size; 2192 2193 while (s < e) { 2194 Py_UNICODE ch; 2195 restart: 2196 ch = (unsigned char) *s; 2197 2198 if (inShift) { /* in a base-64 section */ 2199 if (IS_BASE64(ch)) { /* consume a base-64 character */ 2200 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 2201 base64bits += 6; 2202 s++; 2203 if (base64bits >= 16) { 2204 /* we have enough bits for a UTF-16 value */ 2205 Py_UNICODE outCh = (Py_UNICODE) 2206 (base64buffer >> (base64bits-16)); 2207 base64bits -= 16; 2208 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 2209 if (surrogate) { 2210 /* expecting a second surrogate */ 2211 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2212#ifdef Py_UNICODE_WIDE 2213 *p++ = (((surrogate & 0x3FF)<<10) 2214 | (outCh & 0x3FF)) + 0x10000; 2215#else 2216 *p++ = surrogate; 2217 *p++ = outCh; 2218#endif 2219 surrogate = 0; 2220 } 2221 else { 2222 surrogate = 0; 2223 errmsg = "second surrogate missing"; 2224 goto utf7Error; 2225 } 2226 } 2227 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 2228 /* first surrogate */ 2229 surrogate = outCh; 2230 } 2231 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 2232 errmsg = "unexpected second surrogate"; 2233 goto utf7Error; 
2234 } 2235 else { 2236 *p++ = outCh; 2237 } 2238 } 2239 } 2240 else { /* now leaving a base-64 section */ 2241 inShift = 0; 2242 s++; 2243 if (surrogate) { 2244 errmsg = "second surrogate missing at end of shift sequence"; 2245 goto utf7Error; 2246 } 2247 if (base64bits > 0) { /* left-over bits */ 2248 if (base64bits >= 6) { 2249 /* We've seen at least one base-64 character */ 2250 errmsg = "partial character in shift sequence"; 2251 goto utf7Error; 2252 } 2253 else { 2254 /* Some bits remain; they should be zero */ 2255 if (base64buffer != 0) { 2256 errmsg = "non-zero padding bits in shift sequence"; 2257 goto utf7Error; 2258 } 2259 } 2260 } 2261 if (ch != '-') { 2262 /* '-' is absorbed; other terminating 2263 characters are preserved */ 2264 *p++ = ch; 2265 } 2266 } 2267 } 2268 else if ( ch == '+' ) { 2269 startinpos = s-starts; 2270 s++; /* consume '+' */ 2271 if (s < e && *s == '-') { /* '+-' encodes '+' */ 2272 s++; 2273 *p++ = '+'; 2274 } 2275 else { /* begin base64-encoded section */ 2276 inShift = 1; 2277 shiftOutStart = p; 2278 base64bits = 0; 2279 } 2280 } 2281 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 2282 *p++ = ch; 2283 s++; 2284 } 2285 else { 2286 startinpos = s-starts; 2287 s++; 2288 errmsg = "unexpected special character"; 2289 goto utf7Error; 2290 } 2291 continue; 2292utf7Error: 2293 outpos = p-PyUnicode_AS_UNICODE(unicode); 2294 endinpos = s-starts; 2295 if (unicode_decode_call_errorhandler( 2296 errors, &errorHandler, 2297 "utf7", errmsg, 2298 &starts, &e, &startinpos, &endinpos, &exc, &s, 2299 &unicode, &outpos, &p)) 2300 goto onError; 2301 } 2302 2303 /* end of string */ 2304 2305 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 2306 /* if we're in an inconsistent state, that's an error */ 2307 if (surrogate || 2308 (base64bits >= 6) || 2309 (base64bits > 0 && base64buffer != 0)) { 2310 outpos = p-PyUnicode_AS_UNICODE(unicode); 2311 endinpos = size; 2312 if (unicode_decode_call_errorhandler( 2313 
errors, &errorHandler, 2314 "utf7", "unterminated shift sequence", 2315 &starts, &e, &startinpos, &endinpos, &exc, &s, 2316 &unicode, &outpos, &p)) 2317 goto onError; 2318 if (s < e) 2319 goto restart; 2320 } 2321 } 2322 2323 /* return state */ 2324 if (consumed) { 2325 if (inShift) { 2326 p = shiftOutStart; /* back off output */ 2327 *consumed = startinpos; 2328 } 2329 else { 2330 *consumed = s-starts; 2331 } 2332 } 2333 2334 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 2335 goto onError; 2336 2337 Py_XDECREF(errorHandler); 2338 Py_XDECREF(exc); 2339 return (PyObject *)unicode; 2340 2341 onError: 2342 Py_XDECREF(errorHandler); 2343 Py_XDECREF(exc); 2344 Py_DECREF(unicode); 2345 return NULL; 2346} 2347 2348 2349PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 2350 Py_ssize_t size, 2351 int base64SetO, 2352 int base64WhiteSpace, 2353 const char *errors) 2354{ 2355 PyObject *v; 2356 /* It might be possible to tighten this worst case */ 2357 Py_ssize_t allocated = 8 * size; 2358 int inShift = 0; 2359 Py_ssize_t i = 0; 2360 unsigned int base64bits = 0; 2361 unsigned long base64buffer = 0; 2362 char * out; 2363 char * start; 2364 2365 if (size == 0) 2366 return PyBytes_FromStringAndSize(NULL, 0); 2367 2368 if (allocated / 8 != size) 2369 return PyErr_NoMemory(); 2370 2371 v = PyBytes_FromStringAndSize(NULL, allocated); 2372 if (v == NULL) 2373 return NULL; 2374 2375 start = out = PyBytes_AS_STRING(v); 2376 for (;i < size; ++i) { 2377 Py_UNICODE ch = s[i]; 2378 2379 if (inShift) { 2380 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2381 /* shifting out */ 2382 if (base64bits) { /* output remaining bits */ 2383 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 2384 base64buffer = 0; 2385 base64bits = 0; 2386 } 2387 inShift = 0; 2388 /* Characters not in the BASE64 set implicitly unshift the sequence 2389 so no '-' is required, except if the character is itself a '-' */ 2390 if (IS_BASE64(ch) || ch == '-') { 2391 *out++ = '-'; 2392 
} 2393 *out++ = (char) ch; 2394 } 2395 else { 2396 goto encode_char; 2397 } 2398 } 2399 else { /* not in a shift sequence */ 2400 if (ch == '+') { 2401 *out++ = '+'; 2402 *out++ = '-'; 2403 } 2404 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 2405 *out++ = (char) ch; 2406 } 2407 else { 2408 *out++ = '+'; 2409 inShift = 1; 2410 goto encode_char; 2411 } 2412 } 2413 continue; 2414encode_char: 2415#ifdef Py_UNICODE_WIDE 2416 if (ch >= 0x10000) { 2417 /* code first surrogate */ 2418 base64bits += 16; 2419 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 2420 while (base64bits >= 6) { 2421 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2422 base64bits -= 6; 2423 } 2424 /* prepare second surrogate */ 2425 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 2426 } 2427#endif 2428 base64bits += 16; 2429 base64buffer = (base64buffer << 16) | ch; 2430 while (base64bits >= 6) { 2431 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 2432 base64bits -= 6; 2433 } 2434 } 2435 if (base64bits) 2436 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 2437 if (inShift) 2438 *out++ = '-'; 2439 if (_PyBytes_Resize(&v, out - start) < 0) 2440 return NULL; 2441 return v; 2442} 2443 2444#undef IS_BASE64 2445#undef FROM_BASE64 2446#undef TO_BASE64 2447#undef DECODE_DIRECT 2448#undef ENCODE_DIRECT 2449 2450/* --- UTF-8 Codec -------------------------------------------------------- */ 2451 2452static 2453char utf8_code_length[256] = { 2454 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 2455 illegal prefix. 
See RFC 3629 for details */ 2456 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 2457 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2458 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2459 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2460 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2461 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2462 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2463 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 2464 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 2465 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2466 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 2468 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 2469 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 2470 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 2471 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 2472}; 2473 2474PyObject *PyUnicode_DecodeUTF8(const char *s, 2475 Py_ssize_t size, 2476 const char *errors) 2477{ 2478 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2479} 2480 2481/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 2482#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 2483 2484/* Mask to quickly check whether a C 'long' contains a 2485 non-ASCII, UTF8-encoded char. */ 2486#if (SIZEOF_LONG == 8) 2487# define ASCII_CHAR_MASK 0x8080808080808080L 2488#elif (SIZEOF_LONG == 4) 2489# define ASCII_CHAR_MASK 0x80808080L 2490#else 2491# error C 'long' size should be either 4 or 8! 
2492#endif 2493 2494PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 2495 Py_ssize_t size, 2496 const char *errors, 2497 Py_ssize_t *consumed) 2498{ 2499 const char *starts = s; 2500 int n; 2501 int k; 2502 Py_ssize_t startinpos; 2503 Py_ssize_t endinpos; 2504 Py_ssize_t outpos; 2505 const char *e, *aligned_end; 2506 PyUnicodeObject *unicode; 2507 Py_UNICODE *p; 2508 const char *errmsg = ""; 2509 PyObject *errorHandler = NULL; 2510 PyObject *exc = NULL; 2511 2512 /* Note: size will always be longer than the resulting Unicode 2513 character count */ 2514 unicode = _PyUnicode_New(size); 2515 if (!unicode) 2516 return NULL; 2517 if (size == 0) { 2518 if (consumed) 2519 *consumed = 0; 2520 return (PyObject *)unicode; 2521 } 2522 2523 /* Unpack UTF-8 encoded data */ 2524 p = unicode->str; 2525 e = s + size; 2526 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2527 2528 while (s < e) { 2529 Py_UCS4 ch = (unsigned char)*s; 2530 2531 if (ch < 0x80) { 2532 /* Fast path for runs of ASCII characters. Given that common UTF-8 2533 input will consist of an overwhelming majority of ASCII 2534 characters, we try to optimize for this case by checking 2535 as many characters as a C 'long' can contain. 2536 First, check if we can do an aligned read, as most CPUs have 2537 a penalty for unaligned reads. 2538 */ 2539 if (!((size_t) s & LONG_PTR_MASK)) { 2540 /* Help register allocation */ 2541 register const char *_s = s; 2542 register Py_UNICODE *_p = p; 2543 while (_s < aligned_end) { 2544 /* Read a whole long at a time (either 4 or 8 bytes), 2545 and do a fast unrolled copy if it only contains ASCII 2546 characters. 
*/ 2547 unsigned long data = *(unsigned long *) _s; 2548 if (data & ASCII_CHAR_MASK) 2549 break; 2550 _p[0] = (unsigned char) _s[0]; 2551 _p[1] = (unsigned char) _s[1]; 2552 _p[2] = (unsigned char) _s[2]; 2553 _p[3] = (unsigned char) _s[3]; 2554#if (SIZEOF_LONG == 8) 2555 _p[4] = (unsigned char) _s[4]; 2556 _p[5] = (unsigned char) _s[5]; 2557 _p[6] = (unsigned char) _s[6]; 2558 _p[7] = (unsigned char) _s[7]; 2559#endif 2560 _s += SIZEOF_LONG; 2561 _p += SIZEOF_LONG; 2562 } 2563 s = _s; 2564 p = _p; 2565 if (s == e) 2566 break; 2567 ch = (unsigned char)*s; 2568 } 2569 } 2570 2571 if (ch < 0x80) { 2572 *p++ = (Py_UNICODE)ch; 2573 s++; 2574 continue; 2575 } 2576 2577 n = utf8_code_length[ch]; 2578 2579 if (s + n > e) { 2580 if (consumed) 2581 break; 2582 else { 2583 errmsg = "unexpected end of data"; 2584 startinpos = s-starts; 2585 endinpos = startinpos+1; 2586 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 2587 endinpos++; 2588 goto utf8Error; 2589 } 2590 } 2591 2592 switch (n) { 2593 2594 case 0: 2595 errmsg = "invalid start byte"; 2596 startinpos = s-starts; 2597 endinpos = startinpos+1; 2598 goto utf8Error; 2599 2600 case 1: 2601 errmsg = "internal error"; 2602 startinpos = s-starts; 2603 endinpos = startinpos+1; 2604 goto utf8Error; 2605 2606 case 2: 2607 if ((s[1] & 0xc0) != 0x80) { 2608 errmsg = "invalid continuation byte"; 2609 startinpos = s-starts; 2610 endinpos = startinpos + 1; 2611 goto utf8Error; 2612 } 2613 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 2614 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2615 *p++ = (Py_UNICODE)ch; 2616 break; 2617 2618 case 3: 2619 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 2620 will result in surrogates in range d800-dfff. Surrogates are 2621 not valid UTF-8 so they are rejected. 
2622 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2623 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 2624 if ((s[1] & 0xc0) != 0x80 || 2625 (s[2] & 0xc0) != 0x80 || 2626 ((unsigned char)s[0] == 0xE0 && 2627 (unsigned char)s[1] < 0xA0) || 2628 ((unsigned char)s[0] == 0xED && 2629 (unsigned char)s[1] > 0x9F)) { 2630 errmsg = "invalid continuation byte"; 2631 startinpos = s-starts; 2632 endinpos = startinpos + 1; 2633 2634 /* if s[1] first two bits are 1 and 0, then the invalid 2635 continuation byte is s[2], so increment endinpos by 1, 2636 if not, s[1] is invalid and endinpos doesn't need to 2637 be incremented. */ 2638 if ((s[1] & 0xC0) == 0x80) 2639 endinpos++; 2640 goto utf8Error; 2641 } 2642 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 2643 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2644 *p++ = (Py_UNICODE)ch; 2645 break; 2646 2647 case 4: 2648 if ((s[1] & 0xc0) != 0x80 || 2649 (s[2] & 0xc0) != 0x80 || 2650 (s[3] & 0xc0) != 0x80 || 2651 ((unsigned char)s[0] == 0xF0 && 2652 (unsigned char)s[1] < 0x90) || 2653 ((unsigned char)s[0] == 0xF4 && 2654 (unsigned char)s[1] > 0x8F)) { 2655 errmsg = "invalid continuation byte"; 2656 startinpos = s-starts; 2657 endinpos = startinpos + 1; 2658 if ((s[1] & 0xC0) == 0x80) { 2659 endinpos++; 2660 if ((s[2] & 0xC0) == 0x80) 2661 endinpos++; 2662 } 2663 goto utf8Error; 2664 } 2665 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2666 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2667 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2668 2669#ifdef Py_UNICODE_WIDE 2670 *p++ = (Py_UNICODE)ch; 2671#else 2672 /* compute and append the two surrogates: */ 2673 2674 /* translate from 10000..10FFFF to 0..FFFF */ 2675 ch -= 0x10000; 2676 2677 /* high surrogate = top 10 bits added to D800 */ 2678 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2679 2680 /* low surrogate = bottom 10 bits added to DC00 */ 2681 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2682#endif 2683 break; 2684 } 2685 s += n; 2686 
continue; 2687 2688 utf8Error: 2689 outpos = p-PyUnicode_AS_UNICODE(unicode); 2690 if (unicode_decode_call_errorhandler( 2691 errors, &errorHandler, 2692 "utf8", errmsg, 2693 &starts, &e, &startinpos, &endinpos, &exc, &s, 2694 &unicode, &outpos, &p)) 2695 goto onError; 2696 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 2697 } 2698 if (consumed) 2699 *consumed = s-starts; 2700 2701 /* Adjust length */ 2702 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2703 goto onError; 2704 2705 Py_XDECREF(errorHandler); 2706 Py_XDECREF(exc); 2707 return (PyObject *)unicode; 2708 2709 onError: 2710 Py_XDECREF(errorHandler); 2711 Py_XDECREF(exc); 2712 Py_DECREF(unicode); 2713 return NULL; 2714} 2715 2716#undef ASCII_CHAR_MASK 2717 2718 2719/* Allocation strategy: if the string is short, convert into a stack buffer 2720 and allocate exactly as much space needed at the end. Else allocate the 2721 maximum possible needed (4 result bytes per Unicode character), and return 2722 the excess memory at the end. 2723*/ 2724PyObject * 2725PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2726 Py_ssize_t size, 2727 const char *errors) 2728{ 2729#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2730 2731 Py_ssize_t i; /* index into s of next input byte */ 2732 PyObject *result; /* result string object */ 2733 char *p; /* next free byte in output buffer */ 2734 Py_ssize_t nallocated; /* number of result bytes allocated */ 2735 Py_ssize_t nneeded; /* number of result bytes needed */ 2736 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2737 PyObject *errorHandler = NULL; 2738 PyObject *exc = NULL; 2739 2740 assert(s != NULL); 2741 assert(size >= 0); 2742 2743 if (size <= MAX_SHORT_UNICHARS) { 2744 /* Write into the stack buffer; nallocated can't overflow. 2745 * At the end, we'll allocate exactly as much heap space as it 2746 * turns out we need. 
2747 */ 2748 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 2749 result = NULL; /* will allocate after we're done */ 2750 p = stackbuf; 2751 } 2752 else { 2753 /* Overallocate on the heap, and give the excess back at the end. */ 2754 nallocated = size * 4; 2755 if (nallocated / 4 != size) /* overflow! */ 2756 return PyErr_NoMemory(); 2757 result = PyBytes_FromStringAndSize(NULL, nallocated); 2758 if (result == NULL) 2759 return NULL; 2760 p = PyBytes_AS_STRING(result); 2761 } 2762 2763 for (i = 0; i < size;) { 2764 Py_UCS4 ch = s[i++]; 2765 2766 if (ch < 0x80) 2767 /* Encode ASCII */ 2768 *p++ = (char) ch; 2769 2770 else if (ch < 0x0800) { 2771 /* Encode Latin-1 */ 2772 *p++ = (char)(0xc0 | (ch >> 6)); 2773 *p++ = (char)(0x80 | (ch & 0x3f)); 2774 } else if (0xD800 <= ch && ch <= 0xDFFF) { 2775#ifndef Py_UNICODE_WIDE 2776 /* Special case: check for high and low surrogate */ 2777 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { 2778 Py_UCS4 ch2 = s[i]; 2779 /* Combine the two surrogates to form a UCS4 value */ 2780 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2781 i++; 2782 2783 /* Encode UCS4 Unicode ordinals */ 2784 *p++ = (char)(0xf0 | (ch >> 18)); 2785 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2786 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2787 *p++ = (char)(0x80 | (ch & 0x3f)); 2788 } else { 2789#endif 2790 Py_ssize_t newpos; 2791 PyObject *rep; 2792 Py_ssize_t repsize, k; 2793 rep = unicode_encode_call_errorhandler 2794 (errors, &errorHandler, "utf-8", "surrogates not allowed", 2795 s, size, &exc, i-1, i, &newpos); 2796 if (!rep) 2797 goto error; 2798 2799 if (PyBytes_Check(rep)) 2800 repsize = PyBytes_GET_SIZE(rep); 2801 else 2802 repsize = PyUnicode_GET_SIZE(rep); 2803 2804 if (repsize > 4) { 2805 Py_ssize_t offset; 2806 2807 if (result == NULL) 2808 offset = p - stackbuf; 2809 else 2810 offset = p - PyBytes_AS_STRING(result); 2811 2812 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 2813 /* integer overflow */ 
2814 PyErr_NoMemory(); 2815 goto error; 2816 } 2817 nallocated += repsize - 4; 2818 if (result != NULL) { 2819 if (_PyBytes_Resize(&result, nallocated) < 0) 2820 goto error; 2821 } else { 2822 result = PyBytes_FromStringAndSize(NULL, nallocated); 2823 if (result == NULL) 2824 goto error; 2825 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 2826 } 2827 p = PyBytes_AS_STRING(result) + offset; 2828 } 2829 2830 if (PyBytes_Check(rep)) { 2831 char *prep = PyBytes_AS_STRING(rep); 2832 for(k = repsize; k > 0; k--) 2833 *p++ = *prep++; 2834 } else /* rep is unicode */ { 2835 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 2836 Py_UNICODE c; 2837 2838 for(k=0; k<repsize; k++) { 2839 c = prep[k]; 2840 if (0x80 <= c) { 2841 raise_encode_exception(&exc, "utf-8", s, size, 2842 i-1, i, "surrogates not allowed"); 2843 goto error; 2844 } 2845 *p++ = (char)prep[k]; 2846 } 2847 } 2848 Py_DECREF(rep); 2849#ifndef Py_UNICODE_WIDE 2850 } 2851#endif 2852 } else if (ch < 0x10000) { 2853 *p++ = (char)(0xe0 | (ch >> 12)); 2854 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2855 *p++ = (char)(0x80 | (ch & 0x3f)); 2856 } else /* ch >= 0x10000 */ { 2857 /* Encode UCS4 Unicode ordinals */ 2858 *p++ = (char)(0xf0 | (ch >> 18)); 2859 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2860 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2861 *p++ = (char)(0x80 | (ch & 0x3f)); 2862 } 2863 } 2864 2865 if (result == NULL) { 2866 /* This was stack allocated. */ 2867 nneeded = p - stackbuf; 2868 assert(nneeded <= nallocated); 2869 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 2870 } 2871 else { 2872 /* Cut back to size actually needed. 
*/ 2873 nneeded = p - PyBytes_AS_STRING(result); 2874 assert(nneeded <= nallocated); 2875 _PyBytes_Resize(&result, nneeded); 2876 } 2877 Py_XDECREF(errorHandler); 2878 Py_XDECREF(exc); 2879 return result; 2880 error: 2881 Py_XDECREF(errorHandler); 2882 Py_XDECREF(exc); 2883 Py_XDECREF(result); 2884 return NULL; 2885 2886#undef MAX_SHORT_UNICHARS 2887} 2888 2889PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2890{ 2891 if (!PyUnicode_Check(unicode)) { 2892 PyErr_BadArgument(); 2893 return NULL; 2894 } 2895 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2896 PyUnicode_GET_SIZE(unicode), 2897 NULL); 2898} 2899 2900/* --- UTF-32 Codec ------------------------------------------------------- */ 2901 2902PyObject * 2903PyUnicode_DecodeUTF32(const char *s, 2904 Py_ssize_t size, 2905 const char *errors, 2906 int *byteorder) 2907{ 2908 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2909} 2910 2911PyObject * 2912PyUnicode_DecodeUTF32Stateful(const char *s, 2913 Py_ssize_t size, 2914 const char *errors, 2915 int *byteorder, 2916 Py_ssize_t *consumed) 2917{ 2918 const char *starts = s; 2919 Py_ssize_t startinpos; 2920 Py_ssize_t endinpos; 2921 Py_ssize_t outpos; 2922 PyUnicodeObject *unicode; 2923 Py_UNICODE *p; 2924#ifndef Py_UNICODE_WIDE 2925 int pairs = 0; 2926 const unsigned char *qq; 2927#else 2928 const int pairs = 0; 2929#endif 2930 const unsigned char *q, *e; 2931 int bo = 0; /* assume native ordering by default */ 2932 const char *errmsg = ""; 2933 /* Offsets from q for retrieving bytes in the right order. */ 2934#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2935 int iorder[] = {0, 1, 2, 3}; 2936#else 2937 int iorder[] = {3, 2, 1, 0}; 2938#endif 2939 PyObject *errorHandler = NULL; 2940 PyObject *exc = NULL; 2941 2942 q = (unsigned char *)s; 2943 e = q + size; 2944 2945 if (byteorder) 2946 bo = *byteorder; 2947 2948 /* Check for BOM marks (U+FEFF) in the input and adjust current 2949 byte order setting accordingly. 
In native mode, the leading BOM 2950 mark is skipped, in all other modes, it is copied to the output 2951 stream as-is (giving a ZWNBSP character). */ 2952 if (bo == 0) { 2953 if (size >= 4) { 2954 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2955 (q[iorder[1]] << 8) | q[iorder[0]]; 2956#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2957 if (bom == 0x0000FEFF) { 2958 q += 4; 2959 bo = -1; 2960 } 2961 else if (bom == 0xFFFE0000) { 2962 q += 4; 2963 bo = 1; 2964 } 2965#else 2966 if (bom == 0x0000FEFF) { 2967 q += 4; 2968 bo = 1; 2969 } 2970 else if (bom == 0xFFFE0000) { 2971 q += 4; 2972 bo = -1; 2973 } 2974#endif 2975 } 2976 } 2977 2978 if (bo == -1) { 2979 /* force LE */ 2980 iorder[0] = 0; 2981 iorder[1] = 1; 2982 iorder[2] = 2; 2983 iorder[3] = 3; 2984 } 2985 else if (bo == 1) { 2986 /* force BE */ 2987 iorder[0] = 3; 2988 iorder[1] = 2; 2989 iorder[2] = 1; 2990 iorder[3] = 0; 2991 } 2992 2993 /* On narrow builds we split characters outside the BMP into two 2994 codepoints => count how much extra space we need. */ 2995#ifndef Py_UNICODE_WIDE 2996 for (qq = q; qq < e; qq += 4) 2997 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 2998 pairs++; 2999#endif 3000 3001 /* This might be one to much, because of a BOM */ 3002 unicode = _PyUnicode_New((size+3)/4+pairs); 3003 if (!unicode) 3004 return NULL; 3005 if (size == 0) 3006 return (PyObject *)unicode; 3007 3008 /* Unpack UTF-32 encoded data */ 3009 p = unicode->str; 3010 3011 while (q < e) { 3012 Py_UCS4 ch; 3013 /* remaining bytes at the end? 
(size should be divisible by 4) */ 3014 if (e-q<4) { 3015 if (consumed) 3016 break; 3017 errmsg = "truncated data"; 3018 startinpos = ((const char *)q)-starts; 3019 endinpos = ((const char *)e)-starts; 3020 goto utf32Error; 3021 /* The remaining input chars are ignored if the callback 3022 chooses to skip the input */ 3023 } 3024 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 3025 (q[iorder[1]] << 8) | q[iorder[0]]; 3026 3027 if (ch >= 0x110000) 3028 { 3029 errmsg = "codepoint not in range(0x110000)"; 3030 startinpos = ((const char *)q)-starts; 3031 endinpos = startinpos+4; 3032 goto utf32Error; 3033 } 3034#ifndef Py_UNICODE_WIDE 3035 if (ch >= 0x10000) 3036 { 3037 *p++ = 0xD800 | ((ch-0x10000) >> 10); 3038 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 3039 } 3040 else 3041#endif 3042 *p++ = ch; 3043 q += 4; 3044 continue; 3045 utf32Error: 3046 outpos = p-PyUnicode_AS_UNICODE(unicode); 3047 if (unicode_decode_call_errorhandler( 3048 errors, &errorHandler, 3049 "utf32", errmsg, 3050 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 3051 &unicode, &outpos, &p)) 3052 goto onError; 3053 } 3054 3055 if (byteorder) 3056 *byteorder = bo; 3057 3058 if (consumed) 3059 *consumed = (const char *)q-starts; 3060 3061 /* Adjust length */ 3062 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3063 goto onError; 3064 3065 Py_XDECREF(errorHandler); 3066 Py_XDECREF(exc); 3067 return (PyObject *)unicode; 3068 3069 onError: 3070 Py_DECREF(unicode); 3071 Py_XDECREF(errorHandler); 3072 Py_XDECREF(exc); 3073 return NULL; 3074} 3075 3076PyObject * 3077PyUnicode_EncodeUTF32(const Py_UNICODE *s, 3078 Py_ssize_t size, 3079 const char *errors, 3080 int byteorder) 3081{ 3082 PyObject *v; 3083 unsigned char *p; 3084 Py_ssize_t nsize, bytesize; 3085#ifndef Py_UNICODE_WIDE 3086 Py_ssize_t i, pairs; 3087#else 3088 const int pairs = 0; 3089#endif 3090 /* Offsets from p for storing byte pairs in the right order. 
*/ 3091#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3092 int iorder[] = {0, 1, 2, 3}; 3093#else 3094 int iorder[] = {3, 2, 1, 0}; 3095#endif 3096 3097#define STORECHAR(CH) \ 3098 do { \ 3099 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 3100 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 3101 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 3102 p[iorder[0]] = (CH) & 0xff; \ 3103 p += 4; \ 3104 } while(0) 3105 3106 /* In narrow builds we can output surrogate pairs as one codepoint, 3107 so we need less space. */ 3108#ifndef Py_UNICODE_WIDE 3109 for (i = pairs = 0; i < size-1; i++) 3110 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 3111 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 3112 pairs++; 3113#endif 3114 nsize = (size - pairs + (byteorder == 0)); 3115 bytesize = nsize * 4; 3116 if (bytesize / 4 != nsize) 3117 return PyErr_NoMemory(); 3118 v = PyBytes_FromStringAndSize(NULL, bytesize); 3119 if (v == NULL) 3120 return NULL; 3121 3122 p = (unsigned char *)PyBytes_AS_STRING(v); 3123 if (byteorder == 0) 3124 STORECHAR(0xFEFF); 3125 if (size == 0) 3126 goto done; 3127 3128 if (byteorder == -1) { 3129 /* force LE */ 3130 iorder[0] = 0; 3131 iorder[1] = 1; 3132 iorder[2] = 2; 3133 iorder[3] = 3; 3134 } 3135 else if (byteorder == 1) { 3136 /* force BE */ 3137 iorder[0] = 3; 3138 iorder[1] = 2; 3139 iorder[2] = 1; 3140 iorder[3] = 0; 3141 } 3142 3143 while (size-- > 0) { 3144 Py_UCS4 ch = *s++; 3145#ifndef Py_UNICODE_WIDE 3146 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 3147 Py_UCS4 ch2 = *s; 3148 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3149 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3150 s++; 3151 size--; 3152 } 3153 } 3154#endif 3155 STORECHAR(ch); 3156 } 3157 3158 done: 3159 return v; 3160#undef STORECHAR 3161} 3162 3163PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 3164{ 3165 if (!PyUnicode_Check(unicode)) { 3166 PyErr_BadArgument(); 3167 return NULL; 3168 } 3169 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 3170 PyUnicode_GET_SIZE(unicode), 3171 NULL, 3172 0); 3173} 3174 3175/* --- 
UTF-16 Codec ------------------------------------------------------- */ 3176 3177PyObject * 3178PyUnicode_DecodeUTF16(const char *s, 3179 Py_ssize_t size, 3180 const char *errors, 3181 int *byteorder) 3182{ 3183 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 3184} 3185 3186/* Two masks for fast checking of whether a C 'long' may contain 3187 UTF16-encoded surrogate characters. This is an efficient heuristic, 3188 assuming that non-surrogate characters with a code point >= 0x8000 are 3189 rare in most input. 3190 FAST_CHAR_MASK is used when the input is in native byte ordering, 3191 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 3192*/ 3193#if (SIZEOF_LONG == 8) 3194# define FAST_CHAR_MASK 0x8000800080008000L 3195# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 3196#elif (SIZEOF_LONG == 4) 3197# define FAST_CHAR_MASK 0x80008000L 3198# define SWAPPED_FAST_CHAR_MASK 0x00800080L 3199#else 3200# error C 'long' size should be either 4 or 8! 3201#endif 3202 3203PyObject * 3204PyUnicode_DecodeUTF16Stateful(const char *s, 3205 Py_ssize_t size, 3206 const char *errors, 3207 int *byteorder, 3208 Py_ssize_t *consumed) 3209{ 3210 const char *starts = s; 3211 Py_ssize_t startinpos; 3212 Py_ssize_t endinpos; 3213 Py_ssize_t outpos; 3214 PyUnicodeObject *unicode; 3215 Py_UNICODE *p; 3216 const unsigned char *q, *e, *aligned_end; 3217 int bo = 0; /* assume native ordering by default */ 3218 int native_ordering = 0; 3219 const char *errmsg = ""; 3220 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 3221#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3222 int ihi = 1, ilo = 0; 3223#else 3224 int ihi = 0, ilo = 1; 3225#endif 3226 PyObject *errorHandler = NULL; 3227 PyObject *exc = NULL; 3228 3229 /* Note: size will always be longer than the resulting Unicode 3230 character count */ 3231 unicode = _PyUnicode_New(size); 3232 if (!unicode) 3233 return NULL; 3234 if (size == 0) 3235 return (PyObject *)unicode; 3236 3237 /* Unpack UTF-16 encoded data */ 3238 p = unicode->str; 3239 q = (unsigned char *)s; 3240 e = q + size - 1; 3241 3242 if (byteorder) 3243 bo = *byteorder; 3244 3245 /* Check for BOM marks (U+FEFF) in the input and adjust current 3246 byte order setting accordingly. In native mode, the leading BOM 3247 mark is skipped, in all other modes, it is copied to the output 3248 stream as-is (giving a ZWNBSP character). */ 3249 if (bo == 0) { 3250 if (size >= 2) { 3251 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 3252#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3253 if (bom == 0xFEFF) { 3254 q += 2; 3255 bo = -1; 3256 } 3257 else if (bom == 0xFFFE) { 3258 q += 2; 3259 bo = 1; 3260 } 3261#else 3262 if (bom == 0xFEFF) { 3263 q += 2; 3264 bo = 1; 3265 } 3266 else if (bom == 0xFFFE) { 3267 q += 2; 3268 bo = -1; 3269 } 3270#endif 3271 } 3272 } 3273 3274 if (bo == -1) { 3275 /* force LE */ 3276 ihi = 1; 3277 ilo = 0; 3278 } 3279 else if (bo == 1) { 3280 /* force BE */ 3281 ihi = 0; 3282 ilo = 1; 3283 } 3284#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3285 native_ordering = ilo < ihi; 3286#else 3287 native_ordering = ilo > ihi; 3288#endif 3289 3290 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 3291 while (q < e) { 3292 Py_UNICODE ch; 3293 /* First check for possible aligned read of a C 'long'. Unaligned 3294 reads are more expensive, better to defer to another iteration. */ 3295 if (!((size_t) q & LONG_PTR_MASK)) { 3296 /* Fast path for runs of non-surrogate chars. 
*/ 3297 register const unsigned char *_q = q; 3298 Py_UNICODE *_p = p; 3299 if (native_ordering) { 3300 /* Native ordering is simple: as long as the input cannot 3301 possibly contain a surrogate char, do an unrolled copy 3302 of several 16-bit code points to the target object. 3303 The non-surrogate check is done on several input bytes 3304 at a time (as many as a C 'long' can contain). */ 3305 while (_q < aligned_end) { 3306 unsigned long data = * (unsigned long *) _q; 3307 if (data & FAST_CHAR_MASK) 3308 break; 3309 _p[0] = ((unsigned short *) _q)[0]; 3310 _p[1] = ((unsigned short *) _q)[1]; 3311#if (SIZEOF_LONG == 8) 3312 _p[2] = ((unsigned short *) _q)[2]; 3313 _p[3] = ((unsigned short *) _q)[3]; 3314#endif 3315 _q += SIZEOF_LONG; 3316 _p += SIZEOF_LONG / 2; 3317 } 3318 } 3319 else { 3320 /* Byteswapped ordering is similar, but we must decompose 3321 the copy bytewise, and take care of zero'ing out the 3322 upper bytes if the target object is in 32-bit units 3323 (that is, in UCS-4 builds). */ 3324 while (_q < aligned_end) { 3325 unsigned long data = * (unsigned long *) _q; 3326 if (data & SWAPPED_FAST_CHAR_MASK) 3327 break; 3328 /* Zero upper bytes in UCS-4 builds */ 3329#if (Py_UNICODE_SIZE > 2) 3330 _p[0] = 0; 3331 _p[1] = 0; 3332#if (SIZEOF_LONG == 8) 3333 _p[2] = 0; 3334 _p[3] = 0; 3335#endif 3336#endif 3337 /* Issue #4916; UCS-4 builds on big endian machines must 3338 fill the two last bytes of each 4-byte unit. 
*/ 3339#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 3340# define OFF 2 3341#else 3342# define OFF 0 3343#endif 3344 ((unsigned char *) _p)[OFF + 1] = _q[0]; 3345 ((unsigned char *) _p)[OFF + 0] = _q[1]; 3346 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 3347 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 3348#if (SIZEOF_LONG == 8) 3349 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 3350 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 3351 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 3352 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 3353#endif 3354#undef OFF 3355 _q += SIZEOF_LONG; 3356 _p += SIZEOF_LONG / 2; 3357 } 3358 } 3359 p = _p; 3360 q = _q; 3361 if (q >= e) 3362 break; 3363 } 3364 ch = (q[ihi] << 8) | q[ilo]; 3365 3366 q += 2; 3367 3368 if (ch < 0xD800 || ch > 0xDFFF) { 3369 *p++ = ch; 3370 continue; 3371 } 3372 3373 /* UTF-16 code pair: */ 3374 if (q > e) { 3375 errmsg = "unexpected end of data"; 3376 startinpos = (((const char *)q) - 2) - starts; 3377 endinpos = ((const char *)e) + 1 - starts; 3378 goto utf16Error; 3379 } 3380 if (0xD800 <= ch && ch <= 0xDBFF) { 3381 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 3382 q += 2; 3383 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 3384#ifndef Py_UNICODE_WIDE 3385 *p++ = ch; 3386 *p++ = ch2; 3387#else 3388 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 3389#endif 3390 continue; 3391 } 3392 else { 3393 errmsg = "illegal UTF-16 surrogate"; 3394 startinpos = (((const char *)q)-4)-starts; 3395 endinpos = startinpos+2; 3396 goto utf16Error; 3397 } 3398 3399 } 3400 errmsg = "illegal encoding"; 3401 startinpos = (((const char *)q)-2)-starts; 3402 endinpos = startinpos+2; 3403 /* Fall through to report the error */ 3404 3405 utf16Error: 3406 outpos = p - PyUnicode_AS_UNICODE(unicode); 3407 if (unicode_decode_call_errorhandler( 3408 errors, 3409 &errorHandler, 3410 "utf16", errmsg, 3411 &starts, 3412 (const char 
**)&e, 3413 &startinpos, 3414 &endinpos, 3415 &exc, 3416 (const char **)&q, 3417 &unicode, 3418 &outpos, 3419 &p)) 3420 goto onError; 3421 } 3422 /* remaining byte at the end? (size should be even) */ 3423 if (e == q) { 3424 if (!consumed) { 3425 errmsg = "truncated data"; 3426 startinpos = ((const char *)q) - starts; 3427 endinpos = ((const char *)e) + 1 - starts; 3428 outpos = p - PyUnicode_AS_UNICODE(unicode); 3429 if (unicode_decode_call_errorhandler( 3430 errors, 3431 &errorHandler, 3432 "utf16", errmsg, 3433 &starts, 3434 (const char **)&e, 3435 &startinpos, 3436 &endinpos, 3437 &exc, 3438 (const char **)&q, 3439 &unicode, 3440 &outpos, 3441 &p)) 3442 goto onError; 3443 /* The remaining input chars are ignored if the callback 3444 chooses to skip the input */ 3445 } 3446 } 3447 3448 if (byteorder) 3449 *byteorder = bo; 3450 3451 if (consumed) 3452 *consumed = (const char *)q-starts; 3453 3454 /* Adjust length */ 3455 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 3456 goto onError; 3457 3458 Py_XDECREF(errorHandler); 3459 Py_XDECREF(exc); 3460 return (PyObject *)unicode; 3461 3462 onError: 3463 Py_DECREF(unicode); 3464 Py_XDECREF(errorHandler); 3465 Py_XDECREF(exc); 3466 return NULL; 3467} 3468 3469#undef FAST_CHAR_MASK 3470#undef SWAPPED_FAST_CHAR_MASK 3471 3472PyObject * 3473PyUnicode_EncodeUTF16(const Py_UNICODE *s, 3474 Py_ssize_t size, 3475 const char *errors, 3476 int byteorder) 3477{ 3478 PyObject *v; 3479 unsigned char *p; 3480 Py_ssize_t nsize, bytesize; 3481#ifdef Py_UNICODE_WIDE 3482 Py_ssize_t i, pairs; 3483#else 3484 const int pairs = 0; 3485#endif 3486 /* Offsets from p for storing byte pairs in the right order. 
*/ 3487#ifdef BYTEORDER_IS_LITTLE_ENDIAN 3488 int ihi = 1, ilo = 0; 3489#else 3490 int ihi = 0, ilo = 1; 3491#endif 3492 3493#define STORECHAR(CH) \ 3494 do { \ 3495 p[ihi] = ((CH) >> 8) & 0xff; \ 3496 p[ilo] = (CH) & 0xff; \ 3497 p += 2; \ 3498 } while(0) 3499 3500#ifdef Py_UNICODE_WIDE 3501 for (i = pairs = 0; i < size; i++) 3502 if (s[i] >= 0x10000) 3503 pairs++; 3504#endif 3505 /* 2 * (size + pairs + (byteorder == 0)) */ 3506 if (size > PY_SSIZE_T_MAX || 3507 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 3508 return PyErr_NoMemory(); 3509 nsize = size + pairs + (byteorder == 0); 3510 bytesize = nsize * 2; 3511 if (bytesize / 2 != nsize) 3512 return PyErr_NoMemory(); 3513 v = PyBytes_FromStringAndSize(NULL, bytesize); 3514 if (v == NULL) 3515 return NULL; 3516 3517 p = (unsigned char *)PyBytes_AS_STRING(v); 3518 if (byteorder == 0) 3519 STORECHAR(0xFEFF); 3520 if (size == 0) 3521 goto done; 3522 3523 if (byteorder == -1) { 3524 /* force LE */ 3525 ihi = 1; 3526 ilo = 0; 3527 } 3528 else if (byteorder == 1) { 3529 /* force BE */ 3530 ihi = 0; 3531 ilo = 1; 3532 } 3533 3534 while (size-- > 0) { 3535 Py_UNICODE ch = *s++; 3536 Py_UNICODE ch2 = 0; 3537#ifdef Py_UNICODE_WIDE 3538 if (ch >= 0x10000) { 3539 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 3540 ch = 0xD800 | ((ch-0x10000) >> 10); 3541 } 3542#endif 3543 STORECHAR(ch); 3544 if (ch2) 3545 STORECHAR(ch2); 3546 } 3547 3548 done: 3549 return v; 3550#undef STORECHAR 3551} 3552 3553PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 3554{ 3555 if (!PyUnicode_Check(unicode)) { 3556 PyErr_BadArgument(); 3557 return NULL; 3558 } 3559 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 3560 PyUnicode_GET_SIZE(unicode), 3561 NULL, 3562 0); 3563} 3564 3565/* --- Unicode Escape Codec ----------------------------------------------- */ 3566 3567static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 3568 3569PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 3570 Py_ssize_t size, 3571 const char *errors) 3572{ 3573 
const char *starts = s; 3574 Py_ssize_t startinpos; 3575 Py_ssize_t endinpos; 3576 Py_ssize_t outpos; 3577 int i; 3578 PyUnicodeObject *v; 3579 Py_UNICODE *p; 3580 const char *end; 3581 char* message; 3582 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 3583 PyObject *errorHandler = NULL; 3584 PyObject *exc = NULL; 3585 3586 /* Escaped strings will always be longer than the resulting 3587 Unicode string, so we start with size here and then reduce the 3588 length after conversion to the true value. 3589 (but if the error callback returns a long replacement string 3590 we'll have to allocate more space) */ 3591 v = _PyUnicode_New(size); 3592 if (v == NULL) 3593 goto onError; 3594 if (size == 0) 3595 return (PyObject *)v; 3596 3597 p = PyUnicode_AS_UNICODE(v); 3598 end = s + size; 3599 3600 while (s < end) { 3601 unsigned char c; 3602 Py_UNICODE x; 3603 int digits; 3604 3605 /* Non-escape characters are interpreted as Unicode ordinals */ 3606 if (*s != '\\') { 3607 *p++ = (unsigned char) *s++; 3608 continue; 3609 } 3610 3611 startinpos = s-starts; 3612 /* \ - Escapes */ 3613 s++; 3614 c = *s++; 3615 if (s > end) 3616 c = '\0'; /* Invalid after \ */ 3617 switch (c) { 3618 3619 /* \x escapes */ 3620 case '\n': break; 3621 case '\\': *p++ = '\\'; break; 3622 case '\'': *p++ = '\''; break; 3623 case '\"': *p++ = '\"'; break; 3624 case 'b': *p++ = '\b'; break; 3625 case 'f': *p++ = '\014'; break; /* FF */ 3626 case 't': *p++ = '\t'; break; 3627 case 'n': *p++ = '\n'; break; 3628 case 'r': *p++ = '\r'; break; 3629 case 'v': *p++ = '\013'; break; /* VT */ 3630 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 3631 3632 /* \OOO (octal) escapes */ 3633 case '0': case '1': case '2': case '3': 3634 case '4': case '5': case '6': case '7': 3635 x = s[-1] - '0'; 3636 if (s < end && '0' <= *s && *s <= '7') { 3637 x = (x<<3) + *s++ - '0'; 3638 if (s < end && '0' <= *s && *s <= '7') 3639 x = (x<<3) + *s++ - '0'; 3640 } 3641 *p++ = x; 3642 break; 3643 3644 /* hex 
escapes */ 3645 /* \xXX */ 3646 case 'x': 3647 digits = 2; 3648 message = "truncated \\xXX escape"; 3649 goto hexescape; 3650 3651 /* \uXXXX */ 3652 case 'u': 3653 digits = 4; 3654 message = "truncated \\uXXXX escape"; 3655 goto hexescape; 3656 3657 /* \UXXXXXXXX */ 3658 case 'U': 3659 digits = 8; 3660 message = "truncated \\UXXXXXXXX escape"; 3661 hexescape: 3662 chr = 0; 3663 outpos = p-PyUnicode_AS_UNICODE(v); 3664 if (s+digits>end) { 3665 endinpos = size; 3666 if (unicode_decode_call_errorhandler( 3667 errors, &errorHandler, 3668 "unicodeescape", "end of string in escape sequence", 3669 &starts, &end, &startinpos, &endinpos, &exc, &s, 3670 &v, &outpos, &p)) 3671 goto onError; 3672 goto nextByte; 3673 } 3674 for (i = 0; i < digits; ++i) { 3675 c = (unsigned char) s[i]; 3676 if (!ISXDIGIT(c)) { 3677 endinpos = (s+i+1)-starts; 3678 if (unicode_decode_call_errorhandler( 3679 errors, &errorHandler, 3680 "unicodeescape", message, 3681 &starts, &end, &startinpos, &endinpos, &exc, &s, 3682 &v, &outpos, &p)) 3683 goto onError; 3684 goto nextByte; 3685 } 3686 chr = (chr<<4) & ~0xF; 3687 if (c >= '0' && c <= '9') 3688 chr += c - '0'; 3689 else if (c >= 'a' && c <= 'f') 3690 chr += 10 + c - 'a'; 3691 else 3692 chr += 10 + c - 'A'; 3693 } 3694 s += i; 3695 if (chr == 0xffffffff && PyErr_Occurred()) 3696 /* _decoding_error will have already written into the 3697 target buffer. */ 3698 break; 3699 store: 3700 /* when we get here, chr is a 32-bit unicode character */ 3701 if (chr <= 0xffff) 3702 /* UCS-2 character */ 3703 *p++ = (Py_UNICODE) chr; 3704 else if (chr <= 0x10ffff) { 3705 /* UCS-4 character. Either store directly, or as 3706 surrogate pair. 
*/ 3707#ifdef Py_UNICODE_WIDE 3708 *p++ = chr; 3709#else 3710 chr -= 0x10000L; 3711 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 3712 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 3713#endif 3714 } else { 3715 endinpos = s-starts; 3716 outpos = p-PyUnicode_AS_UNICODE(v); 3717 if (unicode_decode_call_errorhandler( 3718 errors, &errorHandler, 3719 "unicodeescape", "illegal Unicode character", 3720 &starts, &end, &startinpos, &endinpos, &exc, &s, 3721 &v, &outpos, &p)) 3722 goto onError; 3723 } 3724 break; 3725 3726 /* \N{name} */ 3727 case 'N': 3728 message = "malformed \\N character escape"; 3729 if (ucnhash_CAPI == NULL) { 3730 /* load the unicode data module */ 3731 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 3732 if (ucnhash_CAPI == NULL) 3733 goto ucnhashError; 3734 } 3735 if (*s == '{') { 3736 const char *start = s+1; 3737 /* look for the closing brace */ 3738 while (*s != '}' && s < end) 3739 s++; 3740 if (s > start && s < end && *s == '}') { 3741 /* found a name. 
look it up in the unicode database */ 3742 message = "unknown Unicode character name"; 3743 s++; 3744 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 3745 goto store; 3746 } 3747 } 3748 endinpos = s-starts; 3749 outpos = p-PyUnicode_AS_UNICODE(v); 3750 if (unicode_decode_call_errorhandler( 3751 errors, &errorHandler, 3752 "unicodeescape", message, 3753 &starts, &end, &startinpos, &endinpos, &exc, &s, 3754 &v, &outpos, &p)) 3755 goto onError; 3756 break; 3757 3758 default: 3759 if (s > end) { 3760 message = "\\ at end of string"; 3761 s--; 3762 endinpos = s-starts; 3763 outpos = p-PyUnicode_AS_UNICODE(v); 3764 if (unicode_decode_call_errorhandler( 3765 errors, &errorHandler, 3766 "unicodeescape", message, 3767 &starts, &end, &startinpos, &endinpos, &exc, &s, 3768 &v, &outpos, &p)) 3769 goto onError; 3770 } 3771 else { 3772 *p++ = '\\'; 3773 *p++ = (unsigned char)s[-1]; 3774 } 3775 break; 3776 } 3777 nextByte: 3778 ; 3779 } 3780 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3781 goto onError; 3782 Py_XDECREF(errorHandler); 3783 Py_XDECREF(exc); 3784 return (PyObject *)v; 3785 3786 ucnhashError: 3787 PyErr_SetString( 3788 PyExc_UnicodeError, 3789 "\\N escapes not supported (can't load unicodedata module)" 3790 ); 3791 Py_XDECREF(v); 3792 Py_XDECREF(errorHandler); 3793 Py_XDECREF(exc); 3794 return NULL; 3795 3796 onError: 3797 Py_XDECREF(v); 3798 Py_XDECREF(errorHandler); 3799 Py_XDECREF(exc); 3800 return NULL; 3801} 3802 3803/* Return a Unicode-Escape string version of the Unicode object. 3804 3805 If quotes is true, the string is enclosed in u"" or u'' quotes as 3806 appropriate. 
3807 3808*/ 3809 3810Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 3811 Py_ssize_t size, 3812 Py_UNICODE ch) 3813{ 3814 /* like wcschr, but doesn't stop at NULL characters */ 3815 3816 while (size-- > 0) { 3817 if (*s == ch) 3818 return s; 3819 s++; 3820 } 3821 3822 return NULL; 3823} 3824 3825static const char *hexdigits = "0123456789abcdef"; 3826 3827PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3828 Py_ssize_t size) 3829{ 3830 PyObject *repr; 3831 char *p; 3832 3833#ifdef Py_UNICODE_WIDE 3834 const Py_ssize_t expandsize = 10; 3835#else 3836 const Py_ssize_t expandsize = 6; 3837#endif 3838 3839 /* XXX(nnorwitz): rather than over-allocating, it would be 3840 better to choose a different scheme. Perhaps scan the 3841 first N-chars of the string and allocate based on that size. 3842 */ 3843 /* Initial allocation is based on the longest-possible unichr 3844 escape. 3845 3846 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3847 unichr, so in this case it's the longest unichr escape. In 3848 narrow (UTF-16) builds this is five chars per source unichr 3849 since there are two unichrs in the surrogate pair, so in narrow 3850 (UTF-16) builds it's not the longest unichr escape. 3851 3852 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3853 so in the narrow (UTF-16) build case it's the longest unichr 3854 escape. 
3855 */ 3856 3857 if (size == 0) 3858 return PyBytes_FromStringAndSize(NULL, 0); 3859 3860 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3861 return PyErr_NoMemory(); 3862 3863 repr = PyBytes_FromStringAndSize(NULL, 3864 2 3865 + expandsize*size 3866 + 1); 3867 if (repr == NULL) 3868 return NULL; 3869 3870 p = PyBytes_AS_STRING(repr); 3871 3872 while (size-- > 0) { 3873 Py_UNICODE ch = *s++; 3874 3875 /* Escape backslashes */ 3876 if (ch == '\\') { 3877 *p++ = '\\'; 3878 *p++ = (char) ch; 3879 continue; 3880 } 3881 3882#ifdef Py_UNICODE_WIDE 3883 /* Map 21-bit characters to '\U00xxxxxx' */ 3884 else if (ch >= 0x10000) { 3885 *p++ = '\\'; 3886 *p++ = 'U'; 3887 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 3888 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 3889 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 3890 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 3891 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 3892 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 3893 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 3894 *p++ = hexdigits[ch & 0x0000000F]; 3895 continue; 3896 } 3897#else 3898 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3899 else if (ch >= 0xD800 && ch < 0xDC00) { 3900 Py_UNICODE ch2; 3901 Py_UCS4 ucs; 3902 3903 ch2 = *s++; 3904 size--; 3905 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3906 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3907 *p++ = '\\'; 3908 *p++ = 'U'; 3909 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 3910 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 3911 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 3912 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 3913 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 3914 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 3915 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 3916 *p++ = hexdigits[ucs & 0x0000000F]; 3917 continue; 3918 } 3919 /* Fall through: isolated surrogates are copied as-is */ 3920 s--; 3921 size++; 3922 } 3923#endif 3924 3925 /* Map 16-bit characters to '\uxxxx' */ 3926 if (ch >= 256) { 3927 *p++ = '\\'; 3928 *p++ = 
'u'; 3929 *p++ = hexdigits[(ch >> 12) & 0x000F]; 3930 *p++ = hexdigits[(ch >> 8) & 0x000F]; 3931 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3932 *p++ = hexdigits[ch & 0x000F]; 3933 } 3934 3935 /* Map special whitespace to '\t', \n', '\r' */ 3936 else if (ch == '\t') { 3937 *p++ = '\\'; 3938 *p++ = 't'; 3939 } 3940 else if (ch == '\n') { 3941 *p++ = '\\'; 3942 *p++ = 'n'; 3943 } 3944 else if (ch == '\r') { 3945 *p++ = '\\'; 3946 *p++ = 'r'; 3947 } 3948 3949 /* Map non-printable US ASCII to '\xhh' */ 3950 else if (ch < ' ' || ch >= 0x7F) { 3951 *p++ = '\\'; 3952 *p++ = 'x'; 3953 *p++ = hexdigits[(ch >> 4) & 0x000F]; 3954 *p++ = hexdigits[ch & 0x000F]; 3955 } 3956 3957 /* Copy everything else as-is */ 3958 else 3959 *p++ = (char) ch; 3960 } 3961 3962 assert(p - PyBytes_AS_STRING(repr) > 0); 3963 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 3964 return NULL; 3965 return repr; 3966} 3967 3968PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3969{ 3970 PyObject *s; 3971 if (!PyUnicode_Check(unicode)) { 3972 PyErr_BadArgument(); 3973 return NULL; 3974 } 3975 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3976 PyUnicode_GET_SIZE(unicode)); 3977 return s; 3978} 3979 3980/* --- Raw Unicode Escape Codec ------------------------------------------- */ 3981 3982PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3983 Py_ssize_t size, 3984 const char *errors) 3985{ 3986 const char *starts = s; 3987 Py_ssize_t startinpos; 3988 Py_ssize_t endinpos; 3989 Py_ssize_t outpos; 3990 PyUnicodeObject *v; 3991 Py_UNICODE *p; 3992 const char *end; 3993 const char *bs; 3994 PyObject *errorHandler = NULL; 3995 PyObject *exc = NULL; 3996 3997 /* Escaped strings will always be longer than the resulting 3998 Unicode string, so we start with size here and then reduce the 3999 length after conversion to the true value. 
(But decoding error 4000 handler might have to resize the string) */ 4001 v = _PyUnicode_New(size); 4002 if (v == NULL) 4003 goto onError; 4004 if (size == 0) 4005 return (PyObject *)v; 4006 p = PyUnicode_AS_UNICODE(v); 4007 end = s + size; 4008 while (s < end) { 4009 unsigned char c; 4010 Py_UCS4 x; 4011 int i; 4012 int count; 4013 4014 /* Non-escape characters are interpreted as Unicode ordinals */ 4015 if (*s != '\\') { 4016 *p++ = (unsigned char)*s++; 4017 continue; 4018 } 4019 startinpos = s-starts; 4020 4021 /* \u-escapes are only interpreted iff the number of leading 4022 backslashes if odd */ 4023 bs = s; 4024 for (;s < end;) { 4025 if (*s != '\\') 4026 break; 4027 *p++ = (unsigned char)*s++; 4028 } 4029 if (((s - bs) & 1) == 0 || 4030 s >= end || 4031 (*s != 'u' && *s != 'U')) { 4032 continue; 4033 } 4034 p--; 4035 count = *s=='u' ? 4 : 8; 4036 s++; 4037 4038 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 4039 outpos = p-PyUnicode_AS_UNICODE(v); 4040 for (x = 0, i = 0; i < count; ++i, ++s) { 4041 c = (unsigned char)*s; 4042 if (!ISXDIGIT(c)) { 4043 endinpos = s-starts; 4044 if (unicode_decode_call_errorhandler( 4045 errors, &errorHandler, 4046 "rawunicodeescape", "truncated \\uXXXX", 4047 &starts, &end, &startinpos, &endinpos, &exc, &s, 4048 &v, &outpos, &p)) 4049 goto onError; 4050 goto nextByte; 4051 } 4052 x = (x<<4) & ~0xF; 4053 if (c >= '0' && c <= '9') 4054 x += c - '0'; 4055 else if (c >= 'a' && c <= 'f') 4056 x += 10 + c - 'a'; 4057 else 4058 x += 10 + c - 'A'; 4059 } 4060 if (x <= 0xffff) 4061 /* UCS-2 character */ 4062 *p++ = (Py_UNICODE) x; 4063 else if (x <= 0x10ffff) { 4064 /* UCS-4 character. Either store directly, or as 4065 surrogate pair. 
*/ 4066#ifdef Py_UNICODE_WIDE 4067 *p++ = (Py_UNICODE) x; 4068#else 4069 x -= 0x10000L; 4070 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 4071 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 4072#endif 4073 } else { 4074 endinpos = s-starts; 4075 outpos = p-PyUnicode_AS_UNICODE(v); 4076 if (unicode_decode_call_errorhandler( 4077 errors, &errorHandler, 4078 "rawunicodeescape", "\\Uxxxxxxxx out of range", 4079 &starts, &end, &startinpos, &endinpos, &exc, &s, 4080 &v, &outpos, &p)) 4081 goto onError; 4082 } 4083 nextByte: 4084 ; 4085 } 4086 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4087 goto onError; 4088 Py_XDECREF(errorHandler); 4089 Py_XDECREF(exc); 4090 return (PyObject *)v; 4091 4092 onError: 4093 Py_XDECREF(v); 4094 Py_XDECREF(errorHandler); 4095 Py_XDECREF(exc); 4096 return NULL; 4097} 4098 4099PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 4100 Py_ssize_t size) 4101{ 4102 PyObject *repr; 4103 char *p; 4104 char *q; 4105 4106#ifdef Py_UNICODE_WIDE 4107 const Py_ssize_t expandsize = 10; 4108#else 4109 const Py_ssize_t expandsize = 6; 4110#endif 4111 4112 if (size > PY_SSIZE_T_MAX / expandsize) 4113 return PyErr_NoMemory(); 4114 4115 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 4116 if (repr == NULL) 4117 return NULL; 4118 if (size == 0) 4119 return repr; 4120 4121 p = q = PyBytes_AS_STRING(repr); 4122 while (size-- > 0) { 4123 Py_UNICODE ch = *s++; 4124#ifdef Py_UNICODE_WIDE 4125 /* Map 32-bit characters to '\Uxxxxxxxx' */ 4126 if (ch >= 0x10000) { 4127 *p++ = '\\'; 4128 *p++ = 'U'; 4129 *p++ = hexdigits[(ch >> 28) & 0xf]; 4130 *p++ = hexdigits[(ch >> 24) & 0xf]; 4131 *p++ = hexdigits[(ch >> 20) & 0xf]; 4132 *p++ = hexdigits[(ch >> 16) & 0xf]; 4133 *p++ = hexdigits[(ch >> 12) & 0xf]; 4134 *p++ = hexdigits[(ch >> 8) & 0xf]; 4135 *p++ = hexdigits[(ch >> 4) & 0xf]; 4136 *p++ = hexdigits[ch & 15]; 4137 } 4138 else 4139#else 4140 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 4141 if (ch >= 0xD800 && ch < 0xDC00) { 4142 
Py_UNICODE ch2; 4143 Py_UCS4 ucs; 4144 4145 ch2 = *s++; 4146 size--; 4147 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 4148 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 4149 *p++ = '\\'; 4150 *p++ = 'U'; 4151 *p++ = hexdigits[(ucs >> 28) & 0xf]; 4152 *p++ = hexdigits[(ucs >> 24) & 0xf]; 4153 *p++ = hexdigits[(ucs >> 20) & 0xf]; 4154 *p++ = hexdigits[(ucs >> 16) & 0xf]; 4155 *p++ = hexdigits[(ucs >> 12) & 0xf]; 4156 *p++ = hexdigits[(ucs >> 8) & 0xf]; 4157 *p++ = hexdigits[(ucs >> 4) & 0xf]; 4158 *p++ = hexdigits[ucs & 0xf]; 4159 continue; 4160 } 4161 /* Fall through: isolated surrogates are copied as-is */ 4162 s--; 4163 size++; 4164 } 4165#endif 4166 /* Map 16-bit characters to '\uxxxx' */ 4167 if (ch >= 256) { 4168 *p++ = '\\'; 4169 *p++ = 'u'; 4170 *p++ = hexdigits[(ch >> 12) & 0xf]; 4171 *p++ = hexdigits[(ch >> 8) & 0xf]; 4172 *p++ = hexdigits[(ch >> 4) & 0xf]; 4173 *p++ = hexdigits[ch & 15]; 4174 } 4175 /* Copy everything else as-is */ 4176 else 4177 *p++ = (char) ch; 4178 } 4179 size = p - q; 4180 4181 assert(size > 0); 4182 if (_PyBytes_Resize(&repr, size) < 0) 4183 return NULL; 4184 return repr; 4185} 4186 4187PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 4188{ 4189 PyObject *s; 4190 if (!PyUnicode_Check(unicode)) { 4191 PyErr_BadArgument(); 4192 return NULL; 4193 } 4194 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 4195 PyUnicode_GET_SIZE(unicode)); 4196 4197 return s; 4198} 4199 4200/* --- Unicode Internal Codec ------------------------------------------- */ 4201 4202PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 4203 Py_ssize_t size, 4204 const char *errors) 4205{ 4206 const char *starts = s; 4207 Py_ssize_t startinpos; 4208 Py_ssize_t endinpos; 4209 Py_ssize_t outpos; 4210 PyUnicodeObject *v; 4211 Py_UNICODE *p; 4212 const char *end; 4213 const char *reason; 4214 PyObject *errorHandler = NULL; 4215 PyObject *exc = NULL; 4216 4217#ifdef Py_UNICODE_WIDE 4218 Py_UNICODE unimax = 
PyUnicode_GetMax(); 4219#endif 4220 4221 /* XXX overflow detection missing */ 4222 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 4223 if (v == NULL) 4224 goto onError; 4225 if (PyUnicode_GetSize((PyObject *)v) == 0) 4226 return (PyObject *)v; 4227 p = PyUnicode_AS_UNICODE(v); 4228 end = s + size; 4229 4230 while (s < end) { 4231 memcpy(p, s, sizeof(Py_UNICODE)); 4232 /* We have to sanity check the raw data, otherwise doom looms for 4233 some malformed UCS-4 data. */ 4234 if ( 4235#ifdef Py_UNICODE_WIDE 4236 *p > unimax || *p < 0 || 4237#endif 4238 end-s < Py_UNICODE_SIZE 4239 ) 4240 { 4241 startinpos = s - starts; 4242 if (end-s < Py_UNICODE_SIZE) { 4243 endinpos = end-starts; 4244 reason = "truncated input"; 4245 } 4246 else { 4247 endinpos = s - starts + Py_UNICODE_SIZE; 4248 reason = "illegal code point (> 0x10FFFF)"; 4249 } 4250 outpos = p - PyUnicode_AS_UNICODE(v); 4251 if (unicode_decode_call_errorhandler( 4252 errors, &errorHandler, 4253 "unicode_internal", reason, 4254 &starts, &end, &startinpos, &endinpos, &exc, &s, 4255 &v, &outpos, &p)) { 4256 goto onError; 4257 } 4258 } 4259 else { 4260 p++; 4261 s += Py_UNICODE_SIZE; 4262 } 4263 } 4264 4265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4266 goto onError; 4267 Py_XDECREF(errorHandler); 4268 Py_XDECREF(exc); 4269 return (PyObject *)v; 4270 4271 onError: 4272 Py_XDECREF(v); 4273 Py_XDECREF(errorHandler); 4274 Py_XDECREF(exc); 4275 return NULL; 4276} 4277 4278/* --- Latin-1 Codec ------------------------------------------------------ */ 4279 4280PyObject *PyUnicode_DecodeLatin1(const char *s, 4281 Py_ssize_t size, 4282 const char *errors) 4283{ 4284 PyUnicodeObject *v; 4285 Py_UNICODE *p; 4286 const char *e, *unrolled_end; 4287 4288 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 4289 if (size == 1) { 4290 Py_UNICODE r = *(unsigned char*)s; 4291 return PyUnicode_FromUnicode(&r, 1); 4292 } 4293 4294 v = _PyUnicode_New(size); 4295 if (v == NULL) 4296 goto onError; 4297 if (size == 0) 4298 return (PyObject *)v; 4299 p = PyUnicode_AS_UNICODE(v); 4300 e = s + size; 4301 /* Unrolling the copy makes it much faster by reducing the looping 4302 overhead. This is similar to what many memcpy() implementations do. */ 4303 unrolled_end = e - 4; 4304 while (s < unrolled_end) { 4305 p[0] = (unsigned char) s[0]; 4306 p[1] = (unsigned char) s[1]; 4307 p[2] = (unsigned char) s[2]; 4308 p[3] = (unsigned char) s[3]; 4309 s += 4; 4310 p += 4; 4311 } 4312 while (s < e) 4313 *p++ = (unsigned char) *s++; 4314 return (PyObject *)v; 4315 4316 onError: 4317 Py_XDECREF(v); 4318 return NULL; 4319} 4320 4321/* create or adjust a UnicodeEncodeError */ 4322static void make_encode_exception(PyObject **exceptionObject, 4323 const char *encoding, 4324 const Py_UNICODE *unicode, Py_ssize_t size, 4325 Py_ssize_t startpos, Py_ssize_t endpos, 4326 const char *reason) 4327{ 4328 if (*exceptionObject == NULL) { 4329 *exceptionObject = PyUnicodeEncodeError_Create( 4330 encoding, unicode, size, startpos, endpos, reason); 4331 } 4332 else { 4333 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 4334 goto onError; 4335 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 4336 goto onError; 4337 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 4338 goto onError; 4339 return; 4340 onError: 4341 Py_DECREF(*exceptionObject); 4342 *exceptionObject = NULL; 4343 } 4344} 4345 4346/* raises a UnicodeEncodeError */ 4347static void raise_encode_exception(PyObject **exceptionObject, 4348 const char *encoding, 4349 const Py_UNICODE *unicode, Py_ssize_t size, 4350 Py_ssize_t startpos, Py_ssize_t endpos, 4351 const char *reason) 4352{ 4353 make_encode_exception(exceptionObject, 4354 encoding, unicode, size, startpos, endpos, reason); 4355 if (*exceptionObject != 
NULL) 4356 PyCodec_StrictErrors(*exceptionObject); 4357} 4358 4359/* error handling callback helper: 4360 build arguments, call the callback and check the arguments, 4361 put the result into newpos and return the replacement string, which 4362 has to be freed by the caller */ 4363static PyObject *unicode_encode_call_errorhandler(const char *errors, 4364 PyObject **errorHandler, 4365 const char *encoding, const char *reason, 4366 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4367 Py_ssize_t startpos, Py_ssize_t endpos, 4368 Py_ssize_t *newpos) 4369{ 4370 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 4371 4372 PyObject *restuple; 4373 PyObject *resunicode; 4374 4375 if (*errorHandler == NULL) { 4376 *errorHandler = PyCodec_LookupError(errors); 4377 if (*errorHandler == NULL) 4378 return NULL; 4379 } 4380 4381 make_encode_exception(exceptionObject, 4382 encoding, unicode, size, startpos, endpos, reason); 4383 if (*exceptionObject == NULL) 4384 return NULL; 4385 4386 restuple = PyObject_CallFunctionObjArgs( 4387 *errorHandler, *exceptionObject, NULL); 4388 if (restuple == NULL) 4389 return NULL; 4390 if (!PyTuple_Check(restuple)) { 4391 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4392 Py_DECREF(restuple); 4393 return NULL; 4394 } 4395 if (!PyArg_ParseTuple(restuple, argparse, 4396 &resunicode, newpos)) { 4397 Py_DECREF(restuple); 4398 return NULL; 4399 } 4400 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 4401 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4402 Py_DECREF(restuple); 4403 return NULL; 4404 } 4405 if (*newpos<0) 4406 *newpos = size+*newpos; 4407 if (*newpos<0 || *newpos>size) { 4408 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4409 Py_DECREF(restuple); 4410 return NULL; 4411 } 4412 Py_INCREF(resunicode); 4413 Py_DECREF(restuple); 4414 return resunicode; 4415} 4416 4417static PyObject *unicode_encode_ucs1(const Py_UNICODE 
*p, 4418 Py_ssize_t size, 4419 const char *errors, 4420 int limit) 4421{ 4422 /* output object */ 4423 PyObject *res; 4424 /* pointers to the beginning and end+1 of input */ 4425 const Py_UNICODE *startp = p; 4426 const Py_UNICODE *endp = p + size; 4427 /* pointer to the beginning of the unencodable characters */ 4428 /* const Py_UNICODE *badp = NULL; */ 4429 /* pointer into the output */ 4430 char *str; 4431 /* current output position */ 4432 Py_ssize_t ressize; 4433 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 4434 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 4435 PyObject *errorHandler = NULL; 4436 PyObject *exc = NULL; 4437 /* the following variable is used for caching string comparisons 4438 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4439 int known_errorHandler = -1; 4440 4441 /* allocate enough for a simple encoding without 4442 replacements, if we need more, we'll resize */ 4443 if (size == 0) 4444 return PyBytes_FromStringAndSize(NULL, 0); 4445 res = PyBytes_FromStringAndSize(NULL, size); 4446 if (res == NULL) 4447 return NULL; 4448 str = PyBytes_AS_STRING(res); 4449 ressize = size; 4450 4451 while (p<endp) { 4452 Py_UNICODE c = *p; 4453 4454 /* can we encode this? */ 4455 if (c<limit) { 4456 /* no overflow check, because we know that the space is enough */ 4457 *str++ = (char)c; 4458 ++p; 4459 } 4460 else { 4461 Py_ssize_t unicodepos = p-startp; 4462 Py_ssize_t requiredsize; 4463 PyObject *repunicode; 4464 Py_ssize_t repsize; 4465 Py_ssize_t newpos; 4466 Py_ssize_t respos; 4467 Py_UNICODE *uni2; 4468 /* startpos for collecting unencodable chars */ 4469 const Py_UNICODE *collstart = p; 4470 const Py_UNICODE *collend = p; 4471 /* find all unecodable characters */ 4472 while ((collend < endp) && ((*collend)>=limit)) 4473 ++collend; 4474 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 4475 if (known_errorHandler==-1) { 4476 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4477 known_errorHandler = 1; 4478 else if (!strcmp(errors, "replace")) 4479 known_errorHandler = 2; 4480 else if (!strcmp(errors, "ignore")) 4481 known_errorHandler = 3; 4482 else if (!strcmp(errors, "xmlcharrefreplace")) 4483 known_errorHandler = 4; 4484 else 4485 known_errorHandler = 0; 4486 } 4487 switch (known_errorHandler) { 4488 case 1: /* strict */ 4489 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 4490 goto onError; 4491 case 2: /* replace */ 4492 while (collstart++<collend) 4493 *str++ = '?'; /* fall through */ 4494 case 3: /* ignore */ 4495 p = collend; 4496 break; 4497 case 4: /* xmlcharrefreplace */ 4498 respos = str - PyBytes_AS_STRING(res); 4499 /* determine replacement size (temporarily (mis)uses p) */ 4500 for (p = collstart, repsize = 0; p < collend; ++p) { 4501 if (*p<10) 4502 repsize += 2+1+1; 4503 else if (*p<100) 4504 repsize += 2+2+1; 4505 else if (*p<1000) 4506 repsize += 2+3+1; 4507 else if (*p<10000) 4508 repsize += 2+4+1; 4509#ifndef Py_UNICODE_WIDE 4510 else 4511 repsize += 2+5+1; 4512#else 4513 else if (*p<100000) 4514 repsize += 2+5+1; 4515 else if (*p<1000000) 4516 repsize += 2+6+1; 4517 else 4518 repsize += 2+7+1; 4519#endif 4520 } 4521 requiredsize = respos+repsize+(endp-collend); 4522 if (requiredsize > ressize) { 4523 if (requiredsize<2*ressize) 4524 requiredsize = 2*ressize; 4525 if (_PyBytes_Resize(&res, requiredsize)) 4526 goto onError; 4527 str = PyBytes_AS_STRING(res) + respos; 4528 ressize = requiredsize; 4529 } 4530 /* generate replacement (temporarily (mis)uses p) */ 4531 for (p = collstart; p < collend; ++p) { 4532 str += sprintf(str, "&#%d;", (int)*p); 4533 } 4534 p = collend; 4535 break; 4536 default: 4537 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4538 encoding, reason, startp, size, &exc, 4539 collstart-startp, collend-startp, 
&newpos); 4540 if (repunicode == NULL) 4541 goto onError; 4542 if (PyBytes_Check(repunicode)) { 4543 /* Directly copy bytes result to output. */ 4544 repsize = PyBytes_Size(repunicode); 4545 if (repsize > 1) { 4546 /* Make room for all additional bytes. */ 4547 respos = str - PyBytes_AS_STRING(res); 4548 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 4549 Py_DECREF(repunicode); 4550 goto onError; 4551 } 4552 str = PyBytes_AS_STRING(res) + respos; 4553 ressize += repsize-1; 4554 } 4555 memcpy(str, PyBytes_AsString(repunicode), repsize); 4556 str += repsize; 4557 p = startp + newpos; 4558 Py_DECREF(repunicode); 4559 break; 4560 } 4561 /* need more space? (at least enough for what we 4562 have+the replacement+the rest of the string, so 4563 we won't have to check space for encodable characters) */ 4564 respos = str - PyBytes_AS_STRING(res); 4565 repsize = PyUnicode_GET_SIZE(repunicode); 4566 requiredsize = respos+repsize+(endp-collend); 4567 if (requiredsize > ressize) { 4568 if (requiredsize<2*ressize) 4569 requiredsize = 2*ressize; 4570 if (_PyBytes_Resize(&res, requiredsize)) { 4571 Py_DECREF(repunicode); 4572 goto onError; 4573 } 4574 str = PyBytes_AS_STRING(res) + respos; 4575 ressize = requiredsize; 4576 } 4577 /* check if there is anything unencodable in the replacement 4578 and copy it to the output */ 4579 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 4580 c = *uni2; 4581 if (c >= limit) { 4582 raise_encode_exception(&exc, encoding, startp, size, 4583 unicodepos, unicodepos+1, reason); 4584 Py_DECREF(repunicode); 4585 goto onError; 4586 } 4587 *str = (char)c; 4588 } 4589 p = startp + newpos; 4590 Py_DECREF(repunicode); 4591 } 4592 } 4593 } 4594 /* Resize if we allocated to much */ 4595 size = str - PyBytes_AS_STRING(res); 4596 if (size < ressize) { /* If this falls res will be NULL */ 4597 assert(size >= 0); 4598 if (_PyBytes_Resize(&res, size) < 0) 4599 goto onError; 4600 } 4601 4602 Py_XDECREF(errorHandler); 4603 
Py_XDECREF(exc); 4604 return res; 4605 4606 onError: 4607 Py_XDECREF(res); 4608 Py_XDECREF(errorHandler); 4609 Py_XDECREF(exc); 4610 return NULL; 4611} 4612 4613PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 4614 Py_ssize_t size, 4615 const char *errors) 4616{ 4617 return unicode_encode_ucs1(p, size, errors, 256); 4618} 4619 4620PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 4621{ 4622 if (!PyUnicode_Check(unicode)) { 4623 PyErr_BadArgument(); 4624 return NULL; 4625 } 4626 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 4627 PyUnicode_GET_SIZE(unicode), 4628 NULL); 4629} 4630 4631/* --- 7-bit ASCII Codec -------------------------------------------------- */ 4632 4633PyObject *PyUnicode_DecodeASCII(const char *s, 4634 Py_ssize_t size, 4635 const char *errors) 4636{ 4637 const char *starts = s; 4638 PyUnicodeObject *v; 4639 Py_UNICODE *p; 4640 Py_ssize_t startinpos; 4641 Py_ssize_t endinpos; 4642 Py_ssize_t outpos; 4643 const char *e; 4644 PyObject *errorHandler = NULL; 4645 PyObject *exc = NULL; 4646 4647 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4648 if (size == 1 && *(unsigned char*)s < 128) { 4649 Py_UNICODE r = *(unsigned char*)s; 4650 return PyUnicode_FromUnicode(&r, 1); 4651 } 4652 4653 v = _PyUnicode_New(size); 4654 if (v == NULL) 4655 goto onError; 4656 if (size == 0) 4657 return (PyObject *)v; 4658 p = PyUnicode_AS_UNICODE(v); 4659 e = s + size; 4660 while (s < e) { 4661 register unsigned char c = (unsigned char)*s; 4662 if (c < 128) { 4663 *p++ = c; 4664 ++s; 4665 } 4666 else { 4667 startinpos = s-starts; 4668 endinpos = startinpos + 1; 4669 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 4670 if (unicode_decode_call_errorhandler( 4671 errors, &errorHandler, 4672 "ascii", "ordinal not in range(128)", 4673 &starts, &e, &startinpos, &endinpos, &exc, &s, 4674 &v, &outpos, &p)) 4675 goto onError; 4676 } 4677 } 4678 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 4679 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 4680 goto onError; 4681 Py_XDECREF(errorHandler); 4682 Py_XDECREF(exc); 4683 return (PyObject *)v; 4684 4685 onError: 4686 Py_XDECREF(v); 4687 Py_XDECREF(errorHandler); 4688 Py_XDECREF(exc); 4689 return NULL; 4690} 4691 4692PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 4693 Py_ssize_t size, 4694 const char *errors) 4695{ 4696 return unicode_encode_ucs1(p, size, errors, 128); 4697} 4698 4699PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 4700{ 4701 if (!PyUnicode_Check(unicode)) { 4702 PyErr_BadArgument(); 4703 return NULL; 4704 } 4705 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 4706 PyUnicode_GET_SIZE(unicode), 4707 NULL); 4708} 4709 4710#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 4711 4712/* --- MBCS codecs for Windows -------------------------------------------- */ 4713 4714#if SIZEOF_INT < SIZEOF_SIZE_T 4715#define NEED_RETRY 4716#endif 4717 4718/* XXX This code is limited to "true" double-byte encodings, as 4719 a) it assumes an incomplete character consists of a single byte, and 4720 b) IsDBCSLeadByte (probably) does 
not work for non-DBCS multi-byte 4721 encodings, see IsDBCSLeadByteEx documentation. */ 4722 4723static int is_dbcs_lead_byte(const char *s, int offset) 4724{ 4725 const char *curr = s + offset; 4726 4727 if (IsDBCSLeadByte(*curr)) { 4728 const char *prev = CharPrev(s, curr); 4729 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 4730 } 4731 return 0; 4732} 4733 4734/* 4735 * Decode MBCS string into unicode object. If 'final' is set, converts 4736 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 4737 */ 4738static int decode_mbcs(PyUnicodeObject **v, 4739 const char *s, /* MBCS string */ 4740 int size, /* sizeof MBCS string */ 4741 int final, 4742 const char *errors) 4743{ 4744 Py_UNICODE *p; 4745 Py_ssize_t n; 4746 DWORD usize; 4747 DWORD flags; 4748 4749 assert(size >= 0); 4750 4751 /* check and handle 'errors' arg */ 4752 if (errors==NULL || strcmp(errors, "strict")==0) 4753 flags = MB_ERR_INVALID_CHARS; 4754 else if (strcmp(errors, "ignore")==0) 4755 flags = 0; 4756 else { 4757 PyErr_Format(PyExc_ValueError, 4758 "mbcs encoding does not support errors='%s'", 4759 errors); 4760 return -1; 4761 } 4762 4763 /* Skip trailing lead-byte unless 'final' is set */ 4764 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 4765 --size; 4766 4767 /* First get the size of the result */ 4768 if (size > 0) { 4769 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 4770 if (usize==0) 4771 goto mbcs_decode_error; 4772 } else 4773 usize = 0; 4774 4775 if (*v == NULL) { 4776 /* Create unicode object */ 4777 *v = _PyUnicode_New(usize); 4778 if (*v == NULL) 4779 return -1; 4780 n = 0; 4781 } 4782 else { 4783 /* Extend unicode object */ 4784 n = PyUnicode_GET_SIZE(*v); 4785 if (_PyUnicode_Resize(v, n + usize) < 0) 4786 return -1; 4787 } 4788 4789 /* Do the conversion */ 4790 if (usize > 0) { 4791 p = PyUnicode_AS_UNICODE(*v) + n; 4792 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 4793 goto 
mbcs_decode_error; 4794 } 4795 } 4796 return size; 4797 4798mbcs_decode_error: 4799 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 4800 we raise a UnicodeDecodeError - else it is a 'generic' 4801 windows error 4802 */ 4803 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 4804 /* Ideally, we should get reason from FormatMessage - this 4805 is the Windows 2000 English version of the message 4806 */ 4807 PyObject *exc = NULL; 4808 const char *reason = "No mapping for the Unicode character exists " 4809 "in the target multi-byte code page."; 4810 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 4811 if (exc != NULL) { 4812 PyCodec_StrictErrors(exc); 4813 Py_DECREF(exc); 4814 } 4815 } else { 4816 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4817 } 4818 return -1; 4819} 4820 4821PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 4822 Py_ssize_t size, 4823 const char *errors, 4824 Py_ssize_t *consumed) 4825{ 4826 PyUnicodeObject *v = NULL; 4827 int done; 4828 4829 if (consumed) 4830 *consumed = 0; 4831 4832#ifdef NEED_RETRY 4833 retry: 4834 if (size > INT_MAX) 4835 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 4836 else 4837#endif 4838 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 4839 4840 if (done < 0) { 4841 Py_XDECREF(v); 4842 return NULL; 4843 } 4844 4845 if (consumed) 4846 *consumed += done; 4847 4848#ifdef NEED_RETRY 4849 if (size > INT_MAX) { 4850 s += done; 4851 size -= done; 4852 goto retry; 4853 } 4854#endif 4855 4856 return (PyObject *)v; 4857} 4858 4859PyObject *PyUnicode_DecodeMBCS(const char *s, 4860 Py_ssize_t size, 4861 const char *errors) 4862{ 4863 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 4864} 4865 4866/* 4867 * Convert unicode into string object (MBCS). 4868 * Returns 0 if succeed, -1 otherwise. 
4869 */ 4870static int encode_mbcs(PyObject **repr, 4871 const Py_UNICODE *p, /* unicode */ 4872 int size, /* size of unicode */ 4873 const char* errors) 4874{ 4875 BOOL usedDefaultChar = FALSE; 4876 BOOL *pusedDefaultChar; 4877 int mbcssize; 4878 Py_ssize_t n; 4879 PyObject *exc = NULL; 4880 DWORD flags; 4881 4882 assert(size >= 0); 4883 4884 /* check and handle 'errors' arg */ 4885 if (errors==NULL || strcmp(errors, "strict")==0) { 4886 flags = WC_NO_BEST_FIT_CHARS; 4887 pusedDefaultChar = &usedDefaultChar; 4888 } else if (strcmp(errors, "replace")==0) { 4889 flags = 0; 4890 pusedDefaultChar = NULL; 4891 } else { 4892 PyErr_Format(PyExc_ValueError, 4893 "mbcs encoding does not support errors='%s'", 4894 errors); 4895 return -1; 4896 } 4897 4898 /* First get the size of the result */ 4899 if (size > 0) { 4900 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 4901 NULL, pusedDefaultChar); 4902 if (mbcssize == 0) { 4903 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4904 return -1; 4905 } 4906 /* If we used a default char, then we failed! 
*/ 4907 if (pusedDefaultChar && *pusedDefaultChar) 4908 goto mbcs_encode_error; 4909 } else { 4910 mbcssize = 0; 4911 } 4912 4913 if (*repr == NULL) { 4914 /* Create string object */ 4915 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 4916 if (*repr == NULL) 4917 return -1; 4918 n = 0; 4919 } 4920 else { 4921 /* Extend string object */ 4922 n = PyBytes_Size(*repr); 4923 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 4924 return -1; 4925 } 4926 4927 /* Do the conversion */ 4928 if (size > 0) { 4929 char *s = PyBytes_AS_STRING(*repr) + n; 4930 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 4931 NULL, pusedDefaultChar)) { 4932 PyErr_SetFromWindowsErrWithFilename(0, NULL); 4933 return -1; 4934 } 4935 if (pusedDefaultChar && *pusedDefaultChar) 4936 goto mbcs_encode_error; 4937 } 4938 return 0; 4939 4940mbcs_encode_error: 4941 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 4942 Py_XDECREF(exc); 4943 return -1; 4944} 4945 4946PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 4947 Py_ssize_t size, 4948 const char *errors) 4949{ 4950 PyObject *repr = NULL; 4951 int ret; 4952 4953#ifdef NEED_RETRY 4954 retry: 4955 if (size > INT_MAX) 4956 ret = encode_mbcs(&repr, p, INT_MAX, errors); 4957 else 4958#endif 4959 ret = encode_mbcs(&repr, p, (int)size, errors); 4960 4961 if (ret < 0) { 4962 Py_XDECREF(repr); 4963 return NULL; 4964 } 4965 4966#ifdef NEED_RETRY 4967 if (size > INT_MAX) { 4968 p += INT_MAX; 4969 size -= INT_MAX; 4970 goto retry; 4971 } 4972#endif 4973 4974 return repr; 4975} 4976 4977PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 4978{ 4979 if (!PyUnicode_Check(unicode)) { 4980 PyErr_BadArgument(); 4981 return NULL; 4982 } 4983 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 4984 PyUnicode_GET_SIZE(unicode), 4985 NULL); 4986} 4987 4988#undef NEED_RETRY 4989 4990#endif /* MS_WINDOWS */ 4991 4992/* --- Character Mapping Codec -------------------------------------------- */ 4993 4994PyObject 
*PyUnicode_DecodeCharmap(const char *s, 4995 Py_ssize_t size, 4996 PyObject *mapping, 4997 const char *errors) 4998{ 4999 const char *starts = s; 5000 Py_ssize_t startinpos; 5001 Py_ssize_t endinpos; 5002 Py_ssize_t outpos; 5003 const char *e; 5004 PyUnicodeObject *v; 5005 Py_UNICODE *p; 5006 Py_ssize_t extrachars = 0; 5007 PyObject *errorHandler = NULL; 5008 PyObject *exc = NULL; 5009 Py_UNICODE *mapstring = NULL; 5010 Py_ssize_t maplen = 0; 5011 5012 /* Default to Latin-1 */ 5013 if (mapping == NULL) 5014 return PyUnicode_DecodeLatin1(s, size, errors); 5015 5016 v = _PyUnicode_New(size); 5017 if (v == NULL) 5018 goto onError; 5019 if (size == 0) 5020 return (PyObject *)v; 5021 p = PyUnicode_AS_UNICODE(v); 5022 e = s + size; 5023 if (PyUnicode_CheckExact(mapping)) { 5024 mapstring = PyUnicode_AS_UNICODE(mapping); 5025 maplen = PyUnicode_GET_SIZE(mapping); 5026 while (s < e) { 5027 unsigned char ch = *s; 5028 Py_UNICODE x = 0xfffe; /* illegal value */ 5029 5030 if (ch < maplen) 5031 x = mapstring[ch]; 5032 5033 if (x == 0xfffe) { 5034 /* undefined mapping */ 5035 outpos = p-PyUnicode_AS_UNICODE(v); 5036 startinpos = s-starts; 5037 endinpos = startinpos+1; 5038 if (unicode_decode_call_errorhandler( 5039 errors, &errorHandler, 5040 "charmap", "character maps to <undefined>", 5041 &starts, &e, &startinpos, &endinpos, &exc, &s, 5042 &v, &outpos, &p)) { 5043 goto onError; 5044 } 5045 continue; 5046 } 5047 *p++ = x; 5048 ++s; 5049 } 5050 } 5051 else { 5052 while (s < e) { 5053 unsigned char ch = *s; 5054 PyObject *w, *x; 5055 5056 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 5057 w = PyLong_FromLong((long)ch); 5058 if (w == NULL) 5059 goto onError; 5060 x = PyObject_GetItem(mapping, w); 5061 Py_DECREF(w); 5062 if (x == NULL) { 5063 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5064 /* No mapping found means: mapping is undefined. 
*/ 5065 PyErr_Clear(); 5066 x = Py_None; 5067 Py_INCREF(x); 5068 } else 5069 goto onError; 5070 } 5071 5072 /* Apply mapping */ 5073 if (PyLong_Check(x)) { 5074 long value = PyLong_AS_LONG(x); 5075 if (value < 0 || value > 65535) { 5076 PyErr_SetString(PyExc_TypeError, 5077 "character mapping must be in range(65536)"); 5078 Py_DECREF(x); 5079 goto onError; 5080 } 5081 *p++ = (Py_UNICODE)value; 5082 } 5083 else if (x == Py_None) { 5084 /* undefined mapping */ 5085 outpos = p-PyUnicode_AS_UNICODE(v); 5086 startinpos = s-starts; 5087 endinpos = startinpos+1; 5088 if (unicode_decode_call_errorhandler( 5089 errors, &errorHandler, 5090 "charmap", "character maps to <undefined>", 5091 &starts, &e, &startinpos, &endinpos, &exc, &s, 5092 &v, &outpos, &p)) { 5093 Py_DECREF(x); 5094 goto onError; 5095 } 5096 Py_DECREF(x); 5097 continue; 5098 } 5099 else if (PyUnicode_Check(x)) { 5100 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 5101 5102 if (targetsize == 1) 5103 /* 1-1 mapping */ 5104 *p++ = *PyUnicode_AS_UNICODE(x); 5105 5106 else if (targetsize > 1) { 5107 /* 1-n mapping */ 5108 if (targetsize > extrachars) { 5109 /* resize first */ 5110 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 5111 Py_ssize_t needed = (targetsize - extrachars) + \ 5112 (targetsize << 2); 5113 extrachars += needed; 5114 /* XXX overflow detection missing */ 5115 if (_PyUnicode_Resize(&v, 5116 PyUnicode_GET_SIZE(v) + needed) < 0) { 5117 Py_DECREF(x); 5118 goto onError; 5119 } 5120 p = PyUnicode_AS_UNICODE(v) + oldpos; 5121 } 5122 Py_UNICODE_COPY(p, 5123 PyUnicode_AS_UNICODE(x), 5124 targetsize); 5125 p += targetsize; 5126 extrachars -= targetsize; 5127 } 5128 /* 1-0 mapping: skip the character */ 5129 } 5130 else { 5131 /* wrong return value */ 5132 PyErr_SetString(PyExc_TypeError, 5133 "character mapping must return integer, None or str"); 5134 Py_DECREF(x); 5135 goto onError; 5136 } 5137 Py_DECREF(x); 5138 ++s; 5139 } 5140 } 5141 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 5142 if 
(_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5143 goto onError; 5144 Py_XDECREF(errorHandler); 5145 Py_XDECREF(exc); 5146 return (PyObject *)v; 5147 5148 onError: 5149 Py_XDECREF(errorHandler); 5150 Py_XDECREF(exc); 5151 Py_XDECREF(v); 5152 return NULL; 5153} 5154 5155/* Charmap encoding: the lookup table */ 5156 5157struct encoding_map{ 5158 PyObject_HEAD 5159 unsigned char level1[32]; 5160 int count2, count3; 5161 unsigned char level23[1]; 5162}; 5163 5164static PyObject* 5165encoding_map_size(PyObject *obj, PyObject* args) 5166{ 5167 struct encoding_map *map = (struct encoding_map*)obj; 5168 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 5169 128*map->count3); 5170} 5171 5172static PyMethodDef encoding_map_methods[] = { 5173 {"size", encoding_map_size, METH_NOARGS, 5174 PyDoc_STR("Return the size (in bytes) of this object") }, 5175 { 0 } 5176}; 5177 5178static void 5179encoding_map_dealloc(PyObject* o) 5180{ 5181 PyObject_FREE(o); 5182} 5183 5184static PyTypeObject EncodingMapType = { 5185 PyVarObject_HEAD_INIT(NULL, 0) 5186 "EncodingMap", /*tp_name*/ 5187 sizeof(struct encoding_map), /*tp_basicsize*/ 5188 0, /*tp_itemsize*/ 5189 /* methods */ 5190 encoding_map_dealloc, /*tp_dealloc*/ 5191 0, /*tp_print*/ 5192 0, /*tp_getattr*/ 5193 0, /*tp_setattr*/ 5194 0, /*tp_reserved*/ 5195 0, /*tp_repr*/ 5196 0, /*tp_as_number*/ 5197 0, /*tp_as_sequence*/ 5198 0, /*tp_as_mapping*/ 5199 0, /*tp_hash*/ 5200 0, /*tp_call*/ 5201 0, /*tp_str*/ 5202 0, /*tp_getattro*/ 5203 0, /*tp_setattro*/ 5204 0, /*tp_as_buffer*/ 5205 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 5206 0, /*tp_doc*/ 5207 0, /*tp_traverse*/ 5208 0, /*tp_clear*/ 5209 0, /*tp_richcompare*/ 5210 0, /*tp_weaklistoffset*/ 5211 0, /*tp_iter*/ 5212 0, /*tp_iternext*/ 5213 encoding_map_methods, /*tp_methods*/ 5214 0, /*tp_members*/ 5215 0, /*tp_getset*/ 5216 0, /*tp_base*/ 5217 0, /*tp_dict*/ 5218 0, /*tp_descr_get*/ 5219 0, /*tp_descr_set*/ 5220 0, /*tp_dictoffset*/ 5221 0, /*tp_init*/ 5222 0, /*tp_alloc*/ 
5223 0, /*tp_new*/ 5224 0, /*tp_free*/ 5225 0, /*tp_is_gc*/ 5226}; 5227 5228PyObject* 5229PyUnicode_BuildEncodingMap(PyObject* string) 5230{ 5231 Py_UNICODE *decode; 5232 PyObject *result; 5233 struct encoding_map *mresult; 5234 int i; 5235 int need_dict = 0; 5236 unsigned char level1[32]; 5237 unsigned char level2[512]; 5238 unsigned char *mlevel1, *mlevel2, *mlevel3; 5239 int count2 = 0, count3 = 0; 5240 5241 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 5242 PyErr_BadArgument(); 5243 return NULL; 5244 } 5245 decode = PyUnicode_AS_UNICODE(string); 5246 memset(level1, 0xFF, sizeof level1); 5247 memset(level2, 0xFF, sizeof level2); 5248 5249 /* If there isn't a one-to-one mapping of NULL to \0, 5250 or if there are non-BMP characters, we need to use 5251 a mapping dictionary. */ 5252 if (decode[0] != 0) 5253 need_dict = 1; 5254 for (i = 1; i < 256; i++) { 5255 int l1, l2; 5256 if (decode[i] == 0 5257#ifdef Py_UNICODE_WIDE 5258 || decode[i] > 0xFFFF 5259#endif 5260 ) { 5261 need_dict = 1; 5262 break; 5263 } 5264 if (decode[i] == 0xFFFE) 5265 /* unmapped character */ 5266 continue; 5267 l1 = decode[i] >> 11; 5268 l2 = decode[i] >> 7; 5269 if (level1[l1] == 0xFF) 5270 level1[l1] = count2++; 5271 if (level2[l2] == 0xFF) 5272 level2[l2] = count3++; 5273 } 5274 5275 if (count2 >= 0xFF || count3 >= 0xFF) 5276 need_dict = 1; 5277 5278 if (need_dict) { 5279 PyObject *result = PyDict_New(); 5280 PyObject *key, *value; 5281 if (!result) 5282 return NULL; 5283 for (i = 0; i < 256; i++) { 5284 key = value = NULL; 5285 key = PyLong_FromLong(decode[i]); 5286 value = PyLong_FromLong(i); 5287 if (!key || !value) 5288 goto failed1; 5289 if (PyDict_SetItem(result, key, value) == -1) 5290 goto failed1; 5291 Py_DECREF(key); 5292 Py_DECREF(value); 5293 } 5294 return result; 5295 failed1: 5296 Py_XDECREF(key); 5297 Py_XDECREF(value); 5298 Py_DECREF(result); 5299 return NULL; 5300 } 5301 5302 /* Create a three-level trie */ 5303 result = PyObject_MALLOC(sizeof(struct 
encoding_map) + 5304 16*count2 + 128*count3 - 1); 5305 if (!result) 5306 return PyErr_NoMemory(); 5307 PyObject_Init(result, &EncodingMapType); 5308 mresult = (struct encoding_map*)result; 5309 mresult->count2 = count2; 5310 mresult->count3 = count3; 5311 mlevel1 = mresult->level1; 5312 mlevel2 = mresult->level23; 5313 mlevel3 = mresult->level23 + 16*count2; 5314 memcpy(mlevel1, level1, 32); 5315 memset(mlevel2, 0xFF, 16*count2); 5316 memset(mlevel3, 0, 128*count3); 5317 count3 = 0; 5318 for (i = 1; i < 256; i++) { 5319 int o1, o2, o3, i2, i3; 5320 if (decode[i] == 0xFFFE) 5321 /* unmapped character */ 5322 continue; 5323 o1 = decode[i]>>11; 5324 o2 = (decode[i]>>7) & 0xF; 5325 i2 = 16*mlevel1[o1] + o2; 5326 if (mlevel2[i2] == 0xFF) 5327 mlevel2[i2] = count3++; 5328 o3 = decode[i] & 0x7F; 5329 i3 = 128*mlevel2[i2] + o3; 5330 mlevel3[i3] = i; 5331 } 5332 return result; 5333} 5334 5335static int 5336encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 5337{ 5338 struct encoding_map *map = (struct encoding_map*)mapping; 5339 int l1 = c>>11; 5340 int l2 = (c>>7) & 0xF; 5341 int l3 = c & 0x7F; 5342 int i; 5343 5344#ifdef Py_UNICODE_WIDE 5345 if (c > 0xFFFF) { 5346 return -1; 5347 } 5348#endif 5349 if (c == 0) 5350 return 0; 5351 /* level 1*/ 5352 i = map->level1[l1]; 5353 if (i == 0xFF) { 5354 return -1; 5355 } 5356 /* level 2*/ 5357 i = map->level23[16*i+l2]; 5358 if (i == 0xFF) { 5359 return -1; 5360 } 5361 /* level 3 */ 5362 i = map->level23[16*map->count2 + 128*i + l3]; 5363 if (i == 0) { 5364 return -1; 5365 } 5366 return i; 5367} 5368 5369/* Lookup the character ch in the mapping. If the character 5370 can't be found, Py_None is returned (or NULL, if another 5371 error occurred). 
*/ 5372static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 5373{ 5374 PyObject *w = PyLong_FromLong((long)c); 5375 PyObject *x; 5376 5377 if (w == NULL) 5378 return NULL; 5379 x = PyObject_GetItem(mapping, w); 5380 Py_DECREF(w); 5381 if (x == NULL) { 5382 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5383 /* No mapping found means: mapping is undefined. */ 5384 PyErr_Clear(); 5385 x = Py_None; 5386 Py_INCREF(x); 5387 return x; 5388 } else 5389 return NULL; 5390 } 5391 else if (x == Py_None) 5392 return x; 5393 else if (PyLong_Check(x)) { 5394 long value = PyLong_AS_LONG(x); 5395 if (value < 0 || value > 255) { 5396 PyErr_SetString(PyExc_TypeError, 5397 "character mapping must be in range(256)"); 5398 Py_DECREF(x); 5399 return NULL; 5400 } 5401 return x; 5402 } 5403 else if (PyBytes_Check(x)) 5404 return x; 5405 else { 5406 /* wrong return value */ 5407 PyErr_Format(PyExc_TypeError, 5408 "character mapping must return integer, bytes or None, not %.400s", 5409 x->ob_type->tp_name); 5410 Py_DECREF(x); 5411 return NULL; 5412 } 5413} 5414 5415static int 5416charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 5417{ 5418 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5419 /* exponentially overallocate to minimize reallocations */ 5420 if (requiredsize < 2*outsize) 5421 requiredsize = 2*outsize; 5422 if (_PyBytes_Resize(outobj, requiredsize)) 5423 return -1; 5424 return 0; 5425} 5426 5427typedef enum charmapencode_result { 5428 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 5429}charmapencode_result; 5430/* lookup the character, put the result in the output string and adjust 5431 various state variables. Resize the output bytes object if not enough 5432 space is available. Return a new reference to the object that 5433 was put in the output buffer, or Py_None, if the mapping was undefined 5434 (in which case no character was written) or NULL, if a 5435 reallocation error occurred. 
The caller must decref the result */ 5436static 5437charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 5438 PyObject **outobj, Py_ssize_t *outpos) 5439{ 5440 PyObject *rep; 5441 char *outstart; 5442 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 5443 5444 if (Py_TYPE(mapping) == &EncodingMapType) { 5445 int res = encoding_map_lookup(c, mapping); 5446 Py_ssize_t requiredsize = *outpos+1; 5447 if (res == -1) 5448 return enc_FAILED; 5449 if (outsize<requiredsize) 5450 if (charmapencode_resize(outobj, outpos, requiredsize)) 5451 return enc_EXCEPTION; 5452 outstart = PyBytes_AS_STRING(*outobj); 5453 outstart[(*outpos)++] = (char)res; 5454 return enc_SUCCESS; 5455 } 5456 5457 rep = charmapencode_lookup(c, mapping); 5458 if (rep==NULL) 5459 return enc_EXCEPTION; 5460 else if (rep==Py_None) { 5461 Py_DECREF(rep); 5462 return enc_FAILED; 5463 } else { 5464 if (PyLong_Check(rep)) { 5465 Py_ssize_t requiredsize = *outpos+1; 5466 if (outsize<requiredsize) 5467 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5468 Py_DECREF(rep); 5469 return enc_EXCEPTION; 5470 } 5471 outstart = PyBytes_AS_STRING(*outobj); 5472 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 5473 } 5474 else { 5475 const char *repchars = PyBytes_AS_STRING(rep); 5476 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 5477 Py_ssize_t requiredsize = *outpos+repsize; 5478 if (outsize<requiredsize) 5479 if (charmapencode_resize(outobj, outpos, requiredsize)) { 5480 Py_DECREF(rep); 5481 return enc_EXCEPTION; 5482 } 5483 outstart = PyBytes_AS_STRING(*outobj); 5484 memcpy(outstart + *outpos, repchars, repsize); 5485 *outpos += repsize; 5486 } 5487 } 5488 Py_DECREF(rep); 5489 return enc_SUCCESS; 5490} 5491 5492/* handle an error in PyUnicode_EncodeCharmap 5493 Return 0 on success, -1 on error */ 5494static 5495int charmap_encoding_error( 5496 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 5497 PyObject **exceptionObject, 5498 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 5499 PyObject **res, Py_ssize_t *respos) 5500{ 5501 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5502 Py_ssize_t repsize; 5503 Py_ssize_t newpos; 5504 Py_UNICODE *uni2; 5505 /* startpos for collecting unencodable chars */ 5506 Py_ssize_t collstartpos = *inpos; 5507 Py_ssize_t collendpos = *inpos+1; 5508 Py_ssize_t collpos; 5509 char *encoding = "charmap"; 5510 char *reason = "character maps to <undefined>"; 5511 charmapencode_result x; 5512 5513 /* find all unencodable characters */ 5514 while (collendpos < size) { 5515 PyObject *rep; 5516 if (Py_TYPE(mapping) == &EncodingMapType) { 5517 int res = encoding_map_lookup(p[collendpos], mapping); 5518 if (res != -1) 5519 break; 5520 ++collendpos; 5521 continue; 5522 } 5523 5524 rep = charmapencode_lookup(p[collendpos], mapping); 5525 if (rep==NULL) 5526 return -1; 5527 else if (rep!=Py_None) { 5528 Py_DECREF(rep); 5529 break; 5530 } 5531 Py_DECREF(rep); 5532 ++collendpos; 5533 } 5534 /* cache callback name lookup 5535 * (if not done yet, i.e. 
it's the first error) */ 5536 if (*known_errorHandler==-1) { 5537 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5538 *known_errorHandler = 1; 5539 else if (!strcmp(errors, "replace")) 5540 *known_errorHandler = 2; 5541 else if (!strcmp(errors, "ignore")) 5542 *known_errorHandler = 3; 5543 else if (!strcmp(errors, "xmlcharrefreplace")) 5544 *known_errorHandler = 4; 5545 else 5546 *known_errorHandler = 0; 5547 } 5548 switch (*known_errorHandler) { 5549 case 1: /* strict */ 5550 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5551 return -1; 5552 case 2: /* replace */ 5553 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 5554 x = charmapencode_output('?', mapping, res, respos); 5555 if (x==enc_EXCEPTION) { 5556 return -1; 5557 } 5558 else if (x==enc_FAILED) { 5559 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5560 return -1; 5561 } 5562 } 5563 /* fall through */ 5564 case 3: /* ignore */ 5565 *inpos = collendpos; 5566 break; 5567 case 4: /* xmlcharrefreplace */ 5568 /* generate replacement (temporarily (mis)uses p) */ 5569 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 5570 char buffer[2+29+1+1]; 5571 char *cp; 5572 sprintf(buffer, "&#%d;", (int)p[collpos]); 5573 for (cp = buffer; *cp; ++cp) { 5574 x = charmapencode_output(*cp, mapping, res, respos); 5575 if (x==enc_EXCEPTION) 5576 return -1; 5577 else if (x==enc_FAILED) { 5578 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5579 return -1; 5580 } 5581 } 5582 } 5583 *inpos = collendpos; 5584 break; 5585 default: 5586 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 5587 encoding, reason, p, size, exceptionObject, 5588 collstartpos, collendpos, &newpos); 5589 if (repunicode == NULL) 5590 return -1; 5591 if (PyBytes_Check(repunicode)) { 5592 /* Directly copy bytes result to output. 
*/ 5593 Py_ssize_t outsize = PyBytes_Size(*res); 5594 Py_ssize_t requiredsize; 5595 repsize = PyBytes_Size(repunicode); 5596 requiredsize = *respos + repsize; 5597 if (requiredsize > outsize) 5598 /* Make room for all additional bytes. */ 5599 if (charmapencode_resize(res, respos, requiredsize)) { 5600 Py_DECREF(repunicode); 5601 return -1; 5602 } 5603 memcpy(PyBytes_AsString(*res) + *respos, 5604 PyBytes_AsString(repunicode), repsize); 5605 *respos += repsize; 5606 *inpos = newpos; 5607 Py_DECREF(repunicode); 5608 break; 5609 } 5610 /* generate replacement */ 5611 repsize = PyUnicode_GET_SIZE(repunicode); 5612 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 5613 x = charmapencode_output(*uni2, mapping, res, respos); 5614 if (x==enc_EXCEPTION) { 5615 return -1; 5616 } 5617 else if (x==enc_FAILED) { 5618 Py_DECREF(repunicode); 5619 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 5620 return -1; 5621 } 5622 } 5623 *inpos = newpos; 5624 Py_DECREF(repunicode); 5625 } 5626 return 0; 5627} 5628 5629PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 5630 Py_ssize_t size, 5631 PyObject *mapping, 5632 const char *errors) 5633{ 5634 /* output object */ 5635 PyObject *res = NULL; 5636 /* current input position */ 5637 Py_ssize_t inpos = 0; 5638 /* current output position */ 5639 Py_ssize_t respos = 0; 5640 PyObject *errorHandler = NULL; 5641 PyObject *exc = NULL; 5642 /* the following variable is used for caching string comparisons 5643 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5644 * 3=ignore, 4=xmlcharrefreplace */ 5645 int known_errorHandler = -1; 5646 5647 /* Default to Latin-1 */ 5648 if (mapping == NULL) 5649 return PyUnicode_EncodeLatin1(p, size, errors); 5650 5651 /* allocate enough for a simple encoding without 5652 replacements, if we need more, we'll resize */ 5653 res = PyBytes_FromStringAndSize(NULL, size); 5654 if (res == NULL) 5655 goto onError; 5656 if (size == 0) 5657 return 
res; 5658 5659 while (inpos<size) { 5660 /* try to encode it */ 5661 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 5662 if (x==enc_EXCEPTION) /* error */ 5663 goto onError; 5664 if (x==enc_FAILED) { /* unencodable character */ 5665 if (charmap_encoding_error(p, size, &inpos, mapping, 5666 &exc, 5667 &known_errorHandler, &errorHandler, errors, 5668 &res, &respos)) { 5669 goto onError; 5670 } 5671 } 5672 else 5673 /* done with this character => adjust input position */ 5674 ++inpos; 5675 } 5676 5677 /* Resize if we allocated to much */ 5678 if (respos<PyBytes_GET_SIZE(res)) 5679 if (_PyBytes_Resize(&res, respos) < 0) 5680 goto onError; 5681 5682 Py_XDECREF(exc); 5683 Py_XDECREF(errorHandler); 5684 return res; 5685 5686 onError: 5687 Py_XDECREF(res); 5688 Py_XDECREF(exc); 5689 Py_XDECREF(errorHandler); 5690 return NULL; 5691} 5692 5693PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 5694 PyObject *mapping) 5695{ 5696 if (!PyUnicode_Check(unicode) || mapping == NULL) { 5697 PyErr_BadArgument(); 5698 return NULL; 5699 } 5700 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 5701 PyUnicode_GET_SIZE(unicode), 5702 mapping, 5703 NULL); 5704} 5705 5706/* create or adjust a UnicodeTranslateError */ 5707static void make_translate_exception(PyObject **exceptionObject, 5708 const Py_UNICODE *unicode, Py_ssize_t size, 5709 Py_ssize_t startpos, Py_ssize_t endpos, 5710 const char *reason) 5711{ 5712 if (*exceptionObject == NULL) { 5713 *exceptionObject = PyUnicodeTranslateError_Create( 5714 unicode, size, startpos, endpos, reason); 5715 } 5716 else { 5717 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 5718 goto onError; 5719 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 5720 goto onError; 5721 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 5722 goto onError; 5723 return; 5724 onError: 5725 Py_DECREF(*exceptionObject); 5726 *exceptionObject = NULL; 5727 } 5728} 5729 5730/* 
raises a UnicodeTranslateError */ 5731static void raise_translate_exception(PyObject **exceptionObject, 5732 const Py_UNICODE *unicode, Py_ssize_t size, 5733 Py_ssize_t startpos, Py_ssize_t endpos, 5734 const char *reason) 5735{ 5736 make_translate_exception(exceptionObject, 5737 unicode, size, startpos, endpos, reason); 5738 if (*exceptionObject != NULL) 5739 PyCodec_StrictErrors(*exceptionObject); 5740} 5741 5742/* error handling callback helper: 5743 build arguments, call the callback and check the arguments, 5744 put the result into newpos and return the replacement string, which 5745 has to be freed by the caller */ 5746static PyObject *unicode_translate_call_errorhandler(const char *errors, 5747 PyObject **errorHandler, 5748 const char *reason, 5749 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 5750 Py_ssize_t startpos, Py_ssize_t endpos, 5751 Py_ssize_t *newpos) 5752{ 5753 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 5754 5755 Py_ssize_t i_newpos; 5756 PyObject *restuple; 5757 PyObject *resunicode; 5758 5759 if (*errorHandler == NULL) { 5760 *errorHandler = PyCodec_LookupError(errors); 5761 if (*errorHandler == NULL) 5762 return NULL; 5763 } 5764 5765 make_translate_exception(exceptionObject, 5766 unicode, size, startpos, endpos, reason); 5767 if (*exceptionObject == NULL) 5768 return NULL; 5769 5770 restuple = PyObject_CallFunctionObjArgs( 5771 *errorHandler, *exceptionObject, NULL); 5772 if (restuple == NULL) 5773 return NULL; 5774 if (!PyTuple_Check(restuple)) { 5775 PyErr_SetString(PyExc_TypeError, &argparse[4]); 5776 Py_DECREF(restuple); 5777 return NULL; 5778 } 5779 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 5780 &resunicode, &i_newpos)) { 5781 Py_DECREF(restuple); 5782 return NULL; 5783 } 5784 if (i_newpos<0) 5785 *newpos = size+i_newpos; 5786 else 5787 *newpos = i_newpos; 5788 if (*newpos<0 || *newpos>size) { 5789 PyErr_Format(PyExc_IndexError, "position %zd from error 
handler out of bounds", *newpos); 5790 Py_DECREF(restuple); 5791 return NULL; 5792 } 5793 Py_INCREF(resunicode); 5794 Py_DECREF(restuple); 5795 return resunicode; 5796} 5797 5798/* Lookup the character ch in the mapping and put the result in result, 5799 which must be decrefed by the caller. 5800 Return 0 on success, -1 on error */ 5801static 5802int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 5803{ 5804 PyObject *w = PyLong_FromLong((long)c); 5805 PyObject *x; 5806 5807 if (w == NULL) 5808 return -1; 5809 x = PyObject_GetItem(mapping, w); 5810 Py_DECREF(w); 5811 if (x == NULL) { 5812 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 5813 /* No mapping found means: use 1:1 mapping. */ 5814 PyErr_Clear(); 5815 *result = NULL; 5816 return 0; 5817 } else 5818 return -1; 5819 } 5820 else if (x == Py_None) { 5821 *result = x; 5822 return 0; 5823 } 5824 else if (PyLong_Check(x)) { 5825 long value = PyLong_AS_LONG(x); 5826 long max = PyUnicode_GetMax(); 5827 if (value < 0 || value > max) { 5828 PyErr_Format(PyExc_TypeError, 5829 "character mapping must be in range(0x%x)", max+1); 5830 Py_DECREF(x); 5831 return -1; 5832 } 5833 *result = x; 5834 return 0; 5835 } 5836 else if (PyUnicode_Check(x)) { 5837 *result = x; 5838 return 0; 5839 } 5840 else { 5841 /* wrong return value */ 5842 PyErr_SetString(PyExc_TypeError, 5843 "character mapping must return integer, None or str"); 5844 Py_DECREF(x); 5845 return -1; 5846 } 5847} 5848/* ensure that *outobj is at least requiredsize characters long, 5849 if not reallocate and adjust various state variables. 
5850 Return 0 on success, -1 on error */ 5851static 5852int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 5853 Py_ssize_t requiredsize) 5854{ 5855 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 5856 if (requiredsize > oldsize) { 5857 /* remember old output position */ 5858 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 5859 /* exponentially overallocate to minimize reallocations */ 5860 if (requiredsize < 2 * oldsize) 5861 requiredsize = 2 * oldsize; 5862 if (PyUnicode_Resize(outobj, requiredsize) < 0) 5863 return -1; 5864 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 5865 } 5866 return 0; 5867} 5868/* lookup the character, put the result in the output string and adjust 5869 various state variables. Return a new reference to the object that 5870 was put in the output buffer in *result, or Py_None, if the mapping was 5871 undefined (in which case no character was written). 5872 The called must decref result. 5873 Return 0 on success, -1 on error. */ 5874static 5875int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 5876 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 5877 PyObject **res) 5878{ 5879 if (charmaptranslate_lookup(*curinp, mapping, res)) 5880 return -1; 5881 if (*res==NULL) { 5882 /* not found => default to 1:1 mapping */ 5883 *(*outp)++ = *curinp; 5884 } 5885 else if (*res==Py_None) 5886 ; 5887 else if (PyLong_Check(*res)) { 5888 /* no overflow check, because we know that the space is enough */ 5889 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res); 5890 } 5891 else if (PyUnicode_Check(*res)) { 5892 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 5893 if (repsize==1) { 5894 /* no overflow check, because we know that the space is enough */ 5895 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 5896 } 5897 else if (repsize!=0) { 5898 /* more than one character */ 5899 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 5900 (insize - (curinp-startinp)) + 5901 
repsize - 1; 5902 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 5903 return -1; 5904 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 5905 *outp += repsize; 5906 } 5907 } 5908 else 5909 return -1; 5910 return 0; 5911} 5912 5913PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 5914 Py_ssize_t size, 5915 PyObject *mapping, 5916 const char *errors) 5917{ 5918 /* output object */ 5919 PyObject *res = NULL; 5920 /* pointers to the beginning and end+1 of input */ 5921 const Py_UNICODE *startp = p; 5922 const Py_UNICODE *endp = p + size; 5923 /* pointer into the output */ 5924 Py_UNICODE *str; 5925 /* current output position */ 5926 Py_ssize_t respos = 0; 5927 char *reason = "character maps to <undefined>"; 5928 PyObject *errorHandler = NULL; 5929 PyObject *exc = NULL; 5930 /* the following variable is used for caching string comparisons 5931 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 5932 * 3=ignore, 4=xmlcharrefreplace */ 5933 int known_errorHandler = -1; 5934 5935 if (mapping == NULL) { 5936 PyErr_BadArgument(); 5937 return NULL; 5938 } 5939 5940 /* allocate enough for a simple 1:1 translation without 5941 replacements, if we need more, we'll resize */ 5942 res = PyUnicode_FromUnicode(NULL, size); 5943 if (res == NULL) 5944 goto onError; 5945 if (size == 0) 5946 return res; 5947 str = PyUnicode_AS_UNICODE(res); 5948 5949 while (p<endp) { 5950 /* try to encode it */ 5951 PyObject *x = NULL; 5952 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 5953 Py_XDECREF(x); 5954 goto onError; 5955 } 5956 Py_XDECREF(x); 5957 if (x!=Py_None) /* it worked => adjust input pointer */ 5958 ++p; 5959 else { /* untranslatable character */ 5960 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 5961 Py_ssize_t repsize; 5962 Py_ssize_t newpos; 5963 Py_UNICODE *uni2; 5964 /* startpos for collecting untranslatable chars */ 5965 const Py_UNICODE *collstart = p; 5966 const Py_UNICODE *collend = p+1; 
5967 const Py_UNICODE *coll; 5968 5969 /* find all untranslatable characters */ 5970 while (collend < endp) { 5971 if (charmaptranslate_lookup(*collend, mapping, &x)) 5972 goto onError; 5973 Py_XDECREF(x); 5974 if (x!=Py_None) 5975 break; 5976 ++collend; 5977 } 5978 /* cache callback name lookup 5979 * (if not done yet, i.e. it's the first error) */ 5980 if (known_errorHandler==-1) { 5981 if ((errors==NULL) || (!strcmp(errors, "strict"))) 5982 known_errorHandler = 1; 5983 else if (!strcmp(errors, "replace")) 5984 known_errorHandler = 2; 5985 else if (!strcmp(errors, "ignore")) 5986 known_errorHandler = 3; 5987 else if (!strcmp(errors, "xmlcharrefreplace")) 5988 known_errorHandler = 4; 5989 else 5990 known_errorHandler = 0; 5991 } 5992 switch (known_errorHandler) { 5993 case 1: /* strict */ 5994 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 5995 goto onError; 5996 case 2: /* replace */ 5997 /* No need to check for space, this is a 1:1 replacement */ 5998 for (coll = collstart; coll<collend; ++coll) 5999 *str++ = '?'; 6000 /* fall through */ 6001 case 3: /* ignore */ 6002 p = collend; 6003 break; 6004 case 4: /* xmlcharrefreplace */ 6005 /* generate replacement (temporarily (mis)uses p) */ 6006 for (p = collstart; p < collend; ++p) { 6007 char buffer[2+29+1+1]; 6008 char *cp; 6009 sprintf(buffer, "&#%d;", (int)*p); 6010 if (charmaptranslate_makespace(&res, &str, 6011 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 6012 goto onError; 6013 for (cp = buffer; *cp; ++cp) 6014 *str++ = *cp; 6015 } 6016 p = collend; 6017 break; 6018 default: 6019 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 6020 reason, startp, size, &exc, 6021 collstart-startp, collend-startp, &newpos); 6022 if (repunicode == NULL) 6023 goto onError; 6024 /* generate replacement */ 6025 repsize = PyUnicode_GET_SIZE(repunicode); 6026 if (charmaptranslate_makespace(&res, &str, 6027 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 6028 Py_DECREF(repunicode); 6029 goto onError; 6030 } 6031 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 6032 *str++ = *uni2; 6033 p = startp + newpos; 6034 Py_DECREF(repunicode); 6035 } 6036 } 6037 } 6038 /* Resize if we allocated to much */ 6039 respos = str-PyUnicode_AS_UNICODE(res); 6040 if (respos<PyUnicode_GET_SIZE(res)) { 6041 if (PyUnicode_Resize(&res, respos) < 0) 6042 goto onError; 6043 } 6044 Py_XDECREF(exc); 6045 Py_XDECREF(errorHandler); 6046 return res; 6047 6048 onError: 6049 Py_XDECREF(res); 6050 Py_XDECREF(exc); 6051 Py_XDECREF(errorHandler); 6052 return NULL; 6053} 6054 6055PyObject *PyUnicode_Translate(PyObject *str, 6056 PyObject *mapping, 6057 const char *errors) 6058{ 6059 PyObject *result; 6060 6061 str = PyUnicode_FromObject(str); 6062 if (str == NULL) 6063 goto onError; 6064 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 6065 PyUnicode_GET_SIZE(str), 6066 mapping, 6067 errors); 6068 Py_DECREF(str); 6069 return result; 6070 6071 onError: 6072 Py_XDECREF(str); 6073 return NULL; 6074} 6075 6076/* --- Decimal Encoder ---------------------------------------------------- */ 6077 6078int PyUnicode_EncodeDecimal(Py_UNICODE *s, 6079 Py_ssize_t length, 6080 char *output, 6081 const char *errors) 6082{ 6083 Py_UNICODE *p, *end; 6084 PyObject *errorHandler = NULL; 6085 PyObject *exc = NULL; 6086 const char *encoding = "decimal"; 6087 const char *reason = "invalid decimal Unicode string"; 6088 /* the following variable is used for caching string comparisons 6089 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6090 int known_errorHandler = -1; 6091 6092 if (output == NULL) { 6093 PyErr_BadArgument(); 6094 return -1; 6095 } 6096 6097 p = s; 6098 end = s + length; 6099 while (p < end) { 6100 register Py_UNICODE ch = *p; 6101 int decimal; 6102 PyObject *repunicode; 6103 Py_ssize_t repsize; 6104 Py_ssize_t newpos; 6105 
Py_UNICODE *uni2; 6106 Py_UNICODE *collstart; 6107 Py_UNICODE *collend; 6108 6109 if (Py_UNICODE_ISSPACE(ch)) { 6110 *output++ = ' '; 6111 ++p; 6112 continue; 6113 } 6114 decimal = Py_UNICODE_TODECIMAL(ch); 6115 if (decimal >= 0) { 6116 *output++ = '0' + decimal; 6117 ++p; 6118 continue; 6119 } 6120 if (0 < ch && ch < 256) { 6121 *output++ = (char)ch; 6122 ++p; 6123 continue; 6124 } 6125 /* All other characters are considered unencodable */ 6126 collstart = p; 6127 collend = p+1; 6128 while (collend < end) { 6129 if ((0 < *collend && *collend < 256) || 6130 !Py_UNICODE_ISSPACE(*collend) || 6131 Py_UNICODE_TODECIMAL(*collend)) 6132 break; 6133 } 6134 /* cache callback name lookup 6135 * (if not done yet, i.e. it's the first error) */ 6136 if (known_errorHandler==-1) { 6137 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6138 known_errorHandler = 1; 6139 else if (!strcmp(errors, "replace")) 6140 known_errorHandler = 2; 6141 else if (!strcmp(errors, "ignore")) 6142 known_errorHandler = 3; 6143 else if (!strcmp(errors, "xmlcharrefreplace")) 6144 known_errorHandler = 4; 6145 else 6146 known_errorHandler = 0; 6147 } 6148 switch (known_errorHandler) { 6149 case 1: /* strict */ 6150 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 6151 goto onError; 6152 case 2: /* replace */ 6153 for (p = collstart; p < collend; ++p) 6154 *output++ = '?'; 6155 /* fall through */ 6156 case 3: /* ignore */ 6157 p = collend; 6158 break; 6159 case 4: /* xmlcharrefreplace */ 6160 /* generate replacement (temporarily (mis)uses p) */ 6161 for (p = collstart; p < collend; ++p) 6162 output += sprintf(output, "&#%d;", (int)*p); 6163 p = collend; 6164 break; 6165 default: 6166 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6167 encoding, reason, s, length, &exc, 6168 collstart-s, collend-s, &newpos); 6169 if (repunicode == NULL) 6170 goto onError; 6171 if (!PyUnicode_Check(repunicode)) { 6172 /* Byte results not supported, since they have no 
decimal property. */ 6173 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 6174 Py_DECREF(repunicode); 6175 goto onError; 6176 } 6177 /* generate replacement */ 6178 repsize = PyUnicode_GET_SIZE(repunicode); 6179 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 6180 Py_UNICODE ch = *uni2; 6181 if (Py_UNICODE_ISSPACE(ch)) 6182 *output++ = ' '; 6183 else { 6184 decimal = Py_UNICODE_TODECIMAL(ch); 6185 if (decimal >= 0) 6186 *output++ = '0' + decimal; 6187 else if (0 < ch && ch < 256) 6188 *output++ = (char)ch; 6189 else { 6190 Py_DECREF(repunicode); 6191 raise_encode_exception(&exc, encoding, 6192 s, length, collstart-s, collend-s, reason); 6193 goto onError; 6194 } 6195 } 6196 } 6197 p = s + newpos; 6198 Py_DECREF(repunicode); 6199 } 6200 } 6201 /* 0-terminate the output string */ 6202 *output++ = '\0'; 6203 Py_XDECREF(exc); 6204 Py_XDECREF(errorHandler); 6205 return 0; 6206 6207 onError: 6208 Py_XDECREF(exc); 6209 Py_XDECREF(errorHandler); 6210 return -1; 6211} 6212 6213/* --- Helpers ------------------------------------------------------------ */ 6214 6215#include "stringlib/unicodedefs.h" 6216#include "stringlib/fastsearch.h" 6217 6218#include "stringlib/count.h" 6219#include "stringlib/find.h" 6220#include "stringlib/partition.h" 6221#include "stringlib/split.h" 6222 6223#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping 6224#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale 6225#include "stringlib/localeutil.h" 6226 6227/* helper macro to fixup start/end slice values */ 6228#define ADJUST_INDICES(start, end, len) \ 6229 if (end > len) \ 6230 end = len; \ 6231 else if (end < 0) { \ 6232 end += len; \ 6233 if (end < 0) \ 6234 end = 0; \ 6235 } \ 6236 if (start < 0) { \ 6237 start += len; \ 6238 if (start < 0) \ 6239 start = 0; \ 6240 } 6241 6242Py_ssize_t PyUnicode_Count(PyObject *str, 6243 PyObject *substr, 6244 Py_ssize_t start, 6245 Py_ssize_t end) 6246{ 6247 
Py_ssize_t result; 6248 PyUnicodeObject* str_obj; 6249 PyUnicodeObject* sub_obj; 6250 6251 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 6252 if (!str_obj) 6253 return -1; 6254 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 6255 if (!sub_obj) { 6256 Py_DECREF(str_obj); 6257 return -1; 6258 } 6259 6260 ADJUST_INDICES(start, end, str_obj->length); 6261 result = stringlib_count( 6262 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 6263 PY_SSIZE_T_MAX 6264 ); 6265 6266 Py_DECREF(sub_obj); 6267 Py_DECREF(str_obj); 6268 6269 return result; 6270} 6271 6272Py_ssize_t PyUnicode_Find(PyObject *str, 6273 PyObject *sub, 6274 Py_ssize_t start, 6275 Py_ssize_t end, 6276 int direction) 6277{ 6278 Py_ssize_t result; 6279 6280 str = PyUnicode_FromObject(str); 6281 if (!str) 6282 return -2; 6283 sub = PyUnicode_FromObject(sub); 6284 if (!sub) { 6285 Py_DECREF(str); 6286 return -2; 6287 } 6288 6289 if (direction > 0) 6290 result = stringlib_find_slice( 6291 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6292 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6293 start, end 6294 ); 6295 else 6296 result = stringlib_rfind_slice( 6297 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 6298 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 6299 start, end 6300 ); 6301 6302 Py_DECREF(str); 6303 Py_DECREF(sub); 6304 6305 return result; 6306} 6307 6308static 6309int tailmatch(PyUnicodeObject *self, 6310 PyUnicodeObject *substring, 6311 Py_ssize_t start, 6312 Py_ssize_t end, 6313 int direction) 6314{ 6315 if (substring->length == 0) 6316 return 1; 6317 6318 ADJUST_INDICES(start, end, self->length); 6319 end -= substring->length; 6320 if (end < start) 6321 return 0; 6322 6323 if (direction > 0) { 6324 if (Py_UNICODE_MATCH(self, end, substring)) 6325 return 1; 6326 } else { 6327 if (Py_UNICODE_MATCH(self, start, substring)) 6328 return 1; 6329 } 6330 6331 return 0; 6332} 6333 6334Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 6335 PyObject 
*substr, 6336 Py_ssize_t start, 6337 Py_ssize_t end, 6338 int direction) 6339{ 6340 Py_ssize_t result; 6341 6342 str = PyUnicode_FromObject(str); 6343 if (str == NULL) 6344 return -1; 6345 substr = PyUnicode_FromObject(substr); 6346 if (substr == NULL) { 6347 Py_DECREF(str); 6348 return -1; 6349 } 6350 6351 result = tailmatch((PyUnicodeObject *)str, 6352 (PyUnicodeObject *)substr, 6353 start, end, direction); 6354 Py_DECREF(str); 6355 Py_DECREF(substr); 6356 return result; 6357} 6358 6359/* Apply fixfct filter to the Unicode object self and return a 6360 reference to the modified object */ 6361 6362static 6363PyObject *fixup(PyUnicodeObject *self, 6364 int (*fixfct)(PyUnicodeObject *s)) 6365{ 6366 6367 PyUnicodeObject *u; 6368 6369 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6370 if (u == NULL) 6371 return NULL; 6372 6373 Py_UNICODE_COPY(u->str, self->str, self->length); 6374 6375 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 6376 /* fixfct should return TRUE if it modified the buffer. 
If 6377 FALSE, return a reference to the original buffer instead 6378 (to save space, not time) */ 6379 Py_INCREF(self); 6380 Py_DECREF(u); 6381 return (PyObject*) self; 6382 } 6383 return (PyObject*) u; 6384} 6385 6386static 6387int fixupper(PyUnicodeObject *self) 6388{ 6389 Py_ssize_t len = self->length; 6390 Py_UNICODE *s = self->str; 6391 int status = 0; 6392 6393 while (len-- > 0) { 6394 register Py_UNICODE ch; 6395 6396 ch = Py_UNICODE_TOUPPER(*s); 6397 if (ch != *s) { 6398 status = 1; 6399 *s = ch; 6400 } 6401 s++; 6402 } 6403 6404 return status; 6405} 6406 6407static 6408int fixlower(PyUnicodeObject *self) 6409{ 6410 Py_ssize_t len = self->length; 6411 Py_UNICODE *s = self->str; 6412 int status = 0; 6413 6414 while (len-- > 0) { 6415 register Py_UNICODE ch; 6416 6417 ch = Py_UNICODE_TOLOWER(*s); 6418 if (ch != *s) { 6419 status = 1; 6420 *s = ch; 6421 } 6422 s++; 6423 } 6424 6425 return status; 6426} 6427 6428static 6429int fixswapcase(PyUnicodeObject *self) 6430{ 6431 Py_ssize_t len = self->length; 6432 Py_UNICODE *s = self->str; 6433 int status = 0; 6434 6435 while (len-- > 0) { 6436 if (Py_UNICODE_ISUPPER(*s)) { 6437 *s = Py_UNICODE_TOLOWER(*s); 6438 status = 1; 6439 } else if (Py_UNICODE_ISLOWER(*s)) { 6440 *s = Py_UNICODE_TOUPPER(*s); 6441 status = 1; 6442 } 6443 s++; 6444 } 6445 6446 return status; 6447} 6448 6449static 6450int fixcapitalize(PyUnicodeObject *self) 6451{ 6452 Py_ssize_t len = self->length; 6453 Py_UNICODE *s = self->str; 6454 int status = 0; 6455 6456 if (len == 0) 6457 return 0; 6458 if (Py_UNICODE_ISLOWER(*s)) { 6459 *s = Py_UNICODE_TOUPPER(*s); 6460 status = 1; 6461 } 6462 s++; 6463 while (--len > 0) { 6464 if (Py_UNICODE_ISUPPER(*s)) { 6465 *s = Py_UNICODE_TOLOWER(*s); 6466 status = 1; 6467 } 6468 s++; 6469 } 6470 return status; 6471} 6472 6473static 6474int fixtitle(PyUnicodeObject *self) 6475{ 6476 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6477 register Py_UNICODE *e; 6478 int previous_is_cased; 6479 6480 /* Shortcut 
for single character strings */ 6481 if (PyUnicode_GET_SIZE(self) == 1) { 6482 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 6483 if (*p != ch) { 6484 *p = ch; 6485 return 1; 6486 } 6487 else 6488 return 0; 6489 } 6490 6491 e = p + PyUnicode_GET_SIZE(self); 6492 previous_is_cased = 0; 6493 for (; p < e; p++) { 6494 register const Py_UNICODE ch = *p; 6495 6496 if (previous_is_cased) 6497 *p = Py_UNICODE_TOLOWER(ch); 6498 else 6499 *p = Py_UNICODE_TOTITLE(ch); 6500 6501 if (Py_UNICODE_ISLOWER(ch) || 6502 Py_UNICODE_ISUPPER(ch) || 6503 Py_UNICODE_ISTITLE(ch)) 6504 previous_is_cased = 1; 6505 else 6506 previous_is_cased = 0; 6507 } 6508 return 1; 6509} 6510 6511PyObject * 6512PyUnicode_Join(PyObject *separator, PyObject *seq) 6513{ 6514 const Py_UNICODE blank = ' '; 6515 const Py_UNICODE *sep = ␣ 6516 Py_ssize_t seplen = 1; 6517 PyUnicodeObject *res = NULL; /* the result */ 6518 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 6519 PyObject *fseq; /* PySequence_Fast(seq) */ 6520 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 6521 PyObject **items; 6522 PyObject *item; 6523 Py_ssize_t sz, i; 6524 6525 fseq = PySequence_Fast(seq, ""); 6526 if (fseq == NULL) { 6527 return NULL; 6528 } 6529 6530 /* NOTE: the following code can't call back into Python code, 6531 * so we are sure that fseq won't be mutated. 6532 */ 6533 6534 seqlen = PySequence_Fast_GET_SIZE(fseq); 6535 /* If empty sequence, return u"". */ 6536 if (seqlen == 0) { 6537 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 6538 goto Done; 6539 } 6540 items = PySequence_Fast_ITEMS(fseq); 6541 /* If singleton sequence with an exact Unicode, return that. 
*/ 6542 if (seqlen == 1) { 6543 item = items[0]; 6544 if (PyUnicode_CheckExact(item)) { 6545 Py_INCREF(item); 6546 res = (PyUnicodeObject *)item; 6547 goto Done; 6548 } 6549 } 6550 else { 6551 /* Set up sep and seplen */ 6552 if (separator == NULL) { 6553 sep = ␣ 6554 seplen = 1; 6555 } 6556 else { 6557 if (!PyUnicode_Check(separator)) { 6558 PyErr_Format(PyExc_TypeError, 6559 "separator: expected str instance," 6560 " %.80s found", 6561 Py_TYPE(separator)->tp_name); 6562 goto onError; 6563 } 6564 sep = PyUnicode_AS_UNICODE(separator); 6565 seplen = PyUnicode_GET_SIZE(separator); 6566 } 6567 } 6568 6569 /* There are at least two things to join, or else we have a subclass 6570 * of str in the sequence. 6571 * Do a pre-pass to figure out the total amount of space we'll 6572 * need (sz), and see whether all argument are strings. 6573 */ 6574 sz = 0; 6575 for (i = 0; i < seqlen; i++) { 6576 const Py_ssize_t old_sz = sz; 6577 item = items[i]; 6578 if (!PyUnicode_Check(item)) { 6579 PyErr_Format(PyExc_TypeError, 6580 "sequence item %zd: expected str instance," 6581 " %.80s found", 6582 i, Py_TYPE(item)->tp_name); 6583 goto onError; 6584 } 6585 sz += PyUnicode_GET_SIZE(item); 6586 if (i != 0) 6587 sz += seplen; 6588 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 6589 PyErr_SetString(PyExc_OverflowError, 6590 "join() result is too long for a Python string"); 6591 goto onError; 6592 } 6593 } 6594 6595 res = _PyUnicode_New(sz); 6596 if (res == NULL) 6597 goto onError; 6598 6599 /* Catenate everything. */ 6600 res_p = PyUnicode_AS_UNICODE(res); 6601 for (i = 0; i < seqlen; ++i) { 6602 Py_ssize_t itemlen; 6603 item = items[i]; 6604 itemlen = PyUnicode_GET_SIZE(item); 6605 /* Copy item, and maybe the separator. 
*/ 6606 if (i) { 6607 Py_UNICODE_COPY(res_p, sep, seplen); 6608 res_p += seplen; 6609 } 6610 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 6611 res_p += itemlen; 6612 } 6613 6614 Done: 6615 Py_DECREF(fseq); 6616 return (PyObject *)res; 6617 6618 onError: 6619 Py_DECREF(fseq); 6620 Py_XDECREF(res); 6621 return NULL; 6622} 6623 6624static 6625PyUnicodeObject *pad(PyUnicodeObject *self, 6626 Py_ssize_t left, 6627 Py_ssize_t right, 6628 Py_UNICODE fill) 6629{ 6630 PyUnicodeObject *u; 6631 6632 if (left < 0) 6633 left = 0; 6634 if (right < 0) 6635 right = 0; 6636 6637 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 6638 Py_INCREF(self); 6639 return self; 6640 } 6641 6642 if (left > PY_SSIZE_T_MAX - self->length || 6643 right > PY_SSIZE_T_MAX - (left + self->length)) { 6644 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 6645 return NULL; 6646 } 6647 u = _PyUnicode_New(left + self->length + right); 6648 if (u) { 6649 if (left) 6650 Py_UNICODE_FILL(u->str, fill, left); 6651 Py_UNICODE_COPY(u->str + left, self->str, self->length); 6652 if (right) 6653 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 6654 } 6655 6656 return u; 6657} 6658 6659PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 6660{ 6661 PyObject *list; 6662 6663 string = PyUnicode_FromObject(string); 6664 if (string == NULL) 6665 return NULL; 6666 6667 list = stringlib_splitlines( 6668 (PyObject*) string, PyUnicode_AS_UNICODE(string), 6669 PyUnicode_GET_SIZE(string), keepends); 6670 6671 Py_DECREF(string); 6672 return list; 6673} 6674 6675static 6676PyObject *split(PyUnicodeObject *self, 6677 PyUnicodeObject *substring, 6678 Py_ssize_t maxcount) 6679{ 6680 if (maxcount < 0) 6681 maxcount = PY_SSIZE_T_MAX; 6682 6683 if (substring == NULL) 6684 return stringlib_split_whitespace( 6685 (PyObject*) self, self->str, self->length, maxcount 6686 ); 6687 6688 return stringlib_split( 6689 (PyObject*) self, self->str, self->length, 6690 
substring->str, substring->length, 6691 maxcount 6692 ); 6693} 6694 6695static 6696PyObject *rsplit(PyUnicodeObject *self, 6697 PyUnicodeObject *substring, 6698 Py_ssize_t maxcount) 6699{ 6700 if (maxcount < 0) 6701 maxcount = PY_SSIZE_T_MAX; 6702 6703 if (substring == NULL) 6704 return stringlib_rsplit_whitespace( 6705 (PyObject*) self, self->str, self->length, maxcount 6706 ); 6707 6708 return stringlib_rsplit( 6709 (PyObject*) self, self->str, self->length, 6710 substring->str, substring->length, 6711 maxcount 6712 ); 6713} 6714 6715static 6716PyObject *replace(PyUnicodeObject *self, 6717 PyUnicodeObject *str1, 6718 PyUnicodeObject *str2, 6719 Py_ssize_t maxcount) 6720{ 6721 PyUnicodeObject *u; 6722 6723 if (maxcount < 0) 6724 maxcount = PY_SSIZE_T_MAX; 6725 else if (maxcount == 0 || self->length == 0) 6726 goto nothing; 6727 6728 if (str1->length == str2->length) { 6729 Py_ssize_t i; 6730 /* same length */ 6731 if (str1->length == 0) 6732 goto nothing; 6733 if (str1->length == 1) { 6734 /* replace characters */ 6735 Py_UNICODE u1, u2; 6736 if (!findchar(self->str, self->length, str1->str[0])) 6737 goto nothing; 6738 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6739 if (!u) 6740 return NULL; 6741 Py_UNICODE_COPY(u->str, self->str, self->length); 6742 u1 = str1->str[0]; 6743 u2 = str2->str[0]; 6744 for (i = 0; i < u->length; i++) 6745 if (u->str[i] == u1) { 6746 if (--maxcount < 0) 6747 break; 6748 u->str[i] = u2; 6749 } 6750 } else { 6751 i = stringlib_find( 6752 self->str, self->length, str1->str, str1->length, 0 6753 ); 6754 if (i < 0) 6755 goto nothing; 6756 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 6757 if (!u) 6758 return NULL; 6759 Py_UNICODE_COPY(u->str, self->str, self->length); 6760 6761 /* change everything in-place, starting with this one */ 6762 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6763 i += str1->length; 6764 6765 while ( --maxcount > 0) { 6766 i = stringlib_find(self->str+i, 
self->length-i, 6767 str1->str, str1->length, 6768 i); 6769 if (i == -1) 6770 break; 6771 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 6772 i += str1->length; 6773 } 6774 } 6775 } else { 6776 6777 Py_ssize_t n, i, j, e; 6778 Py_ssize_t product, new_size, delta; 6779 Py_UNICODE *p; 6780 6781 /* replace strings */ 6782 n = stringlib_count(self->str, self->length, str1->str, str1->length, 6783 maxcount); 6784 if (n == 0) 6785 goto nothing; 6786 /* new_size = self->length + n * (str2->length - str1->length)); */ 6787 delta = (str2->length - str1->length); 6788 if (delta == 0) { 6789 new_size = self->length; 6790 } else { 6791 product = n * (str2->length - str1->length); 6792 if ((product / (str2->length - str1->length)) != n) { 6793 PyErr_SetString(PyExc_OverflowError, 6794 "replace string is too long"); 6795 return NULL; 6796 } 6797 new_size = self->length + product; 6798 if (new_size < 0) { 6799 PyErr_SetString(PyExc_OverflowError, 6800 "replace string is too long"); 6801 return NULL; 6802 } 6803 } 6804 u = _PyUnicode_New(new_size); 6805 if (!u) 6806 return NULL; 6807 i = 0; 6808 p = u->str; 6809 e = self->length - str1->length; 6810 if (str1->length > 0) { 6811 while (n-- > 0) { 6812 /* look for next match */ 6813 j = stringlib_find(self->str+i, self->length-i, 6814 str1->str, str1->length, 6815 i); 6816 if (j == -1) 6817 break; 6818 else if (j > i) { 6819 /* copy unchanged part [i:j] */ 6820 Py_UNICODE_COPY(p, self->str+i, j-i); 6821 p += j - i; 6822 } 6823 /* copy substitution string */ 6824 if (str2->length > 0) { 6825 Py_UNICODE_COPY(p, str2->str, str2->length); 6826 p += str2->length; 6827 } 6828 i = j + str1->length; 6829 } 6830 if (i < self->length) 6831 /* copy tail [i:] */ 6832 Py_UNICODE_COPY(p, self->str+i, self->length-i); 6833 } else { 6834 /* interleave */ 6835 while (n > 0) { 6836 Py_UNICODE_COPY(p, str2->str, str2->length); 6837 p += str2->length; 6838 if (--n <= 0) 6839 break; 6840 *p++ = self->str[i++]; 6841 } 6842 Py_UNICODE_COPY(p, 
self->str+i, self->length-i); 6843 } 6844 } 6845 return (PyObject *) u; 6846 6847 nothing: 6848 /* nothing to replace; return original string (when possible) */ 6849 if (PyUnicode_CheckExact(self)) { 6850 Py_INCREF(self); 6851 return (PyObject *) self; 6852 } 6853 return PyUnicode_FromUnicode(self->str, self->length); 6854} 6855 6856/* --- Unicode Object Methods --------------------------------------------- */ 6857 6858PyDoc_STRVAR(title__doc__, 6859 "S.title() -> str\n\ 6860\n\ 6861Return a titlecased version of S, i.e. words start with title case\n\ 6862characters, all remaining cased characters have lower case."); 6863 6864static PyObject* 6865unicode_title(PyUnicodeObject *self) 6866{ 6867 return fixup(self, fixtitle); 6868} 6869 6870PyDoc_STRVAR(capitalize__doc__, 6871 "S.capitalize() -> str\n\ 6872\n\ 6873Return a capitalized version of S, i.e. make the first character\n\ 6874have upper case and the rest lower case."); 6875 6876static PyObject* 6877unicode_capitalize(PyUnicodeObject *self) 6878{ 6879 return fixup(self, fixcapitalize); 6880} 6881 6882#if 0 6883PyDoc_STRVAR(capwords__doc__, 6884 "S.capwords() -> str\n\ 6885\n\ 6886Apply .capitalize() to all words in S and return the result with\n\ 6887normalized whitespace (all whitespace strings are replaced by ' ')."); 6888 6889static PyObject* 6890unicode_capwords(PyUnicodeObject *self) 6891{ 6892 PyObject *list; 6893 PyObject *item; 6894 Py_ssize_t i; 6895 6896 /* Split into words */ 6897 list = split(self, NULL, -1); 6898 if (!list) 6899 return NULL; 6900 6901 /* Capitalize each word */ 6902 for (i = 0; i < PyList_GET_SIZE(list); i++) { 6903 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 6904 fixcapitalize); 6905 if (item == NULL) 6906 goto onError; 6907 Py_DECREF(PyList_GET_ITEM(list, i)); 6908 PyList_SET_ITEM(list, i, item); 6909 } 6910 6911 /* Join the words to form a new string */ 6912 item = PyUnicode_Join(NULL, list); 6913 6914 onError: 6915 Py_DECREF(list); 6916 return (PyObject *)item; 
6917} 6918#endif 6919 6920/* Argument converter. Coerces to a single unicode character */ 6921 6922static int 6923convert_uc(PyObject *obj, void *addr) 6924{ 6925 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 6926 PyObject *uniobj; 6927 Py_UNICODE *unistr; 6928 6929 uniobj = PyUnicode_FromObject(obj); 6930 if (uniobj == NULL) { 6931 PyErr_SetString(PyExc_TypeError, 6932 "The fill character cannot be converted to Unicode"); 6933 return 0; 6934 } 6935 if (PyUnicode_GET_SIZE(uniobj) != 1) { 6936 PyErr_SetString(PyExc_TypeError, 6937 "The fill character must be exactly one character long"); 6938 Py_DECREF(uniobj); 6939 return 0; 6940 } 6941 unistr = PyUnicode_AS_UNICODE(uniobj); 6942 *fillcharloc = unistr[0]; 6943 Py_DECREF(uniobj); 6944 return 1; 6945} 6946 6947PyDoc_STRVAR(center__doc__, 6948 "S.center(width[, fillchar]) -> str\n\ 6949\n\ 6950Return S centered in a string of length width. Padding is\n\ 6951done using the specified fill character (default is a space)"); 6952 6953static PyObject * 6954unicode_center(PyUnicodeObject *self, PyObject *args) 6955{ 6956 Py_ssize_t marg, left; 6957 Py_ssize_t width; 6958 Py_UNICODE fillchar = ' '; 6959 6960 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 6961 return NULL; 6962 6963 if (self->length >= width && PyUnicode_CheckExact(self)) { 6964 Py_INCREF(self); 6965 return (PyObject*) self; 6966 } 6967 6968 marg = width - self->length; 6969 left = marg / 2 + (marg & width & 1); 6970 6971 return (PyObject*) pad(self, left, marg - left, fillchar); 6972} 6973 6974#if 0 6975 6976/* This code should go into some future Unicode collation support 6977 module. The basic comparison should compare ordinals on a naive 6978 basis (this is what Java does and thus Jython too). 
/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* NOTE: this variant sits in the "#if 0" half of the surrounding
   conditional and is therefore not compiled. */

/* Adjustment table indexed by (code unit >> 11), i.e. one entry per block
   of 0x800 values.  Entry 27 (0xD800..0xDFFF) moves the value up by 0x2000,
   entries 28..31 (0xE000..0xFFFF) move it down by 0x800; all other entries
   leave the value alone. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare str1 and str2 character by character after applying the
   utf16Fixup adjustment.
   Return -1, 0 or 1.  When one string is a prefix of the other, the
   shorter one compares as smaller. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 is 0xD000: only values above it are adjusted.
           NOTE(review): the table has 32 entries, so c>>11 is only a valid
           index for values up to 0xFFFF -- confirm Py_UNICODE cannot hold
           anything larger before enabling this code. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix exhausted: the remaining lengths decide */
    return (len1 < len2) ? -1 : (len1 != len2);
}
-1 : (len1 != len2); 7050} 7051 7052#endif 7053 7054int PyUnicode_Compare(PyObject *left, 7055 PyObject *right) 7056{ 7057 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 7058 return unicode_compare((PyUnicodeObject *)left, 7059 (PyUnicodeObject *)right); 7060 PyErr_Format(PyExc_TypeError, 7061 "Can't compare %.100s and %.100s", 7062 left->ob_type->tp_name, 7063 right->ob_type->tp_name); 7064 return -1; 7065} 7066 7067int 7068PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 7069{ 7070 int i; 7071 Py_UNICODE *id; 7072 assert(PyUnicode_Check(uni)); 7073 id = PyUnicode_AS_UNICODE(uni); 7074 /* Compare Unicode string and source character set string */ 7075 for (i = 0; id[i] && str[i]; i++) 7076 if (id[i] != str[i]) 7077 return ((int)id[i] < (int)str[i]) ? -1 : 1; 7078 /* This check keeps Python strings that end in '\0' from comparing equal 7079 to C strings identical up to that point. */ 7080 if (PyUnicode_GET_SIZE(uni) != i || id[i]) 7081 return 1; /* uni is longer */ 7082 if (str[i]) 7083 return -1; /* str is longer */ 7084 return 0; 7085} 7086 7087 7088#define TEST_COND(cond) \ 7089 ((cond) ? 
Py_True : Py_False) 7090 7091PyObject *PyUnicode_RichCompare(PyObject *left, 7092 PyObject *right, 7093 int op) 7094{ 7095 int result; 7096 7097 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 7098 PyObject *v; 7099 if (((PyUnicodeObject *) left)->length != 7100 ((PyUnicodeObject *) right)->length) { 7101 if (op == Py_EQ) { 7102 Py_INCREF(Py_False); 7103 return Py_False; 7104 } 7105 if (op == Py_NE) { 7106 Py_INCREF(Py_True); 7107 return Py_True; 7108 } 7109 } 7110 if (left == right) 7111 result = 0; 7112 else 7113 result = unicode_compare((PyUnicodeObject *)left, 7114 (PyUnicodeObject *)right); 7115 7116 /* Convert the return value to a Boolean */ 7117 switch (op) { 7118 case Py_EQ: 7119 v = TEST_COND(result == 0); 7120 break; 7121 case Py_NE: 7122 v = TEST_COND(result != 0); 7123 break; 7124 case Py_LE: 7125 v = TEST_COND(result <= 0); 7126 break; 7127 case Py_GE: 7128 v = TEST_COND(result >= 0); 7129 break; 7130 case Py_LT: 7131 v = TEST_COND(result == -1); 7132 break; 7133 case Py_GT: 7134 v = TEST_COND(result == 1); 7135 break; 7136 default: 7137 PyErr_BadArgument(); 7138 return NULL; 7139 } 7140 Py_INCREF(v); 7141 return v; 7142 } 7143 7144 Py_INCREF(Py_NotImplemented); 7145 return Py_NotImplemented; 7146} 7147 7148int PyUnicode_Contains(PyObject *container, 7149 PyObject *element) 7150{ 7151 PyObject *str, *sub; 7152 int result; 7153 7154 /* Coerce the two arguments */ 7155 sub = PyUnicode_FromObject(element); 7156 if (!sub) { 7157 PyErr_Format(PyExc_TypeError, 7158 "'in <string>' requires string as left operand, not %s", 7159 element->ob_type->tp_name); 7160 return -1; 7161 } 7162 7163 str = PyUnicode_FromObject(container); 7164 if (!str) { 7165 Py_DECREF(sub); 7166 return -1; 7167 } 7168 7169 result = stringlib_contains_obj(str, sub); 7170 7171 Py_DECREF(str); 7172 Py_DECREF(sub); 7173 7174 return result; 7175} 7176 7177/* Concat to string or Unicode object giving a new Unicode object. 
*/ 7178 7179PyObject *PyUnicode_Concat(PyObject *left, 7180 PyObject *right) 7181{ 7182 PyUnicodeObject *u = NULL, *v = NULL, *w; 7183 7184 /* Coerce the two arguments */ 7185 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 7186 if (u == NULL) 7187 goto onError; 7188 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 7189 if (v == NULL) 7190 goto onError; 7191 7192 /* Shortcuts */ 7193 if (v == unicode_empty) { 7194 Py_DECREF(v); 7195 return (PyObject *)u; 7196 } 7197 if (u == unicode_empty) { 7198 Py_DECREF(u); 7199 return (PyObject *)v; 7200 } 7201 7202 /* Concat the two Unicode strings */ 7203 w = _PyUnicode_New(u->length + v->length); 7204 if (w == NULL) 7205 goto onError; 7206 Py_UNICODE_COPY(w->str, u->str, u->length); 7207 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 7208 7209 Py_DECREF(u); 7210 Py_DECREF(v); 7211 return (PyObject *)w; 7212 7213 onError: 7214 Py_XDECREF(u); 7215 Py_XDECREF(v); 7216 return NULL; 7217} 7218 7219void 7220PyUnicode_Append(PyObject **pleft, PyObject *right) 7221{ 7222 PyObject *new; 7223 if (*pleft == NULL) 7224 return; 7225 if (right == NULL || !PyUnicode_Check(*pleft)) { 7226 Py_DECREF(*pleft); 7227 *pleft = NULL; 7228 return; 7229 } 7230 new = PyUnicode_Concat(*pleft, right); 7231 Py_DECREF(*pleft); 7232 *pleft = new; 7233} 7234 7235void 7236PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 7237{ 7238 PyUnicode_Append(pleft, right); 7239 Py_XDECREF(right); 7240} 7241 7242PyDoc_STRVAR(count__doc__, 7243 "S.count(sub[, start[, end]]) -> int\n\ 7244\n\ 7245Return the number of non-overlapping occurrences of substring sub in\n\ 7246string S[start:end]. 
Optional arguments start and end are\n\ 7247interpreted as in slice notation."); 7248 7249static PyObject * 7250unicode_count(PyUnicodeObject *self, PyObject *args) 7251{ 7252 PyUnicodeObject *substring; 7253 Py_ssize_t start = 0; 7254 Py_ssize_t end = PY_SSIZE_T_MAX; 7255 PyObject *result; 7256 7257 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 7258 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7259 return NULL; 7260 7261 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7262 (PyObject *)substring); 7263 if (substring == NULL) 7264 return NULL; 7265 7266 ADJUST_INDICES(start, end, self->length); 7267 result = PyLong_FromSsize_t( 7268 stringlib_count(self->str + start, end - start, 7269 substring->str, substring->length, 7270 PY_SSIZE_T_MAX) 7271 ); 7272 7273 Py_DECREF(substring); 7274 7275 return result; 7276} 7277 7278PyDoc_STRVAR(encode__doc__, 7279 "S.encode([encoding[, errors]]) -> bytes\n\ 7280\n\ 7281Encode S using the codec registered for encoding. encoding defaults\n\ 7282to the default encoding. errors may be given to set a different error\n\ 7283handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 7284a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 7285'xmlcharrefreplace' as well as any other name registered with\n\ 7286codecs.register_error that can handle UnicodeEncodeErrors."); 7287 7288static PyObject * 7289unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 7290{ 7291 static char *kwlist[] = {"encoding", "errors", 0}; 7292 char *encoding = NULL; 7293 char *errors = NULL; 7294 PyObject *v; 7295 7296 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 7297 kwlist, &encoding, &errors)) 7298 return NULL; 7299 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 7300 if (v == NULL) 7301 goto onError; 7302 if (!PyBytes_Check(v)) { 7303 PyErr_Format(PyExc_TypeError, 7304 "encoder did not return a bytes object " 7305 "(type=%.400s)", 7306 Py_TYPE(v)->tp_name); 7307 Py_DECREF(v); 7308 return NULL; 7309 } 7310 return v; 7311 7312 onError: 7313 return NULL; 7314} 7315 7316PyDoc_STRVAR(expandtabs__doc__, 7317 "S.expandtabs([tabsize]) -> str\n\ 7318\n\ 7319Return a copy of S where all tab characters are expanded using spaces.\n\ 7320If tabsize is not given, a tab size of 8 characters is assumed."); 7321 7322static PyObject* 7323unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 7324{ 7325 Py_UNICODE *e; 7326 Py_UNICODE *p; 7327 Py_UNICODE *q; 7328 Py_UNICODE *qe; 7329 Py_ssize_t i, j, incr; 7330 PyUnicodeObject *u; 7331 int tabsize = 8; 7332 7333 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 7334 return NULL; 7335 7336 /* First pass: determine size of output string */ 7337 i = 0; /* chars up to and including most recent \n or \r */ 7338 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 7339 e = self->str + self->length; /* end of input */ 7340 for (p = self->str; p < e; p++) 7341 if (*p == '\t') { 7342 if (tabsize > 0) { 7343 incr = tabsize - (j % tabsize); /* cannot overflow */ 7344 if (j > PY_SSIZE_T_MAX - incr) 7345 goto overflow1; 7346 j += incr; 7347 } 7348 } 7349 else { 7350 if 
(j > PY_SSIZE_T_MAX - 1) 7351 goto overflow1; 7352 j++; 7353 if (*p == '\n' || *p == '\r') { 7354 if (i > PY_SSIZE_T_MAX - j) 7355 goto overflow1; 7356 i += j; 7357 j = 0; 7358 } 7359 } 7360 7361 if (i > PY_SSIZE_T_MAX - j) 7362 goto overflow1; 7363 7364 /* Second pass: create output string and fill it */ 7365 u = _PyUnicode_New(i + j); 7366 if (!u) 7367 return NULL; 7368 7369 j = 0; /* same as in first pass */ 7370 q = u->str; /* next output char */ 7371 qe = u->str + u->length; /* end of output */ 7372 7373 for (p = self->str; p < e; p++) 7374 if (*p == '\t') { 7375 if (tabsize > 0) { 7376 i = tabsize - (j % tabsize); 7377 j += i; 7378 while (i--) { 7379 if (q >= qe) 7380 goto overflow2; 7381 *q++ = ' '; 7382 } 7383 } 7384 } 7385 else { 7386 if (q >= qe) 7387 goto overflow2; 7388 *q++ = *p; 7389 j++; 7390 if (*p == '\n' || *p == '\r') 7391 j = 0; 7392 } 7393 7394 return (PyObject*) u; 7395 7396 overflow2: 7397 Py_DECREF(u); 7398 overflow1: 7399 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 7400 return NULL; 7401} 7402 7403PyDoc_STRVAR(find__doc__, 7404 "S.find(sub[, start[, end]]) -> int\n\ 7405\n\ 7406Return the lowest index in S where substring sub is found,\n\ 7407such that sub is contained within s[start:end]. 
Optional\n\ 7408arguments start and end are interpreted as in slice notation.\n\ 7409\n\ 7410Return -1 on failure."); 7411 7412static PyObject * 7413unicode_find(PyUnicodeObject *self, PyObject *args) 7414{ 7415 PyObject *substring; 7416 Py_ssize_t start; 7417 Py_ssize_t end; 7418 Py_ssize_t result; 7419 7420 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7421 return NULL; 7422 7423 result = stringlib_find_slice( 7424 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7425 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7426 start, end 7427 ); 7428 7429 Py_DECREF(substring); 7430 7431 return PyLong_FromSsize_t(result); 7432} 7433 7434static PyObject * 7435unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 7436{ 7437 if (index < 0 || index >= self->length) { 7438 PyErr_SetString(PyExc_IndexError, "string index out of range"); 7439 return NULL; 7440 } 7441 7442 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 7443} 7444 7445/* Believe it or not, this produces the same value for ASCII strings 7446 as string_hash(). 
*/ 7447static long 7448unicode_hash(PyUnicodeObject *self) 7449{ 7450 Py_ssize_t len; 7451 Py_UNICODE *p; 7452 long x; 7453 7454 if (self->hash != -1) 7455 return self->hash; 7456 len = Py_SIZE(self); 7457 p = self->str; 7458 x = *p << 7; 7459 while (--len >= 0) 7460 x = (1000003*x) ^ *p++; 7461 x ^= Py_SIZE(self); 7462 if (x == -1) 7463 x = -2; 7464 self->hash = x; 7465 return x; 7466} 7467 7468PyDoc_STRVAR(index__doc__, 7469 "S.index(sub[, start[, end]]) -> int\n\ 7470\n\ 7471Like S.find() but raise ValueError when the substring is not found."); 7472 7473static PyObject * 7474unicode_index(PyUnicodeObject *self, PyObject *args) 7475{ 7476 Py_ssize_t result; 7477 PyObject *substring; 7478 Py_ssize_t start; 7479 Py_ssize_t end; 7480 7481 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7482 return NULL; 7483 7484 result = stringlib_find_slice( 7485 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7486 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7487 start, end 7488 ); 7489 7490 Py_DECREF(substring); 7491 7492 if (result < 0) { 7493 PyErr_SetString(PyExc_ValueError, "substring not found"); 7494 return NULL; 7495 } 7496 7497 return PyLong_FromSsize_t(result); 7498} 7499 7500PyDoc_STRVAR(islower__doc__, 7501 "S.islower() -> bool\n\ 7502\n\ 7503Return True if all cased characters in S are lowercase and there is\n\ 7504at least one cased character in S, False otherwise."); 7505 7506static PyObject* 7507unicode_islower(PyUnicodeObject *self) 7508{ 7509 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7510 register const Py_UNICODE *e; 7511 int cased; 7512 7513 /* Shortcut for single character strings */ 7514 if (PyUnicode_GET_SIZE(self) == 1) 7515 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 7516 7517 /* Special case for empty strings */ 7518 if (PyUnicode_GET_SIZE(self) == 0) 7519 return PyBool_FromLong(0); 7520 7521 e = p + PyUnicode_GET_SIZE(self); 7522 cased = 0; 7523 for (; p < e; p++) { 7524 register const Py_UNICODE ch = 
*p; 7525 7526 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 7527 return PyBool_FromLong(0); 7528 else if (!cased && Py_UNICODE_ISLOWER(ch)) 7529 cased = 1; 7530 } 7531 return PyBool_FromLong(cased); 7532} 7533 7534PyDoc_STRVAR(isupper__doc__, 7535 "S.isupper() -> bool\n\ 7536\n\ 7537Return True if all cased characters in S are uppercase and there is\n\ 7538at least one cased character in S, False otherwise."); 7539 7540static PyObject* 7541unicode_isupper(PyUnicodeObject *self) 7542{ 7543 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7544 register const Py_UNICODE *e; 7545 int cased; 7546 7547 /* Shortcut for single character strings */ 7548 if (PyUnicode_GET_SIZE(self) == 1) 7549 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 7550 7551 /* Special case for empty strings */ 7552 if (PyUnicode_GET_SIZE(self) == 0) 7553 return PyBool_FromLong(0); 7554 7555 e = p + PyUnicode_GET_SIZE(self); 7556 cased = 0; 7557 for (; p < e; p++) { 7558 register const Py_UNICODE ch = *p; 7559 7560 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 7561 return PyBool_FromLong(0); 7562 else if (!cased && Py_UNICODE_ISUPPER(ch)) 7563 cased = 1; 7564 } 7565 return PyBool_FromLong(cased); 7566} 7567 7568PyDoc_STRVAR(istitle__doc__, 7569 "S.istitle() -> bool\n\ 7570\n\ 7571Return True if S is a titlecased string and there is at least one\n\ 7572character in S, i.e. 
upper- and titlecase characters may only\n\ 7573follow uncased characters and lowercase characters only cased ones.\n\ 7574Return False otherwise."); 7575 7576static PyObject* 7577unicode_istitle(PyUnicodeObject *self) 7578{ 7579 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7580 register const Py_UNICODE *e; 7581 int cased, previous_is_cased; 7582 7583 /* Shortcut for single character strings */ 7584 if (PyUnicode_GET_SIZE(self) == 1) 7585 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 7586 (Py_UNICODE_ISUPPER(*p) != 0)); 7587 7588 /* Special case for empty strings */ 7589 if (PyUnicode_GET_SIZE(self) == 0) 7590 return PyBool_FromLong(0); 7591 7592 e = p + PyUnicode_GET_SIZE(self); 7593 cased = 0; 7594 previous_is_cased = 0; 7595 for (; p < e; p++) { 7596 register const Py_UNICODE ch = *p; 7597 7598 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 7599 if (previous_is_cased) 7600 return PyBool_FromLong(0); 7601 previous_is_cased = 1; 7602 cased = 1; 7603 } 7604 else if (Py_UNICODE_ISLOWER(ch)) { 7605 if (!previous_is_cased) 7606 return PyBool_FromLong(0); 7607 previous_is_cased = 1; 7608 cased = 1; 7609 } 7610 else 7611 previous_is_cased = 0; 7612 } 7613 return PyBool_FromLong(cased); 7614} 7615 7616PyDoc_STRVAR(isspace__doc__, 7617 "S.isspace() -> bool\n\ 7618\n\ 7619Return True if all characters in S are whitespace\n\ 7620and there is at least one character in S, False otherwise."); 7621 7622static PyObject* 7623unicode_isspace(PyUnicodeObject *self) 7624{ 7625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7626 register const Py_UNICODE *e; 7627 7628 /* Shortcut for single character strings */ 7629 if (PyUnicode_GET_SIZE(self) == 1 && 7630 Py_UNICODE_ISSPACE(*p)) 7631 return PyBool_FromLong(1); 7632 7633 /* Special case for empty strings */ 7634 if (PyUnicode_GET_SIZE(self) == 0) 7635 return PyBool_FromLong(0); 7636 7637 e = p + PyUnicode_GET_SIZE(self); 7638 for (; p < e; p++) { 7639 if (!Py_UNICODE_ISSPACE(*p)) 7640 
return PyBool_FromLong(0); 7641 } 7642 return PyBool_FromLong(1); 7643} 7644 7645PyDoc_STRVAR(isalpha__doc__, 7646 "S.isalpha() -> bool\n\ 7647\n\ 7648Return True if all characters in S are alphabetic\n\ 7649and there is at least one character in S, False otherwise."); 7650 7651static PyObject* 7652unicode_isalpha(PyUnicodeObject *self) 7653{ 7654 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7655 register const Py_UNICODE *e; 7656 7657 /* Shortcut for single character strings */ 7658 if (PyUnicode_GET_SIZE(self) == 1 && 7659 Py_UNICODE_ISALPHA(*p)) 7660 return PyBool_FromLong(1); 7661 7662 /* Special case for empty strings */ 7663 if (PyUnicode_GET_SIZE(self) == 0) 7664 return PyBool_FromLong(0); 7665 7666 e = p + PyUnicode_GET_SIZE(self); 7667 for (; p < e; p++) { 7668 if (!Py_UNICODE_ISALPHA(*p)) 7669 return PyBool_FromLong(0); 7670 } 7671 return PyBool_FromLong(1); 7672} 7673 7674PyDoc_STRVAR(isalnum__doc__, 7675 "S.isalnum() -> bool\n\ 7676\n\ 7677Return True if all characters in S are alphanumeric\n\ 7678and there is at least one character in S, False otherwise."); 7679 7680static PyObject* 7681unicode_isalnum(PyUnicodeObject *self) 7682{ 7683 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7684 register const Py_UNICODE *e; 7685 7686 /* Shortcut for single character strings */ 7687 if (PyUnicode_GET_SIZE(self) == 1 && 7688 Py_UNICODE_ISALNUM(*p)) 7689 return PyBool_FromLong(1); 7690 7691 /* Special case for empty strings */ 7692 if (PyUnicode_GET_SIZE(self) == 0) 7693 return PyBool_FromLong(0); 7694 7695 e = p + PyUnicode_GET_SIZE(self); 7696 for (; p < e; p++) { 7697 if (!Py_UNICODE_ISALNUM(*p)) 7698 return PyBool_FromLong(0); 7699 } 7700 return PyBool_FromLong(1); 7701} 7702 7703PyDoc_STRVAR(isdecimal__doc__, 7704 "S.isdecimal() -> bool\n\ 7705\n\ 7706Return True if there are only decimal characters in S,\n\ 7707False otherwise."); 7708 7709static PyObject* 7710unicode_isdecimal(PyUnicodeObject *self) 7711{ 7712 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7713 register const Py_UNICODE *e; 7714 7715 /* Shortcut for single character strings */ 7716 if (PyUnicode_GET_SIZE(self) == 1 && 7717 Py_UNICODE_ISDECIMAL(*p)) 7718 return PyBool_FromLong(1); 7719 7720 /* Special case for empty strings */ 7721 if (PyUnicode_GET_SIZE(self) == 0) 7722 return PyBool_FromLong(0); 7723 7724 e = p + PyUnicode_GET_SIZE(self); 7725 for (; p < e; p++) { 7726 if (!Py_UNICODE_ISDECIMAL(*p)) 7727 return PyBool_FromLong(0); 7728 } 7729 return PyBool_FromLong(1); 7730} 7731 7732PyDoc_STRVAR(isdigit__doc__, 7733 "S.isdigit() -> bool\n\ 7734\n\ 7735Return True if all characters in S are digits\n\ 7736and there is at least one character in S, False otherwise."); 7737 7738static PyObject* 7739unicode_isdigit(PyUnicodeObject *self) 7740{ 7741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7742 register const Py_UNICODE *e; 7743 7744 /* Shortcut for single character strings */ 7745 if (PyUnicode_GET_SIZE(self) == 1 && 7746 Py_UNICODE_ISDIGIT(*p)) 7747 return PyBool_FromLong(1); 7748 7749 /* Special case for empty strings */ 7750 if (PyUnicode_GET_SIZE(self) == 0) 7751 return PyBool_FromLong(0); 7752 7753 e = p + PyUnicode_GET_SIZE(self); 7754 for (; p < e; p++) { 7755 if (!Py_UNICODE_ISDIGIT(*p)) 7756 return PyBool_FromLong(0); 7757 } 7758 return PyBool_FromLong(1); 7759} 7760 7761PyDoc_STRVAR(isnumeric__doc__, 7762 "S.isnumeric() -> bool\n\ 7763\n\ 7764Return True if there are only numeric characters in S,\n\ 7765False otherwise."); 7766 7767static PyObject* 7768unicode_isnumeric(PyUnicodeObject *self) 7769{ 7770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7771 register const Py_UNICODE *e; 7772 7773 /* Shortcut for single character strings */ 7774 if (PyUnicode_GET_SIZE(self) == 1 && 7775 Py_UNICODE_ISNUMERIC(*p)) 7776 return PyBool_FromLong(1); 7777 7778 /* Special case for empty strings */ 7779 if (PyUnicode_GET_SIZE(self) == 0) 7780 return PyBool_FromLong(0); 7781 7782 e = p 
+ PyUnicode_GET_SIZE(self); 7783 for (; p < e; p++) { 7784 if (!Py_UNICODE_ISNUMERIC(*p)) 7785 return PyBool_FromLong(0); 7786 } 7787 return PyBool_FromLong(1); 7788} 7789 7790int 7791PyUnicode_IsIdentifier(PyObject *self) 7792{ 7793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); 7794 register const Py_UNICODE *e; 7795 7796 /* Special case for empty strings */ 7797 if (PyUnicode_GET_SIZE(self) == 0) 7798 return 0; 7799 7800 /* PEP 3131 says that the first character must be in 7801 XID_Start and subsequent characters in XID_Continue, 7802 and for the ASCII range, the 2.x rules apply (i.e 7803 start with letters and underscore, continue with 7804 letters, digits, underscore). However, given the current 7805 definition of XID_Start and XID_Continue, it is sufficient 7806 to check just for these, except that _ must be allowed 7807 as starting an identifier. */ 7808 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) 7809 return 0; 7810 7811 e = p + PyUnicode_GET_SIZE(self); 7812 for (p++; p < e; p++) { 7813 if (!_PyUnicode_IsXidContinue(*p)) 7814 return 0; 7815 } 7816 return 1; 7817} 7818 7819PyDoc_STRVAR(isidentifier__doc__, 7820 "S.isidentifier() -> bool\n\ 7821\n\ 7822Return True if S is a valid identifier according\n\ 7823to the language definition."); 7824 7825static PyObject* 7826unicode_isidentifier(PyObject *self) 7827{ 7828 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 7829} 7830 7831PyDoc_STRVAR(isprintable__doc__, 7832 "S.isprintable() -> bool\n\ 7833\n\ 7834Return True if all characters in S are considered\n\ 7835printable in repr() or S is empty, False otherwise."); 7836 7837static PyObject* 7838unicode_isprintable(PyObject *self) 7839{ 7840 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 7841 register const Py_UNICODE *e; 7842 7843 /* Shortcut for single character strings */ 7844 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) { 7845 Py_RETURN_TRUE; 7846 } 7847 7848 e = p + 
PyUnicode_GET_SIZE(self); 7849 for (; p < e; p++) { 7850 if (!Py_UNICODE_ISPRINTABLE(*p)) { 7851 Py_RETURN_FALSE; 7852 } 7853 } 7854 Py_RETURN_TRUE; 7855} 7856 7857PyDoc_STRVAR(join__doc__, 7858 "S.join(iterable) -> str\n\ 7859\n\ 7860Return a string which is the concatenation of the strings in the\n\ 7861iterable. The separator between elements is S."); 7862 7863static PyObject* 7864unicode_join(PyObject *self, PyObject *data) 7865{ 7866 return PyUnicode_Join(self, data); 7867} 7868 7869static Py_ssize_t 7870unicode_length(PyUnicodeObject *self) 7871{ 7872 return self->length; 7873} 7874 7875PyDoc_STRVAR(ljust__doc__, 7876 "S.ljust(width[, fillchar]) -> str\n\ 7877\n\ 7878Return S left-justified in a Unicode string of length width. Padding is\n\ 7879done using the specified fill character (default is a space)."); 7880 7881static PyObject * 7882unicode_ljust(PyUnicodeObject *self, PyObject *args) 7883{ 7884 Py_ssize_t width; 7885 Py_UNICODE fillchar = ' '; 7886 7887 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 7888 return NULL; 7889 7890 if (self->length >= width && PyUnicode_CheckExact(self)) { 7891 Py_INCREF(self); 7892 return (PyObject*) self; 7893 } 7894 7895 return (PyObject*) pad(self, 0, width - self->length, fillchar); 7896} 7897 7898PyDoc_STRVAR(lower__doc__, 7899 "S.lower() -> str\n\ 7900\n\ 7901Return a copy of the string S converted to lowercase."); 7902 7903static PyObject* 7904unicode_lower(PyUnicodeObject *self) 7905{ 7906 return fixup(self, fixlower); 7907} 7908 7909#define LEFTSTRIP 0 7910#define RIGHTSTRIP 1 7911#define BOTHSTRIP 2 7912 7913/* Arrays indexed by above */ 7914static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 7915 7916#define STRIPNAME(i) (stripformat[i]+3) 7917 7918/* externally visible for str.strip(unicode) */ 7919PyObject * 7920_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 7921{ 7922 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7923 Py_ssize_t len = 
PyUnicode_GET_SIZE(self); 7924 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 7925 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 7926 Py_ssize_t i, j; 7927 7928 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 7929 7930 i = 0; 7931 if (striptype != RIGHTSTRIP) { 7932 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 7933 i++; 7934 } 7935 } 7936 7937 j = len; 7938 if (striptype != LEFTSTRIP) { 7939 do { 7940 j--; 7941 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 7942 j++; 7943 } 7944 7945 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7946 Py_INCREF(self); 7947 return (PyObject*)self; 7948 } 7949 else 7950 return PyUnicode_FromUnicode(s+i, j-i); 7951} 7952 7953 7954static PyObject * 7955do_strip(PyUnicodeObject *self, int striptype) 7956{ 7957 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 7958 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 7959 7960 i = 0; 7961 if (striptype != RIGHTSTRIP) { 7962 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 7963 i++; 7964 } 7965 } 7966 7967 j = len; 7968 if (striptype != LEFTSTRIP) { 7969 do { 7970 j--; 7971 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 7972 j++; 7973 } 7974 7975 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 7976 Py_INCREF(self); 7977 return (PyObject*)self; 7978 } 7979 else 7980 return PyUnicode_FromUnicode(s+i, j-i); 7981} 7982 7983 7984static PyObject * 7985do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 7986{ 7987 PyObject *sep = NULL; 7988 7989 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 7990 return NULL; 7991 7992 if (sep != NULL && sep != Py_None) { 7993 if (PyUnicode_Check(sep)) 7994 return _PyUnicode_XStrip(self, striptype, sep); 7995 else { 7996 PyErr_Format(PyExc_TypeError, 7997 "%s arg must be None or str", 7998 STRIPNAME(striptype)); 7999 return NULL; 8000 } 8001 } 8002 8003 return do_strip(self, striptype); 8004} 8005 8006 8007PyDoc_STRVAR(strip__doc__, 8008 "S.strip([chars]) -> str\n\ 8009\n\ 8010Return a 
copy of the string S with leading and trailing\n\ 8011whitespace removed.\n\ 8012If chars is given and not None, remove characters in chars instead."); 8013 8014static PyObject * 8015unicode_strip(PyUnicodeObject *self, PyObject *args) 8016{ 8017 if (PyTuple_GET_SIZE(args) == 0) 8018 return do_strip(self, BOTHSTRIP); /* Common case */ 8019 else 8020 return do_argstrip(self, BOTHSTRIP, args); 8021} 8022 8023 8024PyDoc_STRVAR(lstrip__doc__, 8025 "S.lstrip([chars]) -> str\n\ 8026\n\ 8027Return a copy of the string S with leading whitespace removed.\n\ 8028If chars is given and not None, remove characters in chars instead."); 8029 8030static PyObject * 8031unicode_lstrip(PyUnicodeObject *self, PyObject *args) 8032{ 8033 if (PyTuple_GET_SIZE(args) == 0) 8034 return do_strip(self, LEFTSTRIP); /* Common case */ 8035 else 8036 return do_argstrip(self, LEFTSTRIP, args); 8037} 8038 8039 8040PyDoc_STRVAR(rstrip__doc__, 8041 "S.rstrip([chars]) -> str\n\ 8042\n\ 8043Return a copy of the string S with trailing whitespace removed.\n\ 8044If chars is given and not None, remove characters in chars instead."); 8045 8046static PyObject * 8047unicode_rstrip(PyUnicodeObject *self, PyObject *args) 8048{ 8049 if (PyTuple_GET_SIZE(args) == 0) 8050 return do_strip(self, RIGHTSTRIP); /* Common case */ 8051 else 8052 return do_argstrip(self, RIGHTSTRIP, args); 8053} 8054 8055 8056static PyObject* 8057unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 8058{ 8059 PyUnicodeObject *u; 8060 Py_UNICODE *p; 8061 Py_ssize_t nchars; 8062 size_t nbytes; 8063 8064 if (len < 1) { 8065 Py_INCREF(unicode_empty); 8066 return (PyObject *)unicode_empty; 8067 } 8068 8069 if (len == 1 && PyUnicode_CheckExact(str)) { 8070 /* no repeat, return original string */ 8071 Py_INCREF(str); 8072 return (PyObject*) str; 8073 } 8074 8075 /* ensure # of chars needed doesn't overflow int and # of bytes 8076 * needed doesn't overflow size_t 8077 */ 8078 nchars = len * str->length; 8079 if (nchars / len != str->length) { 
8080 PyErr_SetString(PyExc_OverflowError, 8081 "repeated string is too long"); 8082 return NULL; 8083 } 8084 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 8085 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 8086 PyErr_SetString(PyExc_OverflowError, 8087 "repeated string is too long"); 8088 return NULL; 8089 } 8090 u = _PyUnicode_New(nchars); 8091 if (!u) 8092 return NULL; 8093 8094 p = u->str; 8095 8096 if (str->length == 1) { 8097 Py_UNICODE_FILL(p, str->str[0], len); 8098 } else { 8099 Py_ssize_t done = str->length; /* number of characters copied this far */ 8100 Py_UNICODE_COPY(p, str->str, str->length); 8101 while (done < nchars) { 8102 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; 8103 Py_UNICODE_COPY(p+done, p, n); 8104 done += n; 8105 } 8106 } 8107 8108 return (PyObject*) u; 8109} 8110 8111PyObject *PyUnicode_Replace(PyObject *obj, 8112 PyObject *subobj, 8113 PyObject *replobj, 8114 Py_ssize_t maxcount) 8115{ 8116 PyObject *self; 8117 PyObject *str1; 8118 PyObject *str2; 8119 PyObject *result; 8120 8121 self = PyUnicode_FromObject(obj); 8122 if (self == NULL) 8123 return NULL; 8124 str1 = PyUnicode_FromObject(subobj); 8125 if (str1 == NULL) { 8126 Py_DECREF(self); 8127 return NULL; 8128 } 8129 str2 = PyUnicode_FromObject(replobj); 8130 if (str2 == NULL) { 8131 Py_DECREF(self); 8132 Py_DECREF(str1); 8133 return NULL; 8134 } 8135 result = replace((PyUnicodeObject *)self, 8136 (PyUnicodeObject *)str1, 8137 (PyUnicodeObject *)str2, 8138 maxcount); 8139 Py_DECREF(self); 8140 Py_DECREF(str1); 8141 Py_DECREF(str2); 8142 return result; 8143} 8144 8145PyDoc_STRVAR(replace__doc__, 8146 "S.replace(old, new[, count]) -> str\n\ 8147\n\ 8148Return a copy of S with all occurrences of substring\n\ 8149old replaced by new. 
If the optional argument count is\n\ 8150given, only the first count occurrences are replaced."); 8151 8152static PyObject* 8153unicode_replace(PyUnicodeObject *self, PyObject *args) 8154{ 8155 PyUnicodeObject *str1; 8156 PyUnicodeObject *str2; 8157 Py_ssize_t maxcount = -1; 8158 PyObject *result; 8159 8160 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 8161 return NULL; 8162 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 8163 if (str1 == NULL) 8164 return NULL; 8165 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 8166 if (str2 == NULL) { 8167 Py_DECREF(str1); 8168 return NULL; 8169 } 8170 8171 result = replace(self, str1, str2, maxcount); 8172 8173 Py_DECREF(str1); 8174 Py_DECREF(str2); 8175 return result; 8176} 8177 8178static 8179PyObject *unicode_repr(PyObject *unicode) 8180{ 8181 PyObject *repr; 8182 Py_UNICODE *p; 8183 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 8184 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 8185 8186 /* XXX(nnorwitz): rather than over-allocating, it would be 8187 better to choose a different scheme. Perhaps scan the 8188 first N-chars of the string and allocate based on that size. 8189 */ 8190 /* Initial allocation is based on the longest-possible unichr 8191 escape. 8192 8193 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 8194 unichr, so in this case it's the longest unichr escape. In 8195 narrow (UTF-16) builds this is five chars per source unichr 8196 since there are two unichrs in the surrogate pair, so in narrow 8197 (UTF-16) builds it's not the longest unichr escape. 8198 8199 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 8200 so in the narrow (UTF-16) build case it's the longest unichr 8201 escape. 
8202 */ 8203 8204 repr = PyUnicode_FromUnicode(NULL, 8205 2 /* quotes */ 8206#ifdef Py_UNICODE_WIDE 8207 + 10*size 8208#else 8209 + 6*size 8210#endif 8211 + 1); 8212 if (repr == NULL) 8213 return NULL; 8214 8215 p = PyUnicode_AS_UNICODE(repr); 8216 8217 /* Add quote */ 8218 *p++ = (findchar(s, size, '\'') && 8219 !findchar(s, size, '"')) ? '"' : '\''; 8220 while (size-- > 0) { 8221 Py_UNICODE ch = *s++; 8222 8223 /* Escape quotes and backslashes */ 8224 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 8225 *p++ = '\\'; 8226 *p++ = ch; 8227 continue; 8228 } 8229 8230 /* Map special whitespace to '\t', \n', '\r' */ 8231 if (ch == '\t') { 8232 *p++ = '\\'; 8233 *p++ = 't'; 8234 } 8235 else if (ch == '\n') { 8236 *p++ = '\\'; 8237 *p++ = 'n'; 8238 } 8239 else if (ch == '\r') { 8240 *p++ = '\\'; 8241 *p++ = 'r'; 8242 } 8243 8244 /* Map non-printable US ASCII to '\xhh' */ 8245 else if (ch < ' ' || ch == 0x7F) { 8246 *p++ = '\\'; 8247 *p++ = 'x'; 8248 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8249 *p++ = hexdigits[ch & 0x000F]; 8250 } 8251 8252 /* Copy ASCII characters as-is */ 8253 else if (ch < 0x7F) { 8254 *p++ = ch; 8255 } 8256 8257 /* Non-ASCII characters */ 8258 else { 8259 Py_UCS4 ucs = ch; 8260 8261#ifndef Py_UNICODE_WIDE 8262 Py_UNICODE ch2 = 0; 8263 /* Get code point from surrogate pair */ 8264 if (size > 0) { 8265 ch2 = *s; 8266 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 8267 && ch2 <= 0xDFFF) { 8268 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 8269 + 0x00010000; 8270 s++; 8271 size--; 8272 } 8273 } 8274#endif 8275 /* Map Unicode whitespace and control characters 8276 (categories Z* and C* except ASCII space) 8277 */ 8278 if (!Py_UNICODE_ISPRINTABLE(ucs)) { 8279 /* Map 8-bit characters to '\xhh' */ 8280 if (ucs <= 0xff) { 8281 *p++ = '\\'; 8282 *p++ = 'x'; 8283 *p++ = hexdigits[(ch >> 4) & 0x000F]; 8284 *p++ = hexdigits[ch & 0x000F]; 8285 } 8286 /* Map 21-bit characters to '\U00xxxxxx' */ 8287 else if (ucs >= 0x10000) { 8288 *p++ = '\\'; 8289 
*p++ = 'U'; 8290 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 8291 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 8292 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 8293 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 8294 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 8295 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 8296 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 8297 *p++ = hexdigits[ucs & 0x0000000F]; 8298 } 8299 /* Map 16-bit characters to '\uxxxx' */ 8300 else { 8301 *p++ = '\\'; 8302 *p++ = 'u'; 8303 *p++ = hexdigits[(ucs >> 12) & 0x000F]; 8304 *p++ = hexdigits[(ucs >> 8) & 0x000F]; 8305 *p++ = hexdigits[(ucs >> 4) & 0x000F]; 8306 *p++ = hexdigits[ucs & 0x000F]; 8307 } 8308 } 8309 /* Copy characters as-is */ 8310 else { 8311 *p++ = ch; 8312#ifndef Py_UNICODE_WIDE 8313 if (ucs >= 0x10000) 8314 *p++ = ch2; 8315#endif 8316 } 8317 } 8318 } 8319 /* Add quote */ 8320 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 8321 8322 *p = '\0'; 8323 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 8324 return repr; 8325} 8326 8327PyDoc_STRVAR(rfind__doc__, 8328 "S.rfind(sub[, start[, end]]) -> int\n\ 8329\n\ 8330Return the highest index in S where substring sub is found,\n\ 8331such that sub is contained within s[start:end]. 
Optional\n\ 8332arguments start and end are interpreted as in slice notation.\n\ 8333\n\ 8334Return -1 on failure."); 8335 8336static PyObject * 8337unicode_rfind(PyUnicodeObject *self, PyObject *args) 8338{ 8339 PyObject *substring; 8340 Py_ssize_t start; 8341 Py_ssize_t end; 8342 Py_ssize_t result; 8343 8344 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8345 return NULL; 8346 8347 result = stringlib_rfind_slice( 8348 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8349 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8350 start, end 8351 ); 8352 8353 Py_DECREF(substring); 8354 8355 return PyLong_FromSsize_t(result); 8356} 8357 8358PyDoc_STRVAR(rindex__doc__, 8359 "S.rindex(sub[, start[, end]]) -> int\n\ 8360\n\ 8361Like S.rfind() but raise ValueError when the substring is not found."); 8362 8363static PyObject * 8364unicode_rindex(PyUnicodeObject *self, PyObject *args) 8365{ 8366 PyObject *substring; 8367 Py_ssize_t start; 8368 Py_ssize_t end; 8369 Py_ssize_t result; 8370 8371 if (!_ParseTupleFinds(args, &substring, &start, &end)) 8372 return NULL; 8373 8374 result = stringlib_rfind_slice( 8375 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 8376 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 8377 start, end 8378 ); 8379 8380 Py_DECREF(substring); 8381 8382 if (result < 0) { 8383 PyErr_SetString(PyExc_ValueError, "substring not found"); 8384 return NULL; 8385 } 8386 return PyLong_FromSsize_t(result); 8387} 8388 8389PyDoc_STRVAR(rjust__doc__, 8390 "S.rjust(width[, fillchar]) -> str\n\ 8391\n\ 8392Return S right-justified in a string of length width. 
Padding is\n\ 8393done using the specified fill character (default is a space)."); 8394 8395static PyObject * 8396unicode_rjust(PyUnicodeObject *self, PyObject *args) 8397{ 8398 Py_ssize_t width; 8399 Py_UNICODE fillchar = ' '; 8400 8401 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 8402 return NULL; 8403 8404 if (self->length >= width && PyUnicode_CheckExact(self)) { 8405 Py_INCREF(self); 8406 return (PyObject*) self; 8407 } 8408 8409 return (PyObject*) pad(self, width - self->length, 0, fillchar); 8410} 8411 8412PyObject *PyUnicode_Split(PyObject *s, 8413 PyObject *sep, 8414 Py_ssize_t maxsplit) 8415{ 8416 PyObject *result; 8417 8418 s = PyUnicode_FromObject(s); 8419 if (s == NULL) 8420 return NULL; 8421 if (sep != NULL) { 8422 sep = PyUnicode_FromObject(sep); 8423 if (sep == NULL) { 8424 Py_DECREF(s); 8425 return NULL; 8426 } 8427 } 8428 8429 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8430 8431 Py_DECREF(s); 8432 Py_XDECREF(sep); 8433 return result; 8434} 8435 8436PyDoc_STRVAR(split__doc__, 8437 "S.split([sep[, maxsplit]]) -> list of strings\n\ 8438\n\ 8439Return a list of the words in S, using sep as the\n\ 8440delimiter string. If maxsplit is given, at most maxsplit\n\ 8441splits are done. 
If sep is not specified or is None, any\n\ 8442whitespace string is a separator and empty strings are\n\ 8443removed from the result."); 8444 8445static PyObject* 8446unicode_split(PyUnicodeObject *self, PyObject *args) 8447{ 8448 PyObject *substring = Py_None; 8449 Py_ssize_t maxcount = -1; 8450 8451 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 8452 return NULL; 8453 8454 if (substring == Py_None) 8455 return split(self, NULL, maxcount); 8456 else if (PyUnicode_Check(substring)) 8457 return split(self, (PyUnicodeObject *)substring, maxcount); 8458 else 8459 return PyUnicode_Split((PyObject *)self, substring, maxcount); 8460} 8461 8462PyObject * 8463PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 8464{ 8465 PyObject* str_obj; 8466 PyObject* sep_obj; 8467 PyObject* out; 8468 8469 str_obj = PyUnicode_FromObject(str_in); 8470 if (!str_obj) 8471 return NULL; 8472 sep_obj = PyUnicode_FromObject(sep_in); 8473 if (!sep_obj) { 8474 Py_DECREF(str_obj); 8475 return NULL; 8476 } 8477 8478 out = stringlib_partition( 8479 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8480 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8481 ); 8482 8483 Py_DECREF(sep_obj); 8484 Py_DECREF(str_obj); 8485 8486 return out; 8487} 8488 8489 8490PyObject * 8491PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 8492{ 8493 PyObject* str_obj; 8494 PyObject* sep_obj; 8495 PyObject* out; 8496 8497 str_obj = PyUnicode_FromObject(str_in); 8498 if (!str_obj) 8499 return NULL; 8500 sep_obj = PyUnicode_FromObject(sep_in); 8501 if (!sep_obj) { 8502 Py_DECREF(str_obj); 8503 return NULL; 8504 } 8505 8506 out = stringlib_rpartition( 8507 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 8508 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 8509 ); 8510 8511 Py_DECREF(sep_obj); 8512 Py_DECREF(str_obj); 8513 8514 return out; 8515} 8516 8517PyDoc_STRVAR(partition__doc__, 8518 "S.partition(sep) -> (head, 
sep, tail)\n\ 8519\n\ 8520Search for the separator sep in S, and return the part before it,\n\ 8521the separator itself, and the part after it. If the separator is not\n\ 8522found, return S and two empty strings."); 8523 8524static PyObject* 8525unicode_partition(PyUnicodeObject *self, PyObject *separator) 8526{ 8527 return PyUnicode_Partition((PyObject *)self, separator); 8528} 8529 8530PyDoc_STRVAR(rpartition__doc__, 8531 "S.rpartition(sep) -> (head, sep, tail)\n\ 8532\n\ 8533Search for the separator sep in S, starting at the end of S, and return\n\ 8534the part before it, the separator itself, and the part after it. If the\n\ 8535separator is not found, return two empty strings and S."); 8536 8537static PyObject* 8538unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 8539{ 8540 return PyUnicode_RPartition((PyObject *)self, separator); 8541} 8542 8543PyObject *PyUnicode_RSplit(PyObject *s, 8544 PyObject *sep, 8545 Py_ssize_t maxsplit) 8546{ 8547 PyObject *result; 8548 8549 s = PyUnicode_FromObject(s); 8550 if (s == NULL) 8551 return NULL; 8552 if (sep != NULL) { 8553 sep = PyUnicode_FromObject(sep); 8554 if (sep == NULL) { 8555 Py_DECREF(s); 8556 return NULL; 8557 } 8558 } 8559 8560 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 8561 8562 Py_DECREF(s); 8563 Py_XDECREF(sep); 8564 return result; 8565} 8566 8567PyDoc_STRVAR(rsplit__doc__, 8568 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 8569\n\ 8570Return a list of the words in S, using sep as the\n\ 8571delimiter string, starting at the end of the string and\n\ 8572working to the front. If maxsplit is given, at most maxsplit\n\ 8573splits are done. 
If sep is not specified, any whitespace string\n\ 8574is a separator."); 8575 8576static PyObject* 8577unicode_rsplit(PyUnicodeObject *self, PyObject *args) 8578{ 8579 PyObject *substring = Py_None; 8580 Py_ssize_t maxcount = -1; 8581 8582 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 8583 return NULL; 8584 8585 if (substring == Py_None) 8586 return rsplit(self, NULL, maxcount); 8587 else if (PyUnicode_Check(substring)) 8588 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 8589 else 8590 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 8591} 8592 8593PyDoc_STRVAR(splitlines__doc__, 8594 "S.splitlines([keepends]) -> list of strings\n\ 8595\n\ 8596Return a list of the lines in S, breaking at line boundaries.\n\ 8597Line breaks are not included in the resulting list unless keepends\n\ 8598is given and true."); 8599 8600static PyObject* 8601unicode_splitlines(PyUnicodeObject *self, PyObject *args) 8602{ 8603 int keepends = 0; 8604 8605 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 8606 return NULL; 8607 8608 return PyUnicode_Splitlines((PyObject *)self, keepends); 8609} 8610 8611static 8612PyObject *unicode_str(PyObject *self) 8613{ 8614 if (PyUnicode_CheckExact(self)) { 8615 Py_INCREF(self); 8616 return self; 8617 } else 8618 /* Subtype -- return genuine unicode string with the same value. 
*/ 8619 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 8620 PyUnicode_GET_SIZE(self)); 8621} 8622 8623PyDoc_STRVAR(swapcase__doc__, 8624 "S.swapcase() -> str\n\ 8625\n\ 8626Return a copy of S with uppercase characters converted to lowercase\n\ 8627and vice versa."); 8628 8629static PyObject* 8630unicode_swapcase(PyUnicodeObject *self) 8631{ 8632 return fixup(self, fixswapcase); 8633} 8634 8635PyDoc_STRVAR(maketrans__doc__, 8636 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 8637\n\ 8638Return a translation table usable for str.translate().\n\ 8639If there is only one argument, it must be a dictionary mapping Unicode\n\ 8640ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 8641Character keys will be then converted to ordinals.\n\ 8642If there are two arguments, they must be strings of equal length, and\n\ 8643in the resulting dictionary, each character in x will be mapped to the\n\ 8644character at the same position in y. If there is a third argument, it\n\ 8645must be a string, whose characters will be mapped to None in the result."); 8646 8647static PyObject* 8648unicode_maketrans(PyUnicodeObject *null, PyObject *args) 8649{ 8650 PyObject *x, *y = NULL, *z = NULL; 8651 PyObject *new = NULL, *key, *value; 8652 Py_ssize_t i = 0; 8653 int res; 8654 8655 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 8656 return NULL; 8657 new = PyDict_New(); 8658 if (!new) 8659 return NULL; 8660 if (y != NULL) { 8661 /* x must be a string too, of equal length */ 8662 Py_ssize_t ylen = PyUnicode_GET_SIZE(y); 8663 if (!PyUnicode_Check(x)) { 8664 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 8665 "be a string if there is a second argument"); 8666 goto err; 8667 } 8668 if (PyUnicode_GET_SIZE(x) != ylen) { 8669 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 8670 "arguments must have equal length"); 8671 goto err; 8672 } 8673 /* create entries for translating chars in x to those in y */ 8674 for (i = 
0; i < PyUnicode_GET_SIZE(x); i++) { 8675 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); 8676 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); 8677 if (!key || !value) 8678 goto err; 8679 res = PyDict_SetItem(new, key, value); 8680 Py_DECREF(key); 8681 Py_DECREF(value); 8682 if (res < 0) 8683 goto err; 8684 } 8685 /* create entries for deleting chars in z */ 8686 if (z != NULL) { 8687 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 8688 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]); 8689 if (!key) 8690 goto err; 8691 res = PyDict_SetItem(new, key, Py_None); 8692 Py_DECREF(key); 8693 if (res < 0) 8694 goto err; 8695 } 8696 } 8697 } else { 8698 /* x must be a dict */ 8699 if (!PyDict_CheckExact(x)) { 8700 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 8701 "to maketrans it must be a dict"); 8702 goto err; 8703 } 8704 /* copy entries into the new dict, converting string keys to int keys */ 8705 while (PyDict_Next(x, &i, &key, &value)) { 8706 if (PyUnicode_Check(key)) { 8707 /* convert string keys to integer keys */ 8708 PyObject *newkey; 8709 if (PyUnicode_GET_SIZE(key) != 1) { 8710 PyErr_SetString(PyExc_ValueError, "string keys in translate " 8711 "table must be of length 1"); 8712 goto err; 8713 } 8714 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]); 8715 if (!newkey) 8716 goto err; 8717 res = PyDict_SetItem(new, newkey, value); 8718 Py_DECREF(newkey); 8719 if (res < 0) 8720 goto err; 8721 } else if (PyLong_Check(key)) { 8722 /* just keep integer keys */ 8723 if (PyDict_SetItem(new, key, value) < 0) 8724 goto err; 8725 } else { 8726 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 8727 "be strings or integers"); 8728 goto err; 8729 } 8730 } 8731 } 8732 return new; 8733 err: 8734 Py_DECREF(new); 8735 return NULL; 8736} 8737 8738PyDoc_STRVAR(translate__doc__, 8739 "S.translate(table) -> str\n\ 8740\n\ 8741Return a copy of the string S, where all characters have been mapped\n\ 8742through the given translation table, 
which must be a mapping of\n\ 8743Unicode ordinals to Unicode ordinals, strings, or None.\n\ 8744Unmapped characters are left untouched. Characters mapped to None\n\ 8745are deleted."); 8746 8747static PyObject* 8748unicode_translate(PyUnicodeObject *self, PyObject *table) 8749{ 8750 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); 8751} 8752 8753PyDoc_STRVAR(upper__doc__, 8754 "S.upper() -> str\n\ 8755\n\ 8756Return a copy of S converted to uppercase."); 8757 8758static PyObject* 8759unicode_upper(PyUnicodeObject *self) 8760{ 8761 return fixup(self, fixupper); 8762} 8763 8764PyDoc_STRVAR(zfill__doc__, 8765 "S.zfill(width) -> str\n\ 8766\n\ 8767Pad a numeric string S with zeros on the left, to fill a field\n\ 8768of the specified width. The string S is never truncated."); 8769 8770static PyObject * 8771unicode_zfill(PyUnicodeObject *self, PyObject *args) 8772{ 8773 Py_ssize_t fill; 8774 PyUnicodeObject *u; 8775 8776 Py_ssize_t width; 8777 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 8778 return NULL; 8779 8780 if (self->length >= width) { 8781 if (PyUnicode_CheckExact(self)) { 8782 Py_INCREF(self); 8783 return (PyObject*) self; 8784 } 8785 else 8786 return PyUnicode_FromUnicode( 8787 PyUnicode_AS_UNICODE(self), 8788 PyUnicode_GET_SIZE(self) 8789 ); 8790 } 8791 8792 fill = width - self->length; 8793 8794 u = pad(self, fill, 0, '0'); 8795 8796 if (u == NULL) 8797 return NULL; 8798 8799 if (u->str[fill] == '+' || u->str[fill] == '-') { 8800 /* move sign to beginning of string */ 8801 u->str[0] = u->str[fill]; 8802 u->str[fill] = '0'; 8803 } 8804 8805 return (PyObject*) u; 8806} 8807 8808#if 0 8809static PyObject* 8810unicode_freelistsize(PyUnicodeObject *self) 8811{ 8812 return PyLong_FromLong(numfree); 8813} 8814#endif 8815 8816PyDoc_STRVAR(startswith__doc__, 8817 "S.startswith(prefix[, start[, end]]) -> bool\n\ 8818\n\ 8819Return True if S starts with the specified prefix, False otherwise.\n\ 8820With optional start, test S beginning 
at that position.\n\ 8821With optional end, stop comparing S at that position.\n\ 8822prefix can also be a tuple of strings to try."); 8823 8824static PyObject * 8825unicode_startswith(PyUnicodeObject *self, 8826 PyObject *args) 8827{ 8828 PyObject *subobj; 8829 PyUnicodeObject *substring; 8830 Py_ssize_t start = 0; 8831 Py_ssize_t end = PY_SSIZE_T_MAX; 8832 int result; 8833 8834 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 8835 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8836 return NULL; 8837 if (PyTuple_Check(subobj)) { 8838 Py_ssize_t i; 8839 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 8840 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8841 PyTuple_GET_ITEM(subobj, i)); 8842 if (substring == NULL) 8843 return NULL; 8844 result = tailmatch(self, substring, start, end, -1); 8845 Py_DECREF(substring); 8846 if (result) { 8847 Py_RETURN_TRUE; 8848 } 8849 } 8850 /* nothing matched */ 8851 Py_RETURN_FALSE; 8852 } 8853 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8854 if (substring == NULL) 8855 return NULL; 8856 result = tailmatch(self, substring, start, end, -1); 8857 Py_DECREF(substring); 8858 return PyBool_FromLong(result); 8859} 8860 8861 8862PyDoc_STRVAR(endswith__doc__, 8863 "S.endswith(suffix[, start[, end]]) -> bool\n\ 8864\n\ 8865Return True if S ends with the specified suffix, False otherwise.\n\ 8866With optional start, test S beginning at that position.\n\ 8867With optional end, stop comparing S at that position.\n\ 8868suffix can also be a tuple of strings to try."); 8869 8870static PyObject * 8871unicode_endswith(PyUnicodeObject *self, 8872 PyObject *args) 8873{ 8874 PyObject *subobj; 8875 PyUnicodeObject *substring; 8876 Py_ssize_t start = 0; 8877 Py_ssize_t end = PY_SSIZE_T_MAX; 8878 int result; 8879 8880 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 8881 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 8882 return NULL; 8883 if (PyTuple_Check(subobj)) { 8884 Py_ssize_t i; 8885 for (i 
= 0; i < PyTuple_GET_SIZE(subobj); i++) { 8886 substring = (PyUnicodeObject *)PyUnicode_FromObject( 8887 PyTuple_GET_ITEM(subobj, i)); 8888 if (substring == NULL) 8889 return NULL; 8890 result = tailmatch(self, substring, start, end, +1); 8891 Py_DECREF(substring); 8892 if (result) { 8893 Py_RETURN_TRUE; 8894 } 8895 } 8896 Py_RETURN_FALSE; 8897 } 8898 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 8899 if (substring == NULL) 8900 return NULL; 8901 8902 result = tailmatch(self, substring, start, end, +1); 8903 Py_DECREF(substring); 8904 return PyBool_FromLong(result); 8905} 8906 8907#include "stringlib/string_format.h" 8908 8909PyDoc_STRVAR(format__doc__, 8910 "S.format(*args, **kwargs) -> str\n\ 8911\n\ 8912"); 8913 8914static PyObject * 8915unicode__format__(PyObject* self, PyObject* args) 8916{ 8917 PyObject *format_spec; 8918 8919 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 8920 return NULL; 8921 8922 return _PyUnicode_FormatAdvanced(self, 8923 PyUnicode_AS_UNICODE(format_spec), 8924 PyUnicode_GET_SIZE(format_spec)); 8925} 8926 8927PyDoc_STRVAR(p_format__doc__, 8928 "S.__format__(format_spec) -> str\n\ 8929\n\ 8930"); 8931 8932static PyObject * 8933unicode__sizeof__(PyUnicodeObject *v) 8934{ 8935 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + 8936 sizeof(Py_UNICODE) * (v->length + 1)); 8937} 8938 8939PyDoc_STRVAR(sizeof__doc__, 8940 "S.__sizeof__() -> size of S in memory, in bytes"); 8941 8942static PyObject * 8943unicode_getnewargs(PyUnicodeObject *v) 8944{ 8945 return Py_BuildValue("(u#)", v->str, v->length); 8946} 8947 8948 8949static PyMethodDef unicode_methods[] = { 8950 8951 /* Order is according to common usage: often used methods should 8952 appear first, since lookup is done sequentially. 
*/ 8953 8954 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 8955 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 8956 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 8957 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 8958 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 8959 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 8960 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 8961 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 8962 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 8963 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 8964 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 8965 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 8966 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 8967 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 8968 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 8969 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 8970 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 8971 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 8972 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 8973 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 8974 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 8975 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 8976 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 8977 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 8978 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 8979 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, 
upper__doc__}, 8980 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 8981 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 8982 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 8983 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 8984 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 8985 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 8986 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 8987 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 8988 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 8989 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 8990 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 8991 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 8992 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 8993 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 8994 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 8995 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 8996 {"maketrans", (PyCFunction) unicode_maketrans, 8997 METH_VARARGS | METH_STATIC, maketrans__doc__}, 8998 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 8999#if 0 9000 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 9001#endif 9002 9003#if 0 9004 /* This one is just used for debugging the implementation. 
*/ 9005 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 9006#endif 9007 9008 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 9009 {NULL, NULL} 9010}; 9011 9012static PyObject * 9013unicode_mod(PyObject *v, PyObject *w) 9014{ 9015 if (!PyUnicode_Check(v)) { 9016 Py_INCREF(Py_NotImplemented); 9017 return Py_NotImplemented; 9018 } 9019 return PyUnicode_Format(v, w); 9020} 9021 9022static PyNumberMethods unicode_as_number = { 9023 0, /*nb_add*/ 9024 0, /*nb_subtract*/ 9025 0, /*nb_multiply*/ 9026 unicode_mod, /*nb_remainder*/ 9027}; 9028 9029static PySequenceMethods unicode_as_sequence = { 9030 (lenfunc) unicode_length, /* sq_length */ 9031 PyUnicode_Concat, /* sq_concat */ 9032 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 9033 (ssizeargfunc) unicode_getitem, /* sq_item */ 9034 0, /* sq_slice */ 9035 0, /* sq_ass_item */ 9036 0, /* sq_ass_slice */ 9037 PyUnicode_Contains, /* sq_contains */ 9038}; 9039 9040static PyObject* 9041unicode_subscript(PyUnicodeObject* self, PyObject* item) 9042{ 9043 if (PyIndex_Check(item)) { 9044 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 9045 if (i == -1 && PyErr_Occurred()) 9046 return NULL; 9047 if (i < 0) 9048 i += PyUnicode_GET_SIZE(self); 9049 return unicode_getitem(self, i); 9050 } else if (PySlice_Check(item)) { 9051 Py_ssize_t start, stop, step, slicelength, cur, i; 9052 Py_UNICODE* source_buf; 9053 Py_UNICODE* result_buf; 9054 PyObject* result; 9055 9056 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 9057 &start, &stop, &step, &slicelength) < 0) { 9058 return NULL; 9059 } 9060 9061 if (slicelength <= 0) { 9062 return PyUnicode_FromUnicode(NULL, 0); 9063 } else if (start == 0 && step == 1 && slicelength == self->length && 9064 PyUnicode_CheckExact(self)) { 9065 Py_INCREF(self); 9066 return (PyObject *)self; 9067 } else if (step == 1) { 9068 return PyUnicode_FromUnicode(self->str + start, slicelength); 9069 } else { 9070 source_buf = 
PyUnicode_AS_UNICODE((PyObject*)self); 9071 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 9072 sizeof(Py_UNICODE)); 9073 9074 if (result_buf == NULL) 9075 return PyErr_NoMemory(); 9076 9077 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 9078 result_buf[i] = source_buf[cur]; 9079 } 9080 9081 result = PyUnicode_FromUnicode(result_buf, slicelength); 9082 PyObject_FREE(result_buf); 9083 return result; 9084 } 9085 } else { 9086 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 9087 return NULL; 9088 } 9089} 9090 9091static PyMappingMethods unicode_as_mapping = { 9092 (lenfunc)unicode_length, /* mp_length */ 9093 (binaryfunc)unicode_subscript, /* mp_subscript */ 9094 (objobjargproc)0, /* mp_ass_subscript */ 9095}; 9096 9097 9098/* Helpers for PyUnicode_Format() */ 9099 9100static PyObject * 9101getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 9102{ 9103 Py_ssize_t argidx = *p_argidx; 9104 if (argidx < arglen) { 9105 (*p_argidx)++; 9106 if (arglen < 0) 9107 return args; 9108 else 9109 return PyTuple_GetItem(args, argidx); 9110 } 9111 PyErr_SetString(PyExc_TypeError, 9112 "not enough arguments for format string"); 9113 return NULL; 9114} 9115 9116/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 9117 9118static PyObject * 9119formatfloat(PyObject *v, int flags, int prec, int type) 9120{ 9121 char *p; 9122 PyObject *result; 9123 double x; 9124 9125 x = PyFloat_AsDouble(v); 9126 if (x == -1.0 && PyErr_Occurred()) 9127 return NULL; 9128 9129 if (prec < 0) 9130 prec = 6; 9131 9132 p = PyOS_double_to_string(x, type, prec, 9133 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 9134 if (p == NULL) 9135 return NULL; 9136 result = PyUnicode_FromStringAndSize(p, strlen(p)); 9137 PyMem_Free(p); 9138 return result; 9139} 9140 9141static PyObject* 9142formatlong(PyObject *val, int flags, int prec, int type) 9143{ 9144 char *buf; 9145 int len; 9146 PyObject *str; /* temporary string object. 
*/ 9147 PyObject *result; 9148 9149 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 9150 if (!str) 9151 return NULL; 9152 result = PyUnicode_FromStringAndSize(buf, len); 9153 Py_DECREF(str); 9154 return result; 9155} 9156 9157static int 9158formatchar(Py_UNICODE *buf, 9159 size_t buflen, 9160 PyObject *v) 9161{ 9162 /* presume that the buffer is at least 3 characters long */ 9163 if (PyUnicode_Check(v)) { 9164 if (PyUnicode_GET_SIZE(v) == 1) { 9165 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 9166 buf[1] = '\0'; 9167 return 1; 9168 } 9169#ifndef Py_UNICODE_WIDE 9170 if (PyUnicode_GET_SIZE(v) == 2) { 9171 /* Decode a valid surrogate pair */ 9172 int c0 = PyUnicode_AS_UNICODE(v)[0]; 9173 int c1 = PyUnicode_AS_UNICODE(v)[1]; 9174 if (0xD800 <= c0 && c0 <= 0xDBFF && 9175 0xDC00 <= c1 && c1 <= 0xDFFF) { 9176 buf[0] = c0; 9177 buf[1] = c1; 9178 buf[2] = '\0'; 9179 return 2; 9180 } 9181 } 9182#endif 9183 goto onError; 9184 } 9185 else { 9186 /* Integer input truncated to a character */ 9187 long x; 9188 x = PyLong_AsLong(v); 9189 if (x == -1 && PyErr_Occurred()) 9190 goto onError; 9191 9192 if (x < 0 || x > 0x10ffff) { 9193 PyErr_SetString(PyExc_OverflowError, 9194 "%c arg not in range(0x110000)"); 9195 return -1; 9196 } 9197 9198#ifndef Py_UNICODE_WIDE 9199 if (x > 0xffff) { 9200 x -= 0x10000; 9201 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); 9202 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); 9203 return 2; 9204 } 9205#endif 9206 buf[0] = (Py_UNICODE) x; 9207 buf[1] = '\0'; 9208 return 1; 9209 } 9210 9211 onError: 9212 PyErr_SetString(PyExc_TypeError, 9213 "%c requires int or char"); 9214 return -1; 9215} 9216 9217/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 9218 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
9219*/ 9220#define FORMATBUFLEN (size_t)10 9221 9222PyObject *PyUnicode_Format(PyObject *format, 9223 PyObject *args) 9224{ 9225 Py_UNICODE *fmt, *res; 9226 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 9227 int args_owned = 0; 9228 PyUnicodeObject *result = NULL; 9229 PyObject *dict = NULL; 9230 PyObject *uformat; 9231 9232 if (format == NULL || args == NULL) { 9233 PyErr_BadInternalCall(); 9234 return NULL; 9235 } 9236 uformat = PyUnicode_FromObject(format); 9237 if (uformat == NULL) 9238 return NULL; 9239 fmt = PyUnicode_AS_UNICODE(uformat); 9240 fmtcnt = PyUnicode_GET_SIZE(uformat); 9241 9242 reslen = rescnt = fmtcnt + 100; 9243 result = _PyUnicode_New(reslen); 9244 if (result == NULL) 9245 goto onError; 9246 res = PyUnicode_AS_UNICODE(result); 9247 9248 if (PyTuple_Check(args)) { 9249 arglen = PyTuple_Size(args); 9250 argidx = 0; 9251 } 9252 else { 9253 arglen = -1; 9254 argidx = -2; 9255 } 9256 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 9257 !PyUnicode_Check(args)) 9258 dict = args; 9259 9260 while (--fmtcnt >= 0) { 9261 if (*fmt != '%') { 9262 if (--rescnt < 0) { 9263 rescnt = fmtcnt + 100; 9264 reslen += rescnt; 9265 if (_PyUnicode_Resize(&result, reslen) < 0) 9266 goto onError; 9267 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 9268 --rescnt; 9269 } 9270 *res++ = *fmt++; 9271 } 9272 else { 9273 /* Got a format specifier */ 9274 int flags = 0; 9275 Py_ssize_t width = -1; 9276 int prec = -1; 9277 Py_UNICODE c = '\0'; 9278 Py_UNICODE fill; 9279 int isnumok; 9280 PyObject *v = NULL; 9281 PyObject *temp = NULL; 9282 Py_UNICODE *pbuf; 9283 Py_UNICODE sign; 9284 Py_ssize_t len; 9285 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */ 9286 9287 fmt++; 9288 if (*fmt == '(') { 9289 Py_UNICODE *keystart; 9290 Py_ssize_t keylen; 9291 PyObject *key; 9292 int pcount = 1; 9293 9294 if (dict == NULL) { 9295 PyErr_SetString(PyExc_TypeError, 9296 "format requires a mapping"); 9297 goto onError; 9298 } 9299 ++fmt; 9300 --fmtcnt; 9301 
keystart = fmt; 9302 /* Skip over balanced parentheses */ 9303 while (pcount > 0 && --fmtcnt >= 0) { 9304 if (*fmt == ')') 9305 --pcount; 9306 else if (*fmt == '(') 9307 ++pcount; 9308 fmt++; 9309 } 9310 keylen = fmt - keystart - 1; 9311 if (fmtcnt < 0 || pcount > 0) { 9312 PyErr_SetString(PyExc_ValueError, 9313 "incomplete format key"); 9314 goto onError; 9315 } 9316#if 0 9317 /* keys are converted to strings using UTF-8 and 9318 then looked up since Python uses strings to hold 9319 variables names etc. in its namespaces and we 9320 wouldn't want to break common idioms. */ 9321 key = PyUnicode_EncodeUTF8(keystart, 9322 keylen, 9323 NULL); 9324#else 9325 key = PyUnicode_FromUnicode(keystart, keylen); 9326#endif 9327 if (key == NULL) 9328 goto onError; 9329 if (args_owned) { 9330 Py_DECREF(args); 9331 args_owned = 0; 9332 } 9333 args = PyObject_GetItem(dict, key); 9334 Py_DECREF(key); 9335 if (args == NULL) { 9336 goto onError; 9337 } 9338 args_owned = 1; 9339 arglen = -1; 9340 argidx = -2; 9341 } 9342 while (--fmtcnt >= 0) { 9343 switch (c = *fmt++) { 9344 case '-': flags |= F_LJUST; continue; 9345 case '+': flags |= F_SIGN; continue; 9346 case ' ': flags |= F_BLANK; continue; 9347 case '#': flags |= F_ALT; continue; 9348 case '0': flags |= F_ZERO; continue; 9349 } 9350 break; 9351 } 9352 if (c == '*') { 9353 v = getnextarg(args, arglen, &argidx); 9354 if (v == NULL) 9355 goto onError; 9356 if (!PyLong_Check(v)) { 9357 PyErr_SetString(PyExc_TypeError, 9358 "* wants int"); 9359 goto onError; 9360 } 9361 width = PyLong_AsLong(v); 9362 if (width == -1 && PyErr_Occurred()) 9363 goto onError; 9364 if (width < 0) { 9365 flags |= F_LJUST; 9366 width = -width; 9367 } 9368 if (--fmtcnt >= 0) 9369 c = *fmt++; 9370 } 9371 else if (c >= '0' && c <= '9') { 9372 width = c - '0'; 9373 while (--fmtcnt >= 0) { 9374 c = *fmt++; 9375 if (c < '0' || c > '9') 9376 break; 9377 if ((width*10) / 10 != width) { 9378 PyErr_SetString(PyExc_ValueError, 9379 "width too big"); 9380 goto 
onError; 9381 } 9382 width = width*10 + (c - '0'); 9383 } 9384 } 9385 if (c == '.') { 9386 prec = 0; 9387 if (--fmtcnt >= 0) 9388 c = *fmt++; 9389 if (c == '*') { 9390 v = getnextarg(args, arglen, &argidx); 9391 if (v == NULL) 9392 goto onError; 9393 if (!PyLong_Check(v)) { 9394 PyErr_SetString(PyExc_TypeError, 9395 "* wants int"); 9396 goto onError; 9397 } 9398 prec = PyLong_AsLong(v); 9399 if (prec == -1 && PyErr_Occurred()) 9400 goto onError; 9401 if (prec < 0) 9402 prec = 0; 9403 if (--fmtcnt >= 0) 9404 c = *fmt++; 9405 } 9406 else if (c >= '0' && c <= '9') { 9407 prec = c - '0'; 9408 while (--fmtcnt >= 0) { 9409 c = *fmt++; 9410 if (c < '0' || c > '9') 9411 break; 9412 if ((prec*10) / 10 != prec) { 9413 PyErr_SetString(PyExc_ValueError, 9414 "prec too big"); 9415 goto onError; 9416 } 9417 prec = prec*10 + (c - '0'); 9418 } 9419 } 9420 } /* prec */ 9421 if (fmtcnt >= 0) { 9422 if (c == 'h' || c == 'l' || c == 'L') { 9423 if (--fmtcnt >= 0) 9424 c = *fmt++; 9425 } 9426 } 9427 if (fmtcnt < 0) { 9428 PyErr_SetString(PyExc_ValueError, 9429 "incomplete format"); 9430 goto onError; 9431 } 9432 if (c != '%') { 9433 v = getnextarg(args, arglen, &argidx); 9434 if (v == NULL) 9435 goto onError; 9436 } 9437 sign = 0; 9438 fill = ' '; 9439 switch (c) { 9440 9441 case '%': 9442 pbuf = formatbuf; 9443 /* presume that buffer length is at least 1 */ 9444 pbuf[0] = '%'; 9445 len = 1; 9446 break; 9447 9448 case 's': 9449 case 'r': 9450 case 'a': 9451 if (PyUnicode_CheckExact(v) && c == 's') { 9452 temp = v; 9453 Py_INCREF(temp); 9454 } 9455 else { 9456 if (c == 's') 9457 temp = PyObject_Str(v); 9458 else if (c == 'r') 9459 temp = PyObject_Repr(v); 9460 else 9461 temp = PyObject_ASCII(v); 9462 if (temp == NULL) 9463 goto onError; 9464 if (PyUnicode_Check(temp)) 9465 /* nothing to do */; 9466 else { 9467 Py_DECREF(temp); 9468 PyErr_SetString(PyExc_TypeError, 9469 "%s argument has non-string str()"); 9470 goto onError; 9471 } 9472 } 9473 pbuf = PyUnicode_AS_UNICODE(temp); 9474 len 
= PyUnicode_GET_SIZE(temp); 9475 if (prec >= 0 && len > prec) 9476 len = prec; 9477 break; 9478 9479 case 'i': 9480 case 'd': 9481 case 'u': 9482 case 'o': 9483 case 'x': 9484 case 'X': 9485 if (c == 'i') 9486 c = 'd'; 9487 isnumok = 0; 9488 if (PyNumber_Check(v)) { 9489 PyObject *iobj=NULL; 9490 9491 if (PyLong_Check(v)) { 9492 iobj = v; 9493 Py_INCREF(iobj); 9494 } 9495 else { 9496 iobj = PyNumber_Long(v); 9497 } 9498 if (iobj!=NULL) { 9499 if (PyLong_Check(iobj)) { 9500 isnumok = 1; 9501 temp = formatlong(iobj, flags, prec, c); 9502 Py_DECREF(iobj); 9503 if (!temp) 9504 goto onError; 9505 pbuf = PyUnicode_AS_UNICODE(temp); 9506 len = PyUnicode_GET_SIZE(temp); 9507 sign = 1; 9508 } 9509 else { 9510 Py_DECREF(iobj); 9511 } 9512 } 9513 } 9514 if (!isnumok) { 9515 PyErr_Format(PyExc_TypeError, 9516 "%%%c format: a number is required, " 9517 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 9518 goto onError; 9519 } 9520 if (flags & F_ZERO) 9521 fill = '0'; 9522 break; 9523 9524 case 'e': 9525 case 'E': 9526 case 'f': 9527 case 'F': 9528 case 'g': 9529 case 'G': 9530 temp = formatfloat(v, flags, prec, c); 9531 if (!temp) 9532 goto onError; 9533 pbuf = PyUnicode_AS_UNICODE(temp); 9534 len = PyUnicode_GET_SIZE(temp); 9535 sign = 1; 9536 if (flags & F_ZERO) 9537 fill = '0'; 9538 break; 9539 9540 case 'c': 9541 pbuf = formatbuf; 9542 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 9543 if (len < 0) 9544 goto onError; 9545 break; 9546 9547 default: 9548 PyErr_Format(PyExc_ValueError, 9549 "unsupported format character '%c' (0x%x) " 9550 "at index %zd", 9551 (31<=c && c<=126) ? 
(char)c : '?', 9552 (int)c, 9553 (Py_ssize_t)(fmt - 1 - 9554 PyUnicode_AS_UNICODE(uformat))); 9555 goto onError; 9556 } 9557 if (sign) { 9558 if (*pbuf == '-' || *pbuf == '+') { 9559 sign = *pbuf++; 9560 len--; 9561 } 9562 else if (flags & F_SIGN) 9563 sign = '+'; 9564 else if (flags & F_BLANK) 9565 sign = ' '; 9566 else 9567 sign = 0; 9568 } 9569 if (width < len) 9570 width = len; 9571 if (rescnt - (sign != 0) < width) { 9572 reslen -= rescnt; 9573 rescnt = width + fmtcnt + 100; 9574 reslen += rescnt; 9575 if (reslen < 0) { 9576 Py_XDECREF(temp); 9577 PyErr_NoMemory(); 9578 goto onError; 9579 } 9580 if (_PyUnicode_Resize(&result, reslen) < 0) { 9581 Py_XDECREF(temp); 9582 goto onError; 9583 } 9584 res = PyUnicode_AS_UNICODE(result) 9585 + reslen - rescnt; 9586 } 9587 if (sign) { 9588 if (fill != ' ') 9589 *res++ = sign; 9590 rescnt--; 9591 if (width > len) 9592 width--; 9593 } 9594 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9595 assert(pbuf[0] == '0'); 9596 assert(pbuf[1] == c); 9597 if (fill != ' ') { 9598 *res++ = *pbuf++; 9599 *res++ = *pbuf++; 9600 } 9601 rescnt -= 2; 9602 width -= 2; 9603 if (width < 0) 9604 width = 0; 9605 len -= 2; 9606 } 9607 if (width > len && !(flags & F_LJUST)) { 9608 do { 9609 --rescnt; 9610 *res++ = fill; 9611 } while (--width > len); 9612 } 9613 if (fill == ' ') { 9614 if (sign) 9615 *res++ = sign; 9616 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 9617 assert(pbuf[0] == '0'); 9618 assert(pbuf[1] == c); 9619 *res++ = *pbuf++; 9620 *res++ = *pbuf++; 9621 } 9622 } 9623 Py_UNICODE_COPY(res, pbuf, len); 9624 res += len; 9625 rescnt -= len; 9626 while (--width >= len) { 9627 --rescnt; 9628 *res++ = ' '; 9629 } 9630 if (dict && (argidx < arglen) && c != '%') { 9631 PyErr_SetString(PyExc_TypeError, 9632 "not all arguments converted during string formatting"); 9633 Py_XDECREF(temp); 9634 goto onError; 9635 } 9636 Py_XDECREF(temp); 9637 } /* '%' */ 9638 } /* until end */ 9639 if (argidx < arglen && !dict) { 
9640 PyErr_SetString(PyExc_TypeError, 9641 "not all arguments converted during string formatting"); 9642 goto onError; 9643 } 9644 9645 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 9646 goto onError; 9647 if (args_owned) { 9648 Py_DECREF(args); 9649 } 9650 Py_DECREF(uformat); 9651 return (PyObject *)result; 9652 9653 onError: 9654 Py_XDECREF(result); 9655 Py_DECREF(uformat); 9656 if (args_owned) { 9657 Py_DECREF(args); 9658 } 9659 return NULL; 9660} 9661 9662static PyObject * 9663unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 9664 9665static PyObject * 9666unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9667{ 9668 PyObject *x = NULL; 9669 static char *kwlist[] = {"object", "encoding", "errors", 0}; 9670 char *encoding = NULL; 9671 char *errors = NULL; 9672 9673 if (type != &PyUnicode_Type) 9674 return unicode_subtype_new(type, args, kwds); 9675 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 9676 kwlist, &x, &encoding, &errors)) 9677 return NULL; 9678 if (x == NULL) 9679 return (PyObject *)_PyUnicode_New(0); 9680 if (encoding == NULL && errors == NULL) 9681 return PyObject_Str(x); 9682 else 9683 return PyUnicode_FromEncodedObject(x, encoding, errors); 9684} 9685 9686static PyObject * 9687unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 9688{ 9689 PyUnicodeObject *tmp, *pnew; 9690 Py_ssize_t n; 9691 9692 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 9693 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 9694 if (tmp == NULL) 9695 return NULL; 9696 assert(PyUnicode_Check(tmp)); 9697 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 9698 if (pnew == NULL) { 9699 Py_DECREF(tmp); 9700 return NULL; 9701 } 9702 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); 9703 if (pnew->str == NULL) { 9704 _Py_ForgetReference((PyObject *)pnew); 9705 PyObject_Del(pnew); 9706 Py_DECREF(tmp); 9707 return PyErr_NoMemory(); 9708 } 9709 
Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 9710 pnew->length = n; 9711 pnew->hash = tmp->hash; 9712 Py_DECREF(tmp); 9713 return (PyObject *)pnew; 9714} 9715 9716PyDoc_STRVAR(unicode_doc, 9717 "str(string[, encoding[, errors]]) -> str\n\ 9718\n\ 9719Create a new string object from the given encoded string.\n\ 9720encoding defaults to the current default string encoding.\n\ 9721errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 9722 9723static PyObject *unicode_iter(PyObject *seq); 9724 9725PyTypeObject PyUnicode_Type = { 9726 PyVarObject_HEAD_INIT(&PyType_Type, 0) 9727 "str", /* tp_name */ 9728 sizeof(PyUnicodeObject), /* tp_size */ 9729 0, /* tp_itemsize */ 9730 /* Slots */ 9731 (destructor)unicode_dealloc, /* tp_dealloc */ 9732 0, /* tp_print */ 9733 0, /* tp_getattr */ 9734 0, /* tp_setattr */ 9735 0, /* tp_reserved */ 9736 unicode_repr, /* tp_repr */ 9737 &unicode_as_number, /* tp_as_number */ 9738 &unicode_as_sequence, /* tp_as_sequence */ 9739 &unicode_as_mapping, /* tp_as_mapping */ 9740 (hashfunc) unicode_hash, /* tp_hash*/ 9741 0, /* tp_call*/ 9742 (reprfunc) unicode_str, /* tp_str */ 9743 PyObject_GenericGetAttr, /* tp_getattro */ 9744 0, /* tp_setattro */ 9745 0, /* tp_as_buffer */ 9746 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 9747 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 9748 unicode_doc, /* tp_doc */ 9749 0, /* tp_traverse */ 9750 0, /* tp_clear */ 9751 PyUnicode_RichCompare, /* tp_richcompare */ 9752 0, /* tp_weaklistoffset */ 9753 unicode_iter, /* tp_iter */ 9754 0, /* tp_iternext */ 9755 unicode_methods, /* tp_methods */ 9756 0, /* tp_members */ 9757 0, /* tp_getset */ 9758 &PyBaseObject_Type, /* tp_base */ 9759 0, /* tp_dict */ 9760 0, /* tp_descr_get */ 9761 0, /* tp_descr_set */ 9762 0, /* tp_dictoffset */ 9763 0, /* tp_init */ 9764 0, /* tp_alloc */ 9765 unicode_new, /* tp_new */ 9766 PyObject_Del, /* tp_free */ 9767}; 9768 9769/* Initialize the Unicode implementation */ 9770 9771void _PyUnicode_Init(void) 9772{ 9773 
int i; 9774 9775 /* XXX - move this array to unicodectype.c ? */ 9776 Py_UNICODE linebreak[] = { 9777 0x000A, /* LINE FEED */ 9778 0x000D, /* CARRIAGE RETURN */ 9779 0x001C, /* FILE SEPARATOR */ 9780 0x001D, /* GROUP SEPARATOR */ 9781 0x001E, /* RECORD SEPARATOR */ 9782 0x0085, /* NEXT LINE */ 9783 0x2028, /* LINE SEPARATOR */ 9784 0x2029, /* PARAGRAPH SEPARATOR */ 9785 }; 9786 9787 /* Init the implementation */ 9788 free_list = NULL; 9789 numfree = 0; 9790 unicode_empty = _PyUnicode_New(0); 9791 if (!unicode_empty) 9792 return; 9793 9794 for (i = 0; i < 256; i++) 9795 unicode_latin1[i] = NULL; 9796 if (PyType_Ready(&PyUnicode_Type) < 0) 9797 Py_FatalError("Can't initialize 'unicode'"); 9798 9799 /* initialize the linebreak bloom filter */ 9800 bloom_linebreak = make_bloom_mask( 9801 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 9802 ); 9803 9804 PyType_Ready(&EncodingMapType); 9805} 9806 9807/* Finalize the Unicode implementation */ 9808 9809int 9810PyUnicode_ClearFreeList(void) 9811{ 9812 int freelist_size = numfree; 9813 PyUnicodeObject *u; 9814 9815 for (u = free_list; u != NULL;) { 9816 PyUnicodeObject *v = u; 9817 u = *(PyUnicodeObject **)u; 9818 if (v->str) 9819 PyObject_DEL(v->str); 9820 Py_XDECREF(v->defenc); 9821 PyObject_Del(v); 9822 numfree--; 9823 } 9824 free_list = NULL; 9825 assert(numfree == 0); 9826 return freelist_size; 9827} 9828 9829void 9830_PyUnicode_Fini(void) 9831{ 9832 int i; 9833 9834 Py_XDECREF(unicode_empty); 9835 unicode_empty = NULL; 9836 9837 for (i = 0; i < 256; i++) { 9838 if (unicode_latin1[i]) { 9839 Py_DECREF(unicode_latin1[i]); 9840 unicode_latin1[i] = NULL; 9841 } 9842 } 9843 (void)PyUnicode_ClearFreeList(); 9844} 9845 9846void 9847PyUnicode_InternInPlace(PyObject **p) 9848{ 9849 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 9850 PyObject *t; 9851 if (s == NULL || !PyUnicode_Check(s)) 9852 Py_FatalError( 9853 "PyUnicode_InternInPlace: unicode strings only please!"); 9854 /* If it's a subclass, we don't really 
know what putting 9855 it in the interned dict might do. */ 9856 if (!PyUnicode_CheckExact(s)) 9857 return; 9858 if (PyUnicode_CHECK_INTERNED(s)) 9859 return; 9860 if (interned == NULL) { 9861 interned = PyDict_New(); 9862 if (interned == NULL) { 9863 PyErr_Clear(); /* Don't leave an exception */ 9864 return; 9865 } 9866 } 9867 /* It might be that the GetItem call fails even 9868 though the key is present in the dictionary, 9869 namely when this happens during a stack overflow. */ 9870 Py_ALLOW_RECURSION 9871 t = PyDict_GetItem(interned, (PyObject *)s); 9872 Py_END_ALLOW_RECURSION 9873 9874 if (t) { 9875 Py_INCREF(t); 9876 Py_DECREF(*p); 9877 *p = t; 9878 return; 9879 } 9880 9881 PyThreadState_GET()->recursion_critical = 1; 9882 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 9883 PyErr_Clear(); 9884 PyThreadState_GET()->recursion_critical = 0; 9885 return; 9886 } 9887 PyThreadState_GET()->recursion_critical = 0; 9888 /* The two references in interned are not counted by refcnt. 
9889 The deallocator will take care of this */ 9890 Py_REFCNT(s) -= 2; 9891 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 9892} 9893 9894void 9895PyUnicode_InternImmortal(PyObject **p) 9896{ 9897 PyUnicode_InternInPlace(p); 9898 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 9899 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 9900 Py_INCREF(*p); 9901 } 9902} 9903 9904PyObject * 9905PyUnicode_InternFromString(const char *cp) 9906{ 9907 PyObject *s = PyUnicode_FromString(cp); 9908 if (s == NULL) 9909 return NULL; 9910 PyUnicode_InternInPlace(&s); 9911 return s; 9912} 9913 9914void _Py_ReleaseInternedUnicodeStrings(void) 9915{ 9916 PyObject *keys; 9917 PyUnicodeObject *s; 9918 Py_ssize_t i, n; 9919 Py_ssize_t immortal_size = 0, mortal_size = 0; 9920 9921 if (interned == NULL || !PyDict_Check(interned)) 9922 return; 9923 keys = PyDict_Keys(interned); 9924 if (keys == NULL || !PyList_Check(keys)) { 9925 PyErr_Clear(); 9926 return; 9927 } 9928 9929 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 9930 detector, interned unicode strings are not forcibly deallocated; 9931 rather, we give them their stolen references back, and then clear 9932 and DECREF the interned dict. 
*/ 9933 9934 n = PyList_GET_SIZE(keys); 9935 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 9936 n); 9937 for (i = 0; i < n; i++) { 9938 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 9939 switch (s->state) { 9940 case SSTATE_NOT_INTERNED: 9941 /* XXX Shouldn't happen */ 9942 break; 9943 case SSTATE_INTERNED_IMMORTAL: 9944 Py_REFCNT(s) += 1; 9945 immortal_size += s->length; 9946 break; 9947 case SSTATE_INTERNED_MORTAL: 9948 Py_REFCNT(s) += 2; 9949 mortal_size += s->length; 9950 break; 9951 default: 9952 Py_FatalError("Inconsistent interned string state."); 9953 } 9954 s->state = SSTATE_NOT_INTERNED; 9955 } 9956 fprintf(stderr, "total size of all interned strings: " 9957 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 9958 "mortal/immortal\n", mortal_size, immortal_size); 9959 Py_DECREF(keys); 9960 PyDict_Clear(interned); 9961 Py_DECREF(interned); 9962 interned = NULL; 9963} 9964 9965 9966/********************* Unicode Iterator **************************/ 9967 9968typedef struct { 9969 PyObject_HEAD 9970 Py_ssize_t it_index; 9971 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 9972} unicodeiterobject; 9973 9974static void 9975unicodeiter_dealloc(unicodeiterobject *it) 9976{ 9977 _PyObject_GC_UNTRACK(it); 9978 Py_XDECREF(it->it_seq); 9979 PyObject_GC_Del(it); 9980} 9981 9982static int 9983unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 9984{ 9985 Py_VISIT(it->it_seq); 9986 return 0; 9987} 9988 9989static PyObject * 9990unicodeiter_next(unicodeiterobject *it) 9991{ 9992 PyUnicodeObject *seq; 9993 PyObject *item; 9994 9995 assert(it != NULL); 9996 seq = it->it_seq; 9997 if (seq == NULL) 9998 return NULL; 9999 assert(PyUnicode_Check(seq)); 10000 10001 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 10002 item = PyUnicode_FromUnicode( 10003 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 10004 if (item != NULL) 10005 ++it->it_index; 10006 return item; 10007 } 10008 10009 Py_DECREF(seq); 10010 it->it_seq 
= NULL; 10011 return NULL; 10012} 10013 10014static PyObject * 10015unicodeiter_len(unicodeiterobject *it) 10016{ 10017 Py_ssize_t len = 0; 10018 if (it->it_seq) 10019 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 10020 return PyLong_FromSsize_t(len); 10021} 10022 10023PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 10024 10025static PyMethodDef unicodeiter_methods[] = { 10026 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 10027 length_hint_doc}, 10028 {NULL, NULL} /* sentinel */ 10029}; 10030 10031PyTypeObject PyUnicodeIter_Type = { 10032 PyVarObject_HEAD_INIT(&PyType_Type, 0) 10033 "str_iterator", /* tp_name */ 10034 sizeof(unicodeiterobject), /* tp_basicsize */ 10035 0, /* tp_itemsize */ 10036 /* methods */ 10037 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 10038 0, /* tp_print */ 10039 0, /* tp_getattr */ 10040 0, /* tp_setattr */ 10041 0, /* tp_reserved */ 10042 0, /* tp_repr */ 10043 0, /* tp_as_number */ 10044 0, /* tp_as_sequence */ 10045 0, /* tp_as_mapping */ 10046 0, /* tp_hash */ 10047 0, /* tp_call */ 10048 0, /* tp_str */ 10049 PyObject_GenericGetAttr, /* tp_getattro */ 10050 0, /* tp_setattro */ 10051 0, /* tp_as_buffer */ 10052 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 10053 0, /* tp_doc */ 10054 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 10055 0, /* tp_clear */ 10056 0, /* tp_richcompare */ 10057 0, /* tp_weaklistoffset */ 10058 PyObject_SelfIter, /* tp_iter */ 10059 (iternextfunc)unicodeiter_next, /* tp_iternext */ 10060 unicodeiter_methods, /* tp_methods */ 10061 0, 10062}; 10063 10064static PyObject * 10065unicode_iter(PyObject *seq) 10066{ 10067 unicodeiterobject *it; 10068 10069 if (!PyUnicode_Check(seq)) { 10070 PyErr_BadInternalCall(); 10071 return NULL; 10072 } 10073 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 10074 if (it == NULL) 10075 return NULL; 10076 it->it_index = 0; 10077 Py_INCREF(seq); 10078 it->it_seq = (PyUnicodeObject 
*)seq; 10079 _PyObject_GC_TRACK(it); 10080 return (PyObject *)it; 10081} 10082 10083size_t 10084Py_UNICODE_strlen(const Py_UNICODE *u) 10085{ 10086 int res = 0; 10087 while(*u++) 10088 res++; 10089 return res; 10090} 10091 10092Py_UNICODE* 10093Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 10094{ 10095 Py_UNICODE *u = s1; 10096 while ((*u++ = *s2++)); 10097 return s1; 10098} 10099 10100Py_UNICODE* 10101Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10102{ 10103 Py_UNICODE *u = s1; 10104 while ((*u++ = *s2++)) 10105 if (n-- == 0) 10106 break; 10107 return s1; 10108} 10109 10110Py_UNICODE* 10111Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 10112{ 10113 Py_UNICODE *u1 = s1; 10114 u1 += Py_UNICODE_strlen(u1); 10115 Py_UNICODE_strcpy(u1, s2); 10116 return s1; 10117} 10118 10119int 10120Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 10121{ 10122 while (*s1 && *s2 && *s1 == *s2) 10123 s1++, s2++; 10124 if (*s1 && *s2) 10125 return (*s1 < *s2) ? -1 : +1; 10126 if (*s1) 10127 return 1; 10128 if (*s2) 10129 return -1; 10130 return 0; 10131} 10132 10133int 10134Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 10135{ 10136 register Py_UNICODE u1, u2; 10137 for (; n != 0; n--) { 10138 u1 = *s1; 10139 u2 = *s2; 10140 if (u1 != u2) 10141 return (u1 < u2) ? 
-1 : +1; 10142 if (u1 == '\0') 10143 return 0; 10144 s1++; 10145 s2++; 10146 } 10147 return 0; 10148} 10149 10150Py_UNICODE* 10151Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 10152{ 10153 const Py_UNICODE *p; 10154 for (p = s; *p; p++) 10155 if (*p == c) 10156 return (Py_UNICODE*)p; 10157 return NULL; 10158} 10159 10160Py_UNICODE* 10161Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 10162{ 10163 const Py_UNICODE *p; 10164 p = s + Py_UNICODE_strlen(s); 10165 while (p != s) { 10166 p--; 10167 if (*p == c) 10168 return (Py_UNICODE*)p; 10169 } 10170 return NULL; 10171} 10172 10173Py_UNICODE* 10174PyUnicode_AsUnicodeCopy(PyObject *object) 10175{ 10176 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 10177 Py_UNICODE *copy; 10178 Py_ssize_t size; 10179 10180 /* Ensure we won't overflow the size. */ 10181 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 10182 PyErr_NoMemory(); 10183 return NULL; 10184 } 10185 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 10186 size *= sizeof(Py_UNICODE); 10187 copy = PyMem_Malloc(size); 10188 if (copy == NULL) { 10189 PyErr_NoMemory(); 10190 return NULL; 10191 } 10192 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 10193 return copy; 10194} 10195 10196/* A _string module, to export formatter_parser and formatter_field_name_split 10197 to the string.Formatter class implemented in Python. 
*/ 10198 10199static PyMethodDef _string_methods[] = { 10200 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 10201 METH_O, PyDoc_STR("split the argument as a field name")}, 10202 {"formatter_parser", (PyCFunction) formatter_parser, 10203 METH_O, PyDoc_STR("parse the argument as a format string")}, 10204 {NULL, NULL} 10205}; 10206 10207static struct PyModuleDef _string_module = { 10208 PyModuleDef_HEAD_INIT, 10209 "_string", 10210 PyDoc_STR("string helper module"), 10211 0, 10212 _string_methods, 10213 NULL, 10214 NULL, 10215 NULL, 10216 NULL 10217}; 10218 10219PyMODINIT_FUNC 10220PyInit__string(void) 10221{ 10222 return PyModule_Create(&_string_module); 10223} 10224 10225 10226#ifdef __cplusplus 10227} 10228#endif 10229