unicodeobject.c revision 84d79ddce2176ae54825da32e096d6332a8d5138
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44 45#include "unicodeobject.h" 46#include "ucnhash.h" 47 48#ifdef MS_WINDOWS 49#include <windows.h> 50#endif 51 52/* Limit for the Unicode object free list */ 53 54#define MAX_UNICODE_FREELIST_SIZE 1024 55 56/* Limit for the Unicode object free list stay alive optimization. 57 58 The implementation will keep allocated Unicode memory intact for 59 all objects on the free list having a size less than this 60 limit. This reduces malloc() overhead for small Unicode objects. 61 62 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 64 malloc()-overhead) bytes of unused garbage. 65 66 Setting the limit to 0 effectively turns the feature off. 67 68 Note: This is an experimental feature ! If you get core dumps when 69 using Unicode objects, turn this feature off. 70 71*/ 72 73#define KEEPALIVE_SIZE_LIMIT 9 74 75/* Endianness switches; defaults to little endian */ 76 77#ifdef WORDS_BIGENDIAN 78# define BYTEORDER_IS_BIG_ENDIAN 79#else 80# define BYTEORDER_IS_LITTLE_ENDIAN 81#endif 82 83/* --- Globals ------------------------------------------------------------ 84 85 The globals are initialized by the _PyUnicode_Init() API and should 86 not be used before calling that API. 87 88*/ 89 90 91#ifdef __cplusplus 92extern "C" { 93#endif 94 95/* Free list for Unicode objects */ 96static PyUnicodeObject *unicode_freelist; 97static int unicode_freelist_size; 98 99/* The empty Unicode object is shared to improve performance. */ 100static PyUnicodeObject *unicode_empty; 101 102/* Single character Unicode strings in the Latin-1 range are being 103 shared as well. */ 104static PyUnicodeObject *unicode_latin1[256]; 105 106/* Default encoding to use and assume when NULL is passed as encoding 107 parameter; it is initialized by _PyUnicode_Init(). 108 109 Always use the PyUnicode_SetDefaultEncoding() and 110 PyUnicode_GetDefaultEncoding() APIs to access this global. 111 112*/ 113static char unicode_default_encoding[100]; 114 115Py_UNICODE 116PyUnicode_GetMax(void) 117{ 118#ifdef Py_UNICODE_WIDE 119 return 0x10FFFF; 120#else 121 /* This is actually an illegal character, so it should 122 not be passed to unichr. */ 123 return 0xFFFF; 124#endif 125} 126 127/* --- Bloom Filters ----------------------------------------------------- */ 128 129/* stuff to implement simple "bloom filters" for Unicode characters. 130 to keep things simple, we use a single bitmask, using the least 5 131 bits from each unicode characters as the bit index. */ 132 133/* the linebreak mask is set up by Unicode_Init below */ 134 135#define BLOOM_MASK unsigned long 136 137static BLOOM_MASK bloom_linebreak; 138 139#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 140 141#define BLOOM_LINEBREAK(ch)\ 142 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) 143 144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 145{ 146 /* calculate simple bloom-style bitmask for a given unicode string */ 147 148 long mask; 149 Py_ssize_t i; 150 151 mask = 0; 152 for (i = 0; i < len; i++) 153 mask |= (1 << (ptr[i] & 0x1F)); 154 155 return mask; 156} 157 158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 159{ 160 Py_ssize_t i; 161 162 for (i = 0; i < setlen; i++) 163 if (set[i] == chr) 164 return 1; 165 166 return 0; 167} 168 169#define BLOOM_MEMBER(mask, chr, set, setlen)\ 170 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 171 172/* --- Unicode Object ----------------------------------------------------- */ 173 174static 175int unicode_resize(register PyUnicodeObject *unicode, 176 Py_ssize_t length) 177{ 178 void *oldstr; 179 180 /* Shortcut if there's nothing much to do. */ 181 if (unicode->length == length) 182 goto reset; 183 184 /* Resizing shared object (unicode_empty or single character 185 objects) in-place is not allowed. Use PyUnicode_Resize() 186 instead ! */ 187 188 if (unicode == unicode_empty || 189 (unicode->length == 1 && 190 unicode->str[0] < 256U && 191 unicode_latin1[unicode->str[0]] == unicode)) { 192 PyErr_SetString(PyExc_SystemError, 193 "can't resize shared unicode objects"); 194 return -1; 195 } 196 197 /* We allocate one more byte to make sure the string is Ux0000 terminated. 198 The overallocation is also used by fastsearch, which assumes that it's 199 safe to look at str[length] (without making any assumptions about what 200 it contains). */ 201 202 oldstr = unicode->str; 203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 204 if (!unicode->str) { 205 unicode->str = (Py_UNICODE *)oldstr; 206 PyErr_NoMemory(); 207 return -1; 208 } 209 unicode->str[length] = 0; 210 unicode->length = length; 211 212 reset: 213 /* Reset the object caches */ 214 if (unicode->defenc) { 215 Py_DECREF(unicode->defenc); 216 unicode->defenc = NULL; 217 } 218 unicode->hash = -1; 219 220 return 0; 221} 222 223/* We allocate one more byte to make sure the string is 224 Ux0000 terminated -- XXX is this needed ? 225 226 XXX This allocator could further be enhanced by assuring that the 227 free list never reduces its size below 1. 228 229*/ 230 231static 232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 233{ 234 register PyUnicodeObject *unicode; 235 236 /* Optimization for empty strings */ 237 if (length == 0 && unicode_empty != NULL) { 238 Py_INCREF(unicode_empty); 239 return unicode_empty; 240 } 241 242 /* Unicode freelist & memory allocation */ 243 if (unicode_freelist) { 244 unicode = unicode_freelist; 245 unicode_freelist = *(PyUnicodeObject **)unicode; 246 unicode_freelist_size--; 247 if (unicode->str) { 248 /* Keep-Alive optimization: we only upsize the buffer, 249 never downsize it. */ 250 if ((unicode->length < length) && 251 unicode_resize(unicode, length) < 0) { 252 PyMem_DEL(unicode->str); 253 goto onError; 254 } 255 } 256 else { 257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 258 } 259 PyObject_INIT(unicode, &PyUnicode_Type); 260 } 261 else { 262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 263 if (unicode == NULL) 264 return NULL; 265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 266 } 267 268 if (!unicode->str) { 269 PyErr_NoMemory(); 270 goto onError; 271 } 272 /* Initialize the first element to guard against cases where 273 * the caller fails before initializing str -- unicode_resize() 274 * reads str[0], and the Keep-Alive optimization can keep memory 275 * allocated for str alive across a call to unicode_dealloc(unicode). 276 * We don't want unicode_resize to read uninitialized memory in 277 * that case. 278 */ 279 unicode->str[0] = 0; 280 unicode->str[length] = 0; 281 unicode->length = length; 282 unicode->hash = -1; 283 unicode->defenc = NULL; 284 return unicode; 285 286 onError: 287 _Py_ForgetReference((PyObject *)unicode); 288 PyObject_Del(unicode); 289 return NULL; 290} 291 292static 293void unicode_dealloc(register PyUnicodeObject *unicode) 294{ 295 if (PyUnicode_CheckExact(unicode) && 296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 297 /* Keep-Alive optimization */ 298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 299 PyMem_DEL(unicode->str); 300 unicode->str = NULL; 301 unicode->length = 0; 302 } 303 if (unicode->defenc) { 304 Py_DECREF(unicode->defenc); 305 unicode->defenc = NULL; 306 } 307 /* Add to free list */ 308 *(PyUnicodeObject **)unicode = unicode_freelist; 309 unicode_freelist = unicode; 310 unicode_freelist_size++; 311 } 312 else { 313 PyMem_DEL(unicode->str); 314 Py_XDECREF(unicode->defenc); 315 unicode->ob_type->tp_free((PyObject *)unicode); 316 } 317} 318 319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 320{ 321 register PyUnicodeObject *v; 322 323 /* Argument checks */ 324 if (unicode == NULL) { 325 PyErr_BadInternalCall(); 326 return -1; 327 } 328 v = (PyUnicodeObject *)*unicode; 329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 330 PyErr_BadInternalCall(); 331 return -1; 332 } 333 334 /* Resizing unicode_empty and single character objects is not 335 possible since these are being shared. We simply return a fresh 336 copy with the same Unicode content. */ 337 if (v->length != length && 338 (v == unicode_empty || v->length == 1)) { 339 PyUnicodeObject *w = _PyUnicode_New(length); 340 if (w == NULL) 341 return -1; 342 Py_UNICODE_COPY(w->str, v->str, 343 length < v->length ? length : v->length); 344 Py_DECREF(*unicode); 345 *unicode = (PyObject *)w; 346 return 0; 347 } 348 349 /* Note that we don't have to modify *unicode for unshared Unicode 350 objects, since we can modify them in-place. */ 351 return unicode_resize(v, length); 352} 353 354/* Internal API for use in unicodeobject.c only ! */ 355#define _PyUnicode_Resize(unicodevar, length) \ 356 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 357 358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 359 Py_ssize_t size) 360{ 361 PyUnicodeObject *unicode; 362 363 /* If the Unicode data is known at construction time, we can apply 364 some optimizations which share commonly used objects. */ 365 if (u != NULL) { 366 367 /* Optimization for empty strings */ 368 if (size == 0 && unicode_empty != NULL) { 369 Py_INCREF(unicode_empty); 370 return (PyObject *)unicode_empty; 371 } 372 373 /* Single character Unicode objects in the Latin-1 range are 374 shared when using this constructor */ 375 if (size == 1 && *u < 256) { 376 unicode = unicode_latin1[*u]; 377 if (!unicode) { 378 unicode = _PyUnicode_New(1); 379 if (!unicode) 380 return NULL; 381 unicode->str[0] = *u; 382 unicode_latin1[*u] = unicode; 383 } 384 Py_INCREF(unicode); 385 return (PyObject *)unicode; 386 } 387 } 388 389 unicode = _PyUnicode_New(size); 390 if (!unicode) 391 return NULL; 392 393 /* Copy the Unicode data into the new object */ 394 if (u != NULL) 395 Py_UNICODE_COPY(unicode->str, u, size); 396 397 return (PyObject *)unicode; 398} 399 400#ifdef HAVE_WCHAR_H 401 402PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 403 Py_ssize_t size) 404{ 405 PyUnicodeObject *unicode; 406 407 if (w == NULL) { 408 PyErr_BadInternalCall(); 409 return NULL; 410 } 411 412 unicode = _PyUnicode_New(size); 413 if (!unicode) 414 return NULL; 415 416 /* Copy the wchar_t data into the new object */ 417#ifdef HAVE_USABLE_WCHAR_T 418 memcpy(unicode->str, w, size * sizeof(wchar_t)); 419#else 420 { 421 register Py_UNICODE *u; 422 register Py_ssize_t i; 423 u = PyUnicode_AS_UNICODE(unicode); 424 for (i = size; i > 0; i--) 425 *u++ = *w++; 426 } 427#endif 428 429 return (PyObject *)unicode; 430} 431 432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 433 wchar_t *w, 434 Py_ssize_t size) 435{ 436 if (unicode == NULL) { 437 PyErr_BadInternalCall(); 438 return -1; 439 } 440 441 /* If possible, try to copy the 0-termination as well */ 442 if (size > PyUnicode_GET_SIZE(unicode)) 443 size = PyUnicode_GET_SIZE(unicode) + 1; 444 445#ifdef HAVE_USABLE_WCHAR_T 446 memcpy(w, unicode->str, size * sizeof(wchar_t)); 447#else 448 { 449 register Py_UNICODE *u; 450 register Py_ssize_t i; 451 u = PyUnicode_AS_UNICODE(unicode); 452 for (i = size; i > 0; i--) 453 *w++ = *u++; 454 } 455#endif 456 457 if (size > PyUnicode_GET_SIZE(unicode)) 458 return PyUnicode_GET_SIZE(unicode); 459 else 460 return size; 461} 462 463#endif 464 465PyObject *PyUnicode_FromOrdinal(int ordinal) 466{ 467 Py_UNICODE s[1]; 468 469#ifdef Py_UNICODE_WIDE 470 if (ordinal < 0 || ordinal > 0x10ffff) { 471 PyErr_SetString(PyExc_ValueError, 472 "unichr() arg not in range(0x110000) " 473 "(wide Python build)"); 474 return NULL; 475 } 476#else 477 if (ordinal < 0 || ordinal > 0xffff) { 478 PyErr_SetString(PyExc_ValueError, 479 "unichr() arg not in range(0x10000) " 480 "(narrow Python build)"); 481 return NULL; 482 } 483#endif 484 485 s[0] = (Py_UNICODE)ordinal; 486 return PyUnicode_FromUnicode(s, 1); 487} 488 489PyObject *PyUnicode_FromObject(register PyObject *obj) 490{ 491 /* XXX Perhaps we should make this API an alias of 492 PyObject_Unicode() instead ?! */ 493 if (PyUnicode_CheckExact(obj)) { 494 Py_INCREF(obj); 495 return obj; 496 } 497 if (PyUnicode_Check(obj)) { 498 /* For a Unicode subtype that's not a Unicode object, 499 return a true Unicode object with the same data. */ 500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 501 PyUnicode_GET_SIZE(obj)); 502 } 503 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 504} 505 506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 507 const char *encoding, 508 const char *errors) 509{ 510 const char *s = NULL; 511 Py_ssize_t len; 512 PyObject *v; 513 514 if (obj == NULL) { 515 PyErr_BadInternalCall(); 516 return NULL; 517 } 518 519#if 0 520 /* For b/w compatibility we also accept Unicode objects provided 521 that no encodings is given and then redirect to 522 PyObject_Unicode() which then applies the additional logic for 523 Unicode subclasses. 524 525 NOTE: This API should really only be used for object which 526 represent *encoded* Unicode ! 527 528 */ 529 if (PyUnicode_Check(obj)) { 530 if (encoding) { 531 PyErr_SetString(PyExc_TypeError, 532 "decoding Unicode is not supported"); 533 return NULL; 534 } 535 return PyObject_Unicode(obj); 536 } 537#else 538 if (PyUnicode_Check(obj)) { 539 PyErr_SetString(PyExc_TypeError, 540 "decoding Unicode is not supported"); 541 return NULL; 542 } 543#endif 544 545 /* Coerce object */ 546 if (PyString_Check(obj)) { 547 s = PyString_AS_STRING(obj); 548 len = PyString_GET_SIZE(obj); 549 } 550 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 551 /* Overwrite the error message with something more useful in 552 case of a TypeError. */ 553 if (PyErr_ExceptionMatches(PyExc_TypeError)) 554 PyErr_Format(PyExc_TypeError, 555 "coercing to Unicode: need string or buffer, " 556 "%.80s found", 557 obj->ob_type->tp_name); 558 goto onError; 559 } 560 561 /* Convert to Unicode */ 562 if (len == 0) { 563 Py_INCREF(unicode_empty); 564 v = (PyObject *)unicode_empty; 565 } 566 else 567 v = PyUnicode_Decode(s, len, encoding, errors); 568 569 return v; 570 571 onError: 572 return NULL; 573} 574 575PyObject *PyUnicode_Decode(const char *s, 576 Py_ssize_t size, 577 const char *encoding, 578 const char *errors) 579{ 580 PyObject *buffer = NULL, *unicode; 581 582 if (encoding == NULL) 583 encoding = PyUnicode_GetDefaultEncoding(); 584 585 /* Shortcuts for common default encodings */ 586 if (strcmp(encoding, "utf-8") == 0) 587 return PyUnicode_DecodeUTF8(s, size, errors); 588 else if (strcmp(encoding, "latin-1") == 0) 589 return PyUnicode_DecodeLatin1(s, size, errors); 590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 591 else if (strcmp(encoding, "mbcs") == 0) 592 return PyUnicode_DecodeMBCS(s, size, errors); 593#endif 594 else if (strcmp(encoding, "ascii") == 0) 595 return PyUnicode_DecodeASCII(s, size, errors); 596 597 /* Decode via the codec registry */ 598 buffer = PyBuffer_FromMemory((void *)s, size); 599 if (buffer == NULL) 600 goto onError; 601 unicode = PyCodec_Decode(buffer, encoding, errors); 602 if (unicode == NULL) 603 goto onError; 604 if (!PyUnicode_Check(unicode)) { 605 PyErr_Format(PyExc_TypeError, 606 "decoder did not return an unicode object (type=%.400s)", 607 unicode->ob_type->tp_name); 608 Py_DECREF(unicode); 609 goto onError; 610 } 611 Py_DECREF(buffer); 612 return unicode; 613 614 onError: 615 Py_XDECREF(buffer); 616 return NULL; 617} 618 619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 620 const char *encoding, 621 const char *errors) 622{ 623 PyObject *v; 624 625 if (!PyUnicode_Check(unicode)) { 626 PyErr_BadArgument(); 627 goto onError; 628 } 629 630 if (encoding == NULL) 631 encoding = PyUnicode_GetDefaultEncoding(); 632 633 /* Decode via the codec registry */ 634 v = PyCodec_Decode(unicode, encoding, errors); 635 if (v == NULL) 636 goto onError; 637 return v; 638 639 onError: 640 return NULL; 641} 642 643PyObject *PyUnicode_Encode(const Py_UNICODE *s, 644 Py_ssize_t size, 645 const char *encoding, 646 const char *errors) 647{ 648 PyObject *v, *unicode; 649 650 unicode = PyUnicode_FromUnicode(s, size); 651 if (unicode == NULL) 652 return NULL; 653 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 654 Py_DECREF(unicode); 655 return v; 656} 657 658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 659 const char *encoding, 660 const char *errors) 661{ 662 PyObject *v; 663 664 if (!PyUnicode_Check(unicode)) { 665 PyErr_BadArgument(); 666 goto onError; 667 } 668 669 if (encoding == NULL) 670 encoding = PyUnicode_GetDefaultEncoding(); 671 672 /* Encode via the codec registry */ 673 v = PyCodec_Encode(unicode, encoding, errors); 674 if (v == NULL) 675 goto onError; 676 return v; 677 678 onError: 679 return NULL; 680} 681 682PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 683 const char *encoding, 684 const char *errors) 685{ 686 PyObject *v; 687 688 if (!PyUnicode_Check(unicode)) { 689 PyErr_BadArgument(); 690 goto onError; 691 } 692 693 if (encoding == NULL) 694 encoding = PyUnicode_GetDefaultEncoding(); 695 696 /* Shortcuts for common default encodings */ 697 if (errors == NULL) { 698 if (strcmp(encoding, "utf-8") == 0) 699 return PyUnicode_AsUTF8String(unicode); 700 else if (strcmp(encoding, "latin-1") == 0) 701 return PyUnicode_AsLatin1String(unicode); 702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 703 else if (strcmp(encoding, "mbcs") == 0) 704 return PyUnicode_AsMBCSString(unicode); 705#endif 706 else if (strcmp(encoding, "ascii") == 0) 707 return PyUnicode_AsASCIIString(unicode); 708 } 709 710 /* Encode via the codec registry */ 711 v = PyCodec_Encode(unicode, encoding, errors); 712 if (v == NULL) 713 goto onError; 714 if (!PyString_Check(v)) { 715 PyErr_Format(PyExc_TypeError, 716 "encoder did not return a string object (type=%.400s)", 717 v->ob_type->tp_name); 718 Py_DECREF(v); 719 goto onError; 720 } 721 return v; 722 723 onError: 724 return NULL; 725} 726 727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 728 const char *errors) 729{ 730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 731 732 if (v) 733 return v; 734 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 735 if (v && errors == NULL) 736 ((PyUnicodeObject *)unicode)->defenc = v; 737 return v; 738} 739 740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 741{ 742 if (!PyUnicode_Check(unicode)) { 743 PyErr_BadArgument(); 744 goto onError; 745 } 746 return PyUnicode_AS_UNICODE(unicode); 747 748 onError: 749 return NULL; 750} 751 752Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 753{ 754 if (!PyUnicode_Check(unicode)) { 755 PyErr_BadArgument(); 756 goto onError; 757 } 758 return PyUnicode_GET_SIZE(unicode); 759 760 onError: 761 return -1; 762} 763 764const char *PyUnicode_GetDefaultEncoding(void) 765{ 766 return unicode_default_encoding; 767} 768 769int PyUnicode_SetDefaultEncoding(const char *encoding) 770{ 771 PyObject *v; 772 773 /* Make sure the encoding is valid. As side effect, this also 774 loads the encoding into the codec registry cache. */ 775 v = _PyCodec_Lookup(encoding); 776 if (v == NULL) 777 goto onError; 778 Py_DECREF(v); 779 strncpy(unicode_default_encoding, 780 encoding, 781 sizeof(unicode_default_encoding)); 782 return 0; 783 784 onError: 785 return -1; 786} 787 788/* error handling callback helper: 789 build arguments, call the callback and check the arguments, 790 if no exception occurred, copy the replacement to the output 791 and adjust various state variables. 792 return 0 on success, -1 on error 793*/ 794 795static 796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 797 const char *encoding, const char *reason, 798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 800{ 801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 802 803 PyObject *restuple = NULL; 804 PyObject *repunicode = NULL; 805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 806 Py_ssize_t requiredsize; 807 Py_ssize_t newpos; 808 Py_UNICODE *repptr; 809 Py_ssize_t repsize; 810 int res = -1; 811 812 if (*errorHandler == NULL) { 813 *errorHandler = PyCodec_LookupError(errors); 814 if (*errorHandler == NULL) 815 goto onError; 816 } 817 818 if (*exceptionObject == NULL) { 819 *exceptionObject = PyUnicodeDecodeError_Create( 820 encoding, input, insize, *startinpos, *endinpos, reason); 821 if (*exceptionObject == NULL) 822 goto onError; 823 } 824 else { 825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 826 goto onError; 827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 828 goto onError; 829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 830 goto onError; 831 } 832 833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 834 if (restuple == NULL) 835 goto onError; 836 if (!PyTuple_Check(restuple)) { 837 PyErr_Format(PyExc_TypeError, &argparse[4]); 838 goto onError; 839 } 840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 841 goto onError; 842 if (newpos<0) 843 newpos = insize+newpos; 844 if (newpos<0 || newpos>insize) { 845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 846 goto onError; 847 } 848 849 /* need more space? (at least enough for what we 850 have+the replacement+the rest of the string (starting 851 at the new input position), so we won't have to check space 852 when there are no errors in the rest of the string) */ 853 repptr = PyUnicode_AS_UNICODE(repunicode); 854 repsize = PyUnicode_GET_SIZE(repunicode); 855 requiredsize = *outpos + repsize + insize-newpos; 856 if (requiredsize > outsize) { 857 if (requiredsize<2*outsize) 858 requiredsize = 2*outsize; 859 if (PyUnicode_Resize(output, requiredsize) < 0) 860 goto onError; 861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 862 } 863 *endinpos = newpos; 864 *inptr = input + newpos; 865 Py_UNICODE_COPY(*outptr, repptr, repsize); 866 *outptr += repsize; 867 *outpos += repsize; 868 /* we made it! */ 869 res = 0; 870 871 onError: 872 Py_XDECREF(restuple); 873 return res; 874} 875 876/* --- UTF-7 Codec -------------------------------------------------------- */ 877 878/* see RFC2152 for details */ 879 880static 881char utf7_special[128] = { 882 /* indicate whether a UTF-7 character is special i.e. cannot be directly 883 encoded: 884 0 - not special 885 1 - special 886 2 - whitespace (optional) 887 3 - RFC2152 Set O (optional) */ 888 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 890 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 892 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 894 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 895 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 896 897}; 898 899/* Note: The comparison (c) <= 0 is a trick to work-around gcc 900 warnings about the comparison always being false; since 901 utf7_special[0] is 1, we can safely make that one comparison 902 true */ 903 904#define SPECIAL(c, encodeO, encodeWS) \ 905 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 906 (encodeWS && (utf7_special[(c)] == 2)) || \ 907 (encodeO && (utf7_special[(c)] == 3))) 908 909#define B64(n) \ 910 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 911#define B64CHAR(c) \ 912 (isalnum(c) || (c) == '+' || (c) == '/') 913#define UB64(c) \ 914 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 915 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 916 917#define ENCODE(out, ch, bits) \ 918 while (bits >= 6) { \ 919 *out++ = B64(ch >> (bits-6)); \ 920 bits -= 6; \ 921 } 922 923#define DECODE(out, ch, bits, surrogate) \ 924 while (bits >= 16) { \ 925 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 926 bits -= 16; \ 927 if (surrogate) { \ 928 /* We have already generated an error for the high surrogate \ 929 so let's not bother seeing if the low surrogate is correct or not */ \ 930 surrogate = 0; \ 931 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 932 /* This is a surrogate pair. Unfortunately we can't represent \ 933 it in a 16-bit character */ \ 934 surrogate = 1; \ 935 errmsg = "code pairs are not supported"; \ 936 goto utf7Error; \ 937 } else { \ 938 *out++ = outCh; \ 939 } \ 940 } 941 942PyObject *PyUnicode_DecodeUTF7(const char *s, 943 Py_ssize_t size, 944 const char *errors) 945{ 946 const char *starts = s; 947 Py_ssize_t startinpos; 948 Py_ssize_t endinpos; 949 Py_ssize_t outpos; 950 const char *e; 951 PyUnicodeObject *unicode; 952 Py_UNICODE *p; 953 const char *errmsg = ""; 954 int inShift = 0; 955 unsigned int bitsleft = 0; 956 unsigned long charsleft = 0; 957 int surrogate = 0; 958 PyObject *errorHandler = NULL; 959 PyObject *exc = NULL; 960 961 unicode = _PyUnicode_New(size); 962 if (!unicode) 963 return NULL; 964 if (size == 0) 965 return (PyObject *)unicode; 966 967 p = unicode->str; 968 e = s + size; 969 970 while (s < e) { 971 Py_UNICODE ch; 972 restart: 973 ch = *s; 974 975 if (inShift) { 976 if ((ch == '-') || !B64CHAR(ch)) { 977 inShift = 0; 978 s++; 979 980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 981 if (bitsleft >= 6) { 982 /* The shift sequence has a partial character in it. If 983 bitsleft < 6 then we could just classify it as padding 984 but that is not the case here */ 985 986 errmsg = "partial character in shift sequence"; 987 goto utf7Error; 988 } 989 /* According to RFC2152 the remaining bits should be zero. We 990 choose to signal an error/insert a replacement character 991 here so indicate the potential of a misencoded character. */ 992 993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 995 errmsg = "non-zero padding bits in shift sequence"; 996 goto utf7Error; 997 } 998 999 if (ch == '-') { 1000 if ((s < e) && (*(s) == '-')) { 1001 *p++ = '-'; 1002 inShift = 1; 1003 } 1004 } else if (SPECIAL(ch,0,0)) { 1005 errmsg = "unexpected special character"; 1006 goto utf7Error; 1007 } else { 1008 *p++ = ch; 1009 } 1010 } else { 1011 charsleft = (charsleft << 6) | UB64(ch); 1012 bitsleft += 6; 1013 s++; 1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1015 } 1016 } 1017 else if ( ch == '+' ) { 1018 startinpos = s-starts; 1019 s++; 1020 if (s < e && *s == '-') { 1021 s++; 1022 *p++ = '+'; 1023 } else 1024 { 1025 inShift = 1; 1026 bitsleft = 0; 1027 } 1028 } 1029 else if (SPECIAL(ch,0,0)) { 1030 errmsg = "unexpected special character"; 1031 s++; 1032 goto utf7Error; 1033 } 1034 else { 1035 *p++ = ch; 1036 s++; 1037 } 1038 continue; 1039 utf7Error: 1040 outpos = p-PyUnicode_AS_UNICODE(unicode); 1041 endinpos = s-starts; 1042 if (unicode_decode_call_errorhandler( 1043 errors, &errorHandler, 1044 "utf7", errmsg, 1045 starts, size, &startinpos, &endinpos, &exc, &s, 1046 (PyObject **)&unicode, &outpos, &p)) 1047 goto onError; 1048 } 1049 1050 if (inShift) { 1051 outpos = p-PyUnicode_AS_UNICODE(unicode); 1052 endinpos = size; 1053 if (unicode_decode_call_errorhandler( 1054 errors, &errorHandler, 1055 "utf7", "unterminated shift sequence", 1056 starts, size, &startinpos, &endinpos, &exc, &s, 1057 (PyObject **)&unicode, &outpos, &p)) 1058 goto onError; 1059 if (s < e) 1060 goto restart; 1061 } 1062 1063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1064 goto onError; 1065 1066 Py_XDECREF(errorHandler); 1067 Py_XDECREF(exc); 1068 return (PyObject *)unicode; 1069 1070onError: 1071 Py_XDECREF(errorHandler); 1072 Py_XDECREF(exc); 1073 Py_DECREF(unicode); 1074 return NULL; 1075} 1076 1077 1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1079 Py_ssize_t size, 1080 int encodeSetO, 1081 int encodeWhiteSpace, 1082 const char *errors) 1083{ 1084 PyObject *v; 1085 /* It might be possible to tighten this worst case */ 1086 Py_ssize_t cbAllocated = 5 * size; 1087 int inShift = 0; 1088 Py_ssize_t i = 0; 1089 unsigned int bitsleft = 0; 1090 unsigned long charsleft = 0; 1091 char * out; 1092 char * start; 1093 1094 if (size == 0) 1095 return PyString_FromStringAndSize(NULL, 0); 1096 1097 v = PyString_FromStringAndSize(NULL, cbAllocated); 1098 if (v == NULL) 1099 return NULL; 1100 1101 start = out = PyString_AS_STRING(v); 1102 for (;i < size; ++i) { 1103 Py_UNICODE ch = s[i]; 1104 1105 if (!inShift) { 1106 if (ch == '+') { 1107 *out++ = '+'; 1108 *out++ = '-'; 1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1110 charsleft = ch; 1111 bitsleft = 16; 1112 *out++ = '+'; 1113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1114 inShift = bitsleft > 0; 1115 } else { 1116 *out++ = (char) ch; 1117 } 1118 } else { 1119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1120 *out++ = B64(charsleft << (6-bitsleft)); 1121 charsleft = 0; 1122 bitsleft = 0; 1123 /* Characters not in the BASE64 set implicitly unshift the sequence 1124 so no '-' is required, except if the character is itself a '-' */ 1125 if (B64CHAR(ch) || ch == '-') { 1126 *out++ = '-'; 1127 } 1128 inShift = 0; 1129 *out++ = (char) ch; 1130 } else { 1131 bitsleft += 16; 1132 charsleft = (charsleft << 16) | ch; 1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1134 1135 /* If the next character is special then we dont' need to terminate 1136 the shift sequence. If the next character is not a BASE64 character 1137 or '-' then the shift sequence will be terminated implicitly and we 1138 don't have to insert a '-'. */ 1139 1140 if (bitsleft == 0) { 1141 if (i + 1 < size) { 1142 Py_UNICODE ch2 = s[i+1]; 1143 1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1145 1146 } else if (B64CHAR(ch2) || ch2 == '-') { 1147 *out++ = '-'; 1148 inShift = 0; 1149 } else { 1150 inShift = 0; 1151 } 1152 1153 } 1154 else { 1155 *out++ = '-'; 1156 inShift = 0; 1157 } 1158 } 1159 } 1160 } 1161 } 1162 if (bitsleft) { 1163 *out++= B64(charsleft << (6-bitsleft) ); 1164 *out++ = '-'; 1165 } 1166 1167 _PyString_Resize(&v, out - start); 1168 return v; 1169} 1170 1171#undef SPECIAL 1172#undef B64 1173#undef B64CHAR 1174#undef UB64 1175#undef ENCODE 1176#undef DECODE 1177 1178/* --- UTF-8 Codec -------------------------------------------------------- */ 1179 1180static 1181char utf8_code_length[256] = { 1182 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1183 illegal prefix. see RFC 2279 for details */ 1184 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1188 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1189 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1190 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1191 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1196 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1197 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1198 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1199 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1200}; 1201 1202PyObject *PyUnicode_DecodeUTF8(const char *s, 1203 Py_ssize_t size, 1204 const char *errors) 1205{ 1206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1207} 1208 1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1210 Py_ssize_t size, 1211 const char *errors, 1212 Py_ssize_t *consumed) 1213{ 1214 const char *starts = s; 1215 int n; 1216 Py_ssize_t startinpos; 1217 Py_ssize_t endinpos; 1218 Py_ssize_t outpos; 1219 const char *e; 1220 PyUnicodeObject *unicode; 1221 Py_UNICODE *p; 1222 const char *errmsg = ""; 1223 PyObject *errorHandler = NULL; 1224 PyObject *exc = NULL; 1225 1226 /* Note: size will always be longer than the resulting Unicode 1227 character count */ 1228 unicode = _PyUnicode_New(size); 1229 if (!unicode) 1230 return NULL; 1231 if (size == 0) { 1232 if (consumed) 1233 *consumed = 0; 1234 return (PyObject *)unicode; 1235 } 1236 1237 /* Unpack UTF-8 encoded data */ 1238 p = unicode->str; 1239 e = s + size; 1240 1241 while (s < e) { 1242 Py_UCS4 ch = (unsigned char)*s; 1243 1244 if (ch < 0x80) { 1245 *p++ = (Py_UNICODE)ch; 1246 s++; 1247 continue; 1248 } 1249 1250 n = utf8_code_length[ch]; 1251 1252 if (s + n > e) { 1253 if (consumed) 1254 break; 1255 else { 1256 errmsg = "unexpected end of data"; 1257 startinpos = s-starts; 1258 endinpos = size; 1259 goto utf8Error; 1260 } 1261 } 1262 1263 switch (n) { 1264 1265 case 0: 1266 errmsg = "unexpected code byte"; 1267 startinpos = s-starts; 1268 endinpos = startinpos+1; 1269 goto utf8Error; 1270 1271 case 1: 1272 errmsg = "internal error"; 1273 startinpos = s-starts; 1274 endinpos = startinpos+1; 1275 goto utf8Error; 1276 1277 case 2: 1278 if ((s[1] & 0xc0) != 0x80) { 1279 errmsg = "invalid data"; 1280 startinpos = s-starts; 1281 endinpos = startinpos+2; 1282 goto utf8Error; 1283 } 1284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1285 if (ch < 0x80) { 1286 startinpos = s-starts; 1287 endinpos = startinpos+2; 1288 errmsg = "illegal encoding"; 1289 goto utf8Error; 1290 } 1291 else 1292 *p++ = (Py_UNICODE)ch; 1293 break; 1294 1295 case 3: 1296 if ((s[1] & 0xc0) != 0x80 || 1297 (s[2] & 0xc0) != 0x80) { 1298 errmsg = "invalid data"; 1299 startinpos = s-starts; 1300 endinpos = startinpos+3; 1301 goto utf8Error; 1302 } 1303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1304 if (ch < 0x0800) { 1305 /* Note: UTF-8 encodings of surrogates are considered 1306 legal UTF-8 sequences; 1307 1308 XXX For wide builds (UCS-4) we should probably try 1309 to recombine the surrogates into a single code 1310 unit. 1311 */ 1312 errmsg = "illegal encoding"; 1313 startinpos = s-starts; 1314 endinpos = startinpos+3; 1315 goto utf8Error; 1316 } 1317 else 1318 *p++ = (Py_UNICODE)ch; 1319 break; 1320 1321 case 4: 1322 if ((s[1] & 0xc0) != 0x80 || 1323 (s[2] & 0xc0) != 0x80 || 1324 (s[3] & 0xc0) != 0x80) { 1325 errmsg = "invalid data"; 1326 startinpos = s-starts; 1327 endinpos = startinpos+4; 1328 goto utf8Error; 1329 } 1330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1332 /* validate and convert to UTF-16 */ 1333 if ((ch < 0x10000) /* minimum value allowed for 4 1334 byte encoding */ 1335 || (ch > 0x10ffff)) /* maximum value allowed for 1336 UTF-16 */ 1337 { 1338 errmsg = "illegal encoding"; 1339 startinpos = s-starts; 1340 endinpos = startinpos+4; 1341 goto utf8Error; 1342 } 1343#ifdef Py_UNICODE_WIDE 1344 *p++ = (Py_UNICODE)ch; 1345#else 1346 /* compute and append the two surrogates: */ 1347 1348 /* translate from 10000..10FFFF to 0..FFFF */ 1349 ch -= 0x10000; 1350 1351 /* high surrogate = top 10 bits added to D800 */ 1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1353 1354 /* low surrogate = bottom 10 bits added to DC00 */ 1355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1356#endif 1357 break; 1358 1359 default: 1360 /* Other sizes are only needed for UCS-4 */ 1361 errmsg = "unsupported Unicode code range"; 1362 startinpos = s-starts; 1363 endinpos = startinpos+n; 1364 goto utf8Error; 1365 } 1366 s += n; 1367 continue; 1368 1369 utf8Error: 1370 outpos = p-PyUnicode_AS_UNICODE(unicode); 1371 if (unicode_decode_call_errorhandler( 1372 errors, &errorHandler, 1373 "utf8", errmsg, 1374 starts, size, &startinpos, &endinpos, &exc, &s, 1375 (PyObject **)&unicode, &outpos, &p)) 1376 goto onError; 1377 } 1378 if (consumed) 1379 *consumed = s-starts; 1380 1381 /* Adjust length */ 1382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1383 goto onError; 1384 1385 Py_XDECREF(errorHandler); 1386 Py_XDECREF(exc); 1387 return (PyObject *)unicode; 1388 1389onError: 1390 Py_XDECREF(errorHandler); 1391 Py_XDECREF(exc); 1392 Py_DECREF(unicode); 1393 return NULL; 1394} 1395 1396/* Allocation strategy: if the string is short, convert into a stack buffer 1397 and allocate exactly as much space needed at the end. Else allocate the 1398 maximum possible needed (4 result bytes per Unicode character), and return 1399 the excess memory at the end. 1400*/ 1401PyObject * 1402PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1403 Py_ssize_t size, 1404 const char *errors) 1405{ 1406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1407 1408 Py_ssize_t i; /* index into s of next input byte */ 1409 PyObject *v; /* result string object */ 1410 char *p; /* next free byte in output buffer */ 1411 Py_ssize_t nallocated; /* number of result bytes allocated */ 1412 Py_ssize_t nneeded; /* number of result bytes needed */ 1413 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1414 1415 assert(s != NULL); 1416 assert(size >= 0); 1417 1418 if (size <= MAX_SHORT_UNICHARS) { 1419 /* Write into the stack buffer; nallocated can't overflow. 1420 * At the end, we'll allocate exactly as much heap space as it 1421 * turns out we need. 1422 */ 1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1424 v = NULL; /* will allocate after we're done */ 1425 p = stackbuf; 1426 } 1427 else { 1428 /* Overallocate on the heap, and give the excess back at the end. */ 1429 nallocated = size * 4; 1430 if (nallocated / 4 != size) /* overflow! */ 1431 return PyErr_NoMemory(); 1432 v = PyString_FromStringAndSize(NULL, nallocated); 1433 if (v == NULL) 1434 return NULL; 1435 p = PyString_AS_STRING(v); 1436 } 1437 1438 for (i = 0; i < size;) { 1439 Py_UCS4 ch = s[i++]; 1440 1441 if (ch < 0x80) 1442 /* Encode ASCII */ 1443 *p++ = (char) ch; 1444 1445 else if (ch < 0x0800) { 1446 /* Encode Latin-1 */ 1447 *p++ = (char)(0xc0 | (ch >> 6)); 1448 *p++ = (char)(0x80 | (ch & 0x3f)); 1449 } 1450 else { 1451 /* Encode UCS2 Unicode ordinals */ 1452 if (ch < 0x10000) { 1453 /* Special case: check for high surrogate */ 1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1455 Py_UCS4 ch2 = s[i]; 1456 /* Check for low surrogate and combine the two to 1457 form a UCS4 value */ 1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1460 i++; 1461 goto encodeUCS4; 1462 } 1463 /* Fall through: handles isolated high surrogates */ 1464 } 1465 *p++ = (char)(0xe0 | (ch >> 12)); 1466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1467 *p++ = (char)(0x80 | (ch & 0x3f)); 1468 continue; 1469 } 1470encodeUCS4: 1471 /* Encode UCS4 Unicode ordinals */ 1472 *p++ = (char)(0xf0 | (ch >> 18)); 1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1475 *p++ = (char)(0x80 | (ch & 0x3f)); 1476 } 1477 } 1478 1479 if (v == NULL) { 1480 /* This was stack allocated. */ 1481 nneeded = p - stackbuf; 1482 assert(nneeded <= nallocated); 1483 v = PyString_FromStringAndSize(stackbuf, nneeded); 1484 } 1485 else { 1486 /* Cut back to size actually needed. */ 1487 nneeded = p - PyString_AS_STRING(v); 1488 assert(nneeded <= nallocated); 1489 _PyString_Resize(&v, nneeded); 1490 } 1491 return v; 1492 1493#undef MAX_SHORT_UNICHARS 1494} 1495 1496PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1497{ 1498 if (!PyUnicode_Check(unicode)) { 1499 PyErr_BadArgument(); 1500 return NULL; 1501 } 1502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1503 PyUnicode_GET_SIZE(unicode), 1504 NULL); 1505} 1506 1507/* --- UTF-16 Codec ------------------------------------------------------- */ 1508 1509PyObject * 1510PyUnicode_DecodeUTF16(const char *s, 1511 Py_ssize_t size, 1512 const char *errors, 1513 int *byteorder) 1514{ 1515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 1516} 1517 1518PyObject * 1519PyUnicode_DecodeUTF16Stateful(const char *s, 1520 Py_ssize_t size, 1521 const char *errors, 1522 int *byteorder, 1523 Py_ssize_t *consumed) 1524{ 1525 const char *starts = s; 1526 Py_ssize_t startinpos; 1527 Py_ssize_t endinpos; 1528 Py_ssize_t outpos; 1529 PyUnicodeObject *unicode; 1530 Py_UNICODE *p; 1531 const unsigned char *q, *e; 1532 int bo = 0; /* assume native ordering by default */ 1533 const char *errmsg = ""; 1534 /* Offsets from q for retrieving byte pairs in the right order. */ 1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1536 int ihi = 1, ilo = 0; 1537#else 1538 int ihi = 0, ilo = 1; 1539#endif 1540 PyObject *errorHandler = NULL; 1541 PyObject *exc = NULL; 1542 1543 /* Note: size will always be longer than the resulting Unicode 1544 character count */ 1545 unicode = _PyUnicode_New(size); 1546 if (!unicode) 1547 return NULL; 1548 if (size == 0) 1549 return (PyObject *)unicode; 1550 1551 /* Unpack UTF-16 encoded data */ 1552 p = unicode->str; 1553 q = (unsigned char *)s; 1554 e = q + size; 1555 1556 if (byteorder) 1557 bo = *byteorder; 1558 1559 /* Check for BOM marks (U+FEFF) in the input and adjust current 1560 byte order setting accordingly. In native mode, the leading BOM 1561 mark is skipped, in all other modes, it is copied to the output 1562 stream as-is (giving a ZWNBSP character). */ 1563 if (bo == 0) { 1564 if (size >= 2) { 1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1566#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1567 if (bom == 0xFEFF) { 1568 q += 2; 1569 bo = -1; 1570 } 1571 else if (bom == 0xFFFE) { 1572 q += 2; 1573 bo = 1; 1574 } 1575#else 1576 if (bom == 0xFEFF) { 1577 q += 2; 1578 bo = 1; 1579 } 1580 else if (bom == 0xFFFE) { 1581 q += 2; 1582 bo = -1; 1583 } 1584#endif 1585 } 1586 } 1587 1588 if (bo == -1) { 1589 /* force LE */ 1590 ihi = 1; 1591 ilo = 0; 1592 } 1593 else if (bo == 1) { 1594 /* force BE */ 1595 ihi = 0; 1596 ilo = 1; 1597 } 1598 1599 while (q < e) { 1600 Py_UNICODE ch; 1601 /* remaining bytes at the end? (size should be even) */ 1602 if (e-q<2) { 1603 if (consumed) 1604 break; 1605 errmsg = "truncated data"; 1606 startinpos = ((const char *)q)-starts; 1607 endinpos = ((const char *)e)-starts; 1608 goto utf16Error; 1609 /* The remaining input chars are ignored if the callback 1610 chooses to skip the input */ 1611 } 1612 ch = (q[ihi] << 8) | q[ilo]; 1613 1614 q += 2; 1615 1616 if (ch < 0xD800 || ch > 0xDFFF) { 1617 *p++ = ch; 1618 continue; 1619 } 1620 1621 /* UTF-16 code pair: */ 1622 if (q >= e) { 1623 errmsg = "unexpected end of data"; 1624 startinpos = (((const char *)q)-2)-starts; 1625 endinpos = ((const char *)e)-starts; 1626 goto utf16Error; 1627 } 1628 if (0xD800 <= ch && ch <= 0xDBFF) { 1629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1630 q += 2; 1631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1632#ifndef Py_UNICODE_WIDE 1633 *p++ = ch; 1634 *p++ = ch2; 1635#else 1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1637#endif 1638 continue; 1639 } 1640 else { 1641 errmsg = "illegal UTF-16 surrogate"; 1642 startinpos = (((const char *)q)-4)-starts; 1643 endinpos = startinpos+2; 1644 goto utf16Error; 1645 } 1646 1647 } 1648 errmsg = "illegal encoding"; 1649 startinpos = (((const char *)q)-2)-starts; 1650 endinpos = startinpos+2; 1651 /* Fall through to report the error */ 1652 1653 utf16Error: 1654 outpos = p-PyUnicode_AS_UNICODE(unicode); 1655 if (unicode_decode_call_errorhandler( 1656 errors, &errorHandler, 1657 "utf16", errmsg, 1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 1659 (PyObject **)&unicode, &outpos, &p)) 1660 goto onError; 1661 } 1662 1663 if (byteorder) 1664 *byteorder = bo; 1665 1666 if (consumed) 1667 *consumed = (const char *)q-starts; 1668 1669 /* Adjust length */ 1670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1671 goto onError; 1672 1673 Py_XDECREF(errorHandler); 1674 Py_XDECREF(exc); 1675 return (PyObject *)unicode; 1676 1677onError: 1678 Py_DECREF(unicode); 1679 Py_XDECREF(errorHandler); 1680 Py_XDECREF(exc); 1681 return NULL; 1682} 1683 1684PyObject * 1685PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1686 Py_ssize_t size, 1687 const char *errors, 1688 int byteorder) 1689{ 1690 PyObject *v; 1691 unsigned char *p; 1692#ifdef Py_UNICODE_WIDE 1693 int i, pairs; 1694#else 1695 const int pairs = 0; 1696#endif 1697 /* Offsets from p for storing byte pairs in the right order. */ 1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1699 int ihi = 1, ilo = 0; 1700#else 1701 int ihi = 0, ilo = 1; 1702#endif 1703 1704#define STORECHAR(CH) \ 1705 do { \ 1706 p[ihi] = ((CH) >> 8) & 0xff; \ 1707 p[ilo] = (CH) & 0xff; \ 1708 p += 2; \ 1709 } while(0) 1710 1711#ifdef Py_UNICODE_WIDE 1712 for (i = pairs = 0; i < size; i++) 1713 if (s[i] >= 0x10000) 1714 pairs++; 1715#endif 1716 v = PyString_FromStringAndSize(NULL, 1717 2 * (size + pairs + (byteorder == 0))); 1718 if (v == NULL) 1719 return NULL; 1720 1721 p = (unsigned char *)PyString_AS_STRING(v); 1722 if (byteorder == 0) 1723 STORECHAR(0xFEFF); 1724 if (size == 0) 1725 return v; 1726 1727 if (byteorder == -1) { 1728 /* force LE */ 1729 ihi = 1; 1730 ilo = 0; 1731 } 1732 else if (byteorder == 1) { 1733 /* force BE */ 1734 ihi = 0; 1735 ilo = 1; 1736 } 1737 1738 while (size-- > 0) { 1739 Py_UNICODE ch = *s++; 1740 Py_UNICODE ch2 = 0; 1741#ifdef Py_UNICODE_WIDE 1742 if (ch >= 0x10000) { 1743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1744 ch = 0xD800 | ((ch-0x10000) >> 10); 1745 } 1746#endif 1747 STORECHAR(ch); 1748 if (ch2) 1749 STORECHAR(ch2); 1750 } 1751 return v; 1752#undef STORECHAR 1753} 1754 1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1756{ 1757 if (!PyUnicode_Check(unicode)) { 1758 PyErr_BadArgument(); 1759 return NULL; 1760 } 1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1762 PyUnicode_GET_SIZE(unicode), 1763 NULL, 1764 0); 1765} 1766 1767/* --- Unicode Escape Codec ----------------------------------------------- */ 1768 1769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1770 1771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1772 Py_ssize_t size, 1773 const char *errors) 1774{ 1775 const char *starts = s; 1776 Py_ssize_t startinpos; 1777 Py_ssize_t endinpos; 1778 Py_ssize_t outpos; 1779 int i; 1780 PyUnicodeObject *v; 1781 Py_UNICODE *p; 1782 const char *end; 1783 char* message; 1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1785 PyObject *errorHandler = NULL; 1786 PyObject *exc = NULL; 1787 1788 /* Escaped strings will always be longer than the resulting 1789 Unicode string, so we start with size here and then reduce the 1790 length after conversion to the true value. 1791 (but if the error callback returns a long replacement string 1792 we'll have to allocate more space) */ 1793 v = _PyUnicode_New(size); 1794 if (v == NULL) 1795 goto onError; 1796 if (size == 0) 1797 return (PyObject *)v; 1798 1799 p = PyUnicode_AS_UNICODE(v); 1800 end = s + size; 1801 1802 while (s < end) { 1803 unsigned char c; 1804 Py_UNICODE x; 1805 int digits; 1806 1807 /* Non-escape characters are interpreted as Unicode ordinals */ 1808 if (*s != '\\') { 1809 *p++ = (unsigned char) *s++; 1810 continue; 1811 } 1812 1813 startinpos = s-starts; 1814 /* \ - Escapes */ 1815 s++; 1816 switch (*s++) { 1817 1818 /* \x escapes */ 1819 case '\n': break; 1820 case '\\': *p++ = '\\'; break; 1821 case '\'': *p++ = '\''; break; 1822 case '\"': *p++ = '\"'; break; 1823 case 'b': *p++ = '\b'; break; 1824 case 'f': *p++ = '\014'; break; /* FF */ 1825 case 't': *p++ = '\t'; break; 1826 case 'n': *p++ = '\n'; break; 1827 case 'r': *p++ = '\r'; break; 1828 case 'v': *p++ = '\013'; break; /* VT */ 1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1830 1831 /* \OOO (octal) escapes */ 1832 case '0': case '1': case '2': case '3': 1833 case '4': case '5': case '6': case '7': 1834 x = s[-1] - '0'; 1835 if ('0' <= *s && *s <= '7') { 1836 x = (x<<3) + *s++ - '0'; 1837 if ('0' <= *s && *s <= '7') 1838 x = (x<<3) + *s++ - '0'; 1839 } 1840 *p++ = x; 1841 break; 1842 1843 /* hex escapes */ 1844 /* \xXX */ 1845 case 'x': 1846 digits = 2; 1847 message = "truncated \\xXX escape"; 1848 goto hexescape; 1849 1850 /* \uXXXX */ 1851 case 'u': 1852 digits = 4; 1853 message = "truncated \\uXXXX escape"; 1854 goto hexescape; 1855 1856 /* \UXXXXXXXX */ 1857 case 'U': 1858 digits = 8; 1859 message = "truncated \\UXXXXXXXX escape"; 1860 hexescape: 1861 chr = 0; 1862 outpos = p-PyUnicode_AS_UNICODE(v); 1863 if (s+digits>end) { 1864 endinpos = size; 1865 if (unicode_decode_call_errorhandler( 1866 errors, &errorHandler, 1867 "unicodeescape", "end of string in escape sequence", 1868 starts, size, &startinpos, &endinpos, &exc, &s, 1869 (PyObject **)&v, &outpos, &p)) 1870 goto onError; 1871 goto nextByte; 1872 } 1873 for (i = 0; i < digits; ++i) { 1874 c = (unsigned char) s[i]; 1875 if (!isxdigit(c)) { 1876 endinpos = (s+i+1)-starts; 1877 if (unicode_decode_call_errorhandler( 1878 errors, &errorHandler, 1879 "unicodeescape", message, 1880 starts, size, &startinpos, &endinpos, &exc, &s, 1881 (PyObject **)&v, &outpos, &p)) 1882 goto onError; 1883 goto nextByte; 1884 } 1885 chr = (chr<<4) & ~0xF; 1886 if (c >= '0' && c <= '9') 1887 chr += c - '0'; 1888 else if (c >= 'a' && c <= 'f') 1889 chr += 10 + c - 'a'; 1890 else 1891 chr += 10 + c - 'A'; 1892 } 1893 s += i; 1894 if (chr == 0xffffffff && PyErr_Occurred()) 1895 /* _decoding_error will have already written into the 1896 target buffer. */ 1897 break; 1898 store: 1899 /* when we get here, chr is a 32-bit unicode character */ 1900 if (chr <= 0xffff) 1901 /* UCS-2 character */ 1902 *p++ = (Py_UNICODE) chr; 1903 else if (chr <= 0x10ffff) { 1904 /* UCS-4 character. Either store directly, or as 1905 surrogate pair. */ 1906#ifdef Py_UNICODE_WIDE 1907 *p++ = chr; 1908#else 1909 chr -= 0x10000L; 1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1912#endif 1913 } else { 1914 endinpos = s-starts; 1915 outpos = p-PyUnicode_AS_UNICODE(v); 1916 if (unicode_decode_call_errorhandler( 1917 errors, &errorHandler, 1918 "unicodeescape", "illegal Unicode character", 1919 starts, size, &startinpos, &endinpos, &exc, &s, 1920 (PyObject **)&v, &outpos, &p)) 1921 goto onError; 1922 } 1923 break; 1924 1925 /* \N{name} */ 1926 case 'N': 1927 message = "malformed \\N character escape"; 1928 if (ucnhash_CAPI == NULL) { 1929 /* load the unicode data module */ 1930 PyObject *m, *api; 1931 m = PyImport_ImportModule("unicodedata"); 1932 if (m == NULL) 1933 goto ucnhashError; 1934 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1935 Py_DECREF(m); 1936 if (api == NULL) 1937 goto ucnhashError; 1938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 1939 Py_DECREF(api); 1940 if (ucnhash_CAPI == NULL) 1941 goto ucnhashError; 1942 } 1943 if (*s == '{') { 1944 const char *start = s+1; 1945 /* look for the closing brace */ 1946 while (*s != '}' && s < end) 1947 s++; 1948 if (s > start && s < end && *s == '}') { 1949 /* found a name. look it up in the unicode database */ 1950 message = "unknown Unicode character name"; 1951 s++; 1952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 1953 goto store; 1954 } 1955 } 1956 endinpos = s-starts; 1957 outpos = p-PyUnicode_AS_UNICODE(v); 1958 if (unicode_decode_call_errorhandler( 1959 errors, &errorHandler, 1960 "unicodeescape", message, 1961 starts, size, &startinpos, &endinpos, &exc, &s, 1962 (PyObject **)&v, &outpos, &p)) 1963 goto onError; 1964 break; 1965 1966 default: 1967 if (s > end) { 1968 message = "\\ at end of string"; 1969 s--; 1970 endinpos = s-starts; 1971 outpos = p-PyUnicode_AS_UNICODE(v); 1972 if (unicode_decode_call_errorhandler( 1973 errors, &errorHandler, 1974 "unicodeescape", message, 1975 starts, size, &startinpos, &endinpos, &exc, &s, 1976 (PyObject **)&v, &outpos, &p)) 1977 goto onError; 1978 } 1979 else { 1980 *p++ = '\\'; 1981 *p++ = (unsigned char)s[-1]; 1982 } 1983 break; 1984 } 1985 nextByte: 1986 ; 1987 } 1988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 1989 goto onError; 1990 Py_XDECREF(errorHandler); 1991 Py_XDECREF(exc); 1992 return (PyObject *)v; 1993 1994ucnhashError: 1995 PyErr_SetString( 1996 PyExc_UnicodeError, 1997 "\\N escapes not supported (can't load unicodedata module)" 1998 ); 1999 Py_XDECREF(v); 2000 Py_XDECREF(errorHandler); 2001 Py_XDECREF(exc); 2002 return NULL; 2003 2004onError: 2005 Py_XDECREF(v); 2006 Py_XDECREF(errorHandler); 2007 Py_XDECREF(exc); 2008 return NULL; 2009} 2010 2011/* Return a Unicode-Escape string version of the Unicode object. 2012 2013 If quotes is true, the string is enclosed in u"" or u'' quotes as 2014 appropriate. 2015 2016*/ 2017 2018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2019 Py_ssize_t size, 2020 Py_UNICODE ch) 2021{ 2022 /* like wcschr, but doesn't stop at NULL characters */ 2023 2024 while (size-- > 0) { 2025 if (*s == ch) 2026 return s; 2027 s++; 2028 } 2029 2030 return NULL; 2031} 2032 2033static 2034PyObject *unicodeescape_string(const Py_UNICODE *s, 2035 Py_ssize_t size, 2036 int quotes) 2037{ 2038 PyObject *repr; 2039 char *p; 2040 2041 static const char *hexdigit = "0123456789abcdef"; 2042 2043 /* XXX(nnorwitz): rather than over-allocating, it would be 2044 better to choose a different scheme. Perhaps scan the 2045 first N-chars of the string and allocate based on that size. 2046 */ 2047 /* Initial allocation is based on the longest-possible unichr 2048 escape. 2049 2050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 2051 unichr, so in this case it's the longest unichr escape. In 2052 narrow (UTF-16) builds this is five chars per source unichr 2053 since there are two unichrs in the surrogate pair, so in narrow 2054 (UTF-16) builds it's not the longest unichr escape. 2055 2056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 2057 so in the narrow (UTF-16) build case it's the longest unichr 2058 escape. 2059 */ 2060 2061 repr = PyString_FromStringAndSize(NULL, 2062 2 2063#ifdef Py_UNICODE_WIDE 2064 + 10*size 2065#else 2066 + 6*size 2067#endif 2068 + 1); 2069 if (repr == NULL) 2070 return NULL; 2071 2072 p = PyString_AS_STRING(repr); 2073 2074 if (quotes) { 2075 *p++ = 'u'; 2076 *p++ = (findchar(s, size, '\'') && 2077 !findchar(s, size, '"')) ? '"' : '\''; 2078 } 2079 while (size-- > 0) { 2080 Py_UNICODE ch = *s++; 2081 2082 /* Escape quotes and backslashes */ 2083 if ((quotes && 2084 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { 2085 *p++ = '\\'; 2086 *p++ = (char) ch; 2087 continue; 2088 } 2089 2090#ifdef Py_UNICODE_WIDE 2091 /* Map 21-bit characters to '\U00xxxxxx' */ 2092 else if (ch >= 0x10000) { 2093 *p++ = '\\'; 2094 *p++ = 'U'; 2095 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 2096 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 2097 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 2098 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 2099 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 2100 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 2101 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 2102 *p++ = hexdigit[ch & 0x0000000F]; 2103 continue; 2104 } 2105#else 2106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 2107 else if (ch >= 0xD800 && ch < 0xDC00) { 2108 Py_UNICODE ch2; 2109 Py_UCS4 ucs; 2110 2111 ch2 = *s++; 2112 size--; 2113 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 2114 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 2115 *p++ = '\\'; 2116 *p++ = 'U'; 2117 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 2118 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 2119 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 2120 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 2121 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 2122 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 2123 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 2124 *p++ = hexdigit[ucs & 0x0000000F]; 2125 continue; 2126 } 2127 /* Fall through: isolated surrogates are copied as-is */ 2128 s--; 2129 size++; 2130 } 2131#endif 2132 2133 /* Map 16-bit characters to '\uxxxx' */ 2134 if (ch >= 256) { 2135 *p++ = '\\'; 2136 *p++ = 'u'; 2137 *p++ = hexdigit[(ch >> 12) & 0x000F]; 2138 *p++ = hexdigit[(ch >> 8) & 0x000F]; 2139 *p++ = hexdigit[(ch >> 4) & 0x000F]; 2140 *p++ = hexdigit[ch & 0x000F]; 2141 } 2142 2143 /* Map special whitespace to '\t', \n', '\r' */ 2144 else if (ch == '\t') { 2145 *p++ = '\\'; 2146 *p++ = 't'; 2147 } 2148 else if (ch == '\n') { 2149 *p++ = '\\'; 2150 *p++ = 'n'; 2151 } 2152 else if (ch == '\r') { 2153 *p++ = '\\'; 2154 *p++ = 'r'; 2155 } 2156 2157 /* Map non-printable US ASCII to '\xhh' */ 2158 else if (ch < ' ' || ch >= 0x7F) { 2159 *p++ = '\\'; 2160 *p++ = 'x'; 2161 *p++ = hexdigit[(ch >> 4) & 0x000F]; 2162 *p++ = hexdigit[ch & 0x000F]; 2163 } 2164 2165 /* Copy everything else as-is */ 2166 else 2167 *p++ = (char) ch; 2168 } 2169 if (quotes) 2170 *p++ = PyString_AS_STRING(repr)[1]; 2171 2172 *p = '\0'; 2173 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 2174 return repr; 2175} 2176 2177PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2178 Py_ssize_t size) 2179{ 2180 return unicodeescape_string(s, size, 0); 2181} 2182 2183PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 2184{ 2185 if (!PyUnicode_Check(unicode)) { 2186 PyErr_BadArgument(); 2187 return NULL; 2188 } 2189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2190 PyUnicode_GET_SIZE(unicode)); 2191} 2192 2193/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2194 2195PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2196 Py_ssize_t size, 2197 const char *errors) 2198{ 2199 const char *starts = s; 2200 Py_ssize_t startinpos; 2201 Py_ssize_t endinpos; 2202 Py_ssize_t outpos; 2203 PyUnicodeObject *v; 2204 Py_UNICODE *p; 2205 const char *end; 2206 const char *bs; 2207 PyObject *errorHandler = NULL; 2208 PyObject *exc = NULL; 2209 2210 /* Escaped strings will always be longer than the resulting 2211 Unicode string, so we start with size here and then reduce the 2212 length after conversion to the true value. (But decoding error 2213 handler might have to resize the string) */ 2214 v = _PyUnicode_New(size); 2215 if (v == NULL) 2216 goto onError; 2217 if (size == 0) 2218 return (PyObject *)v; 2219 p = PyUnicode_AS_UNICODE(v); 2220 end = s + size; 2221 while (s < end) { 2222 unsigned char c; 2223 Py_UCS4 x; 2224 int i; 2225 int count; 2226 2227 /* Non-escape characters are interpreted as Unicode ordinals */ 2228 if (*s != '\\') { 2229 *p++ = (unsigned char)*s++; 2230 continue; 2231 } 2232 startinpos = s-starts; 2233 2234 /* \u-escapes are only interpreted iff the number of leading 2235 backslashes if odd */ 2236 bs = s; 2237 for (;s < end;) { 2238 if (*s != '\\') 2239 break; 2240 *p++ = (unsigned char)*s++; 2241 } 2242 if (((s - bs) & 1) == 0 || 2243 s >= end || 2244 (*s != 'u' && *s != 'U')) { 2245 continue; 2246 } 2247 p--; 2248 count = *s=='u' ? 4 : 8; 2249 s++; 2250 2251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 2252 outpos = p-PyUnicode_AS_UNICODE(v); 2253 for (x = 0, i = 0; i < count; ++i, ++s) { 2254 c = (unsigned char)*s; 2255 if (!isxdigit(c)) { 2256 endinpos = s-starts; 2257 if (unicode_decode_call_errorhandler( 2258 errors, &errorHandler, 2259 "rawunicodeescape", "truncated \\uXXXX", 2260 starts, size, &startinpos, &endinpos, &exc, &s, 2261 (PyObject **)&v, &outpos, &p)) 2262 goto onError; 2263 goto nextByte; 2264 } 2265 x = (x<<4) & ~0xF; 2266 if (c >= '0' && c <= '9') 2267 x += c - '0'; 2268 else if (c >= 'a' && c <= 'f') 2269 x += 10 + c - 'a'; 2270 else 2271 x += 10 + c - 'A'; 2272 } 2273#ifndef Py_UNICODE_WIDE 2274 if (x > 0x10000) { 2275 if (unicode_decode_call_errorhandler( 2276 errors, &errorHandler, 2277 "rawunicodeescape", "\\Uxxxxxxxx out of range", 2278 starts, size, &startinpos, &endinpos, &exc, &s, 2279 (PyObject **)&v, &outpos, &p)) 2280 goto onError; 2281 } 2282#endif 2283 *p++ = x; 2284 nextByte: 2285 ; 2286 } 2287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2288 goto onError; 2289 Py_XDECREF(errorHandler); 2290 Py_XDECREF(exc); 2291 return (PyObject *)v; 2292 2293 onError: 2294 Py_XDECREF(v); 2295 Py_XDECREF(errorHandler); 2296 Py_XDECREF(exc); 2297 return NULL; 2298} 2299 2300PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2301 Py_ssize_t size) 2302{ 2303 PyObject *repr; 2304 char *p; 2305 char *q; 2306 2307 static const char *hexdigit = "0123456789abcdef"; 2308 2309#ifdef Py_UNICODE_WIDE 2310 repr = PyString_FromStringAndSize(NULL, 10 * size); 2311#else 2312 repr = PyString_FromStringAndSize(NULL, 6 * size); 2313#endif 2314 if (repr == NULL) 2315 return NULL; 2316 if (size == 0) 2317 return repr; 2318 2319 p = q = PyString_AS_STRING(repr); 2320 while (size-- > 0) { 2321 Py_UNICODE ch = *s++; 2322#ifdef Py_UNICODE_WIDE 2323 /* Map 32-bit characters to '\Uxxxxxxxx' */ 2324 if (ch >= 0x10000) { 2325 *p++ = '\\'; 2326 *p++ = 'U'; 2327 *p++ = hexdigit[(ch >> 28) & 0xf]; 2328 *p++ = hexdigit[(ch >> 24) & 0xf]; 2329 *p++ = hexdigit[(ch >> 20) & 0xf]; 2330 *p++ = hexdigit[(ch >> 16) & 0xf]; 2331 *p++ = hexdigit[(ch >> 12) & 0xf]; 2332 *p++ = hexdigit[(ch >> 8) & 0xf]; 2333 *p++ = hexdigit[(ch >> 4) & 0xf]; 2334 *p++ = hexdigit[ch & 15]; 2335 } 2336 else 2337#endif 2338 /* Map 16-bit characters to '\uxxxx' */ 2339 if (ch >= 256) { 2340 *p++ = '\\'; 2341 *p++ = 'u'; 2342 *p++ = hexdigit[(ch >> 12) & 0xf]; 2343 *p++ = hexdigit[(ch >> 8) & 0xf]; 2344 *p++ = hexdigit[(ch >> 4) & 0xf]; 2345 *p++ = hexdigit[ch & 15]; 2346 } 2347 /* Copy everything else as-is */ 2348 else 2349 *p++ = (char) ch; 2350 } 2351 *p = '\0'; 2352 _PyString_Resize(&repr, p - q); 2353 return repr; 2354} 2355 2356PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2357{ 2358 if (!PyUnicode_Check(unicode)) { 2359 PyErr_BadArgument(); 2360 return NULL; 2361 } 2362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2363 PyUnicode_GET_SIZE(unicode)); 2364} 2365 2366/* --- Unicode Internal Codec ------------------------------------------- */ 2367 2368PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 2369 Py_ssize_t size, 2370 const char *errors) 2371{ 2372 const char *starts = s; 2373 Py_ssize_t startinpos; 2374 Py_ssize_t endinpos; 2375 Py_ssize_t outpos; 2376 PyUnicodeObject *v; 2377 Py_UNICODE *p; 2378 const char *end; 2379 const char *reason; 2380 PyObject *errorHandler = NULL; 2381 PyObject *exc = NULL; 2382 2383#ifdef Py_UNICODE_WIDE 2384 Py_UNICODE unimax = PyUnicode_GetMax(); 2385#endif 2386 2387 /* XXX overflow detection missing */ 2388 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 2389 if (v == NULL) 2390 goto onError; 2391 if (PyUnicode_GetSize((PyObject *)v) == 0) 2392 return (PyObject *)v; 2393 p = PyUnicode_AS_UNICODE(v); 2394 end = s + size; 2395 2396 while (s < end) { 2397 memcpy(p, s, sizeof(Py_UNICODE)); 2398 /* We have to sanity check the raw data, otherwise doom looms for 2399 some malformed UCS-4 data. */ 2400 if ( 2401 #ifdef Py_UNICODE_WIDE 2402 *p > unimax || *p < 0 || 2403 #endif 2404 end-s < Py_UNICODE_SIZE 2405 ) 2406 { 2407 startinpos = s - starts; 2408 if (end-s < Py_UNICODE_SIZE) { 2409 endinpos = end-starts; 2410 reason = "truncated input"; 2411 } 2412 else { 2413 endinpos = s - starts + Py_UNICODE_SIZE; 2414 reason = "illegal code point (> 0x10FFFF)"; 2415 } 2416 outpos = p - PyUnicode_AS_UNICODE(v); 2417 if (unicode_decode_call_errorhandler( 2418 errors, &errorHandler, 2419 "unicode_internal", reason, 2420 starts, size, &startinpos, &endinpos, &exc, &s, 2421 (PyObject **)&v, &outpos, &p)) { 2422 goto onError; 2423 } 2424 } 2425 else { 2426 p++; 2427 s += Py_UNICODE_SIZE; 2428 } 2429 } 2430 2431 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2432 goto onError; 2433 Py_XDECREF(errorHandler); 2434 Py_XDECREF(exc); 2435 return (PyObject *)v; 2436 2437 onError: 2438 Py_XDECREF(v); 2439 Py_XDECREF(errorHandler); 2440 Py_XDECREF(exc); 2441 return NULL; 2442} 2443 2444/* --- Latin-1 Codec ------------------------------------------------------ */ 2445 2446PyObject *PyUnicode_DecodeLatin1(const char *s, 2447 Py_ssize_t size, 2448 const char *errors) 2449{ 2450 PyUnicodeObject *v; 2451 Py_UNICODE *p; 2452 2453 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2454 if (size == 1) { 2455 Py_UNICODE r = *(unsigned char*)s; 2456 return PyUnicode_FromUnicode(&r, 1); 2457 } 2458 2459 v = _PyUnicode_New(size); 2460 if (v == NULL) 2461 goto onError; 2462 if (size == 0) 2463 return (PyObject *)v; 2464 p = PyUnicode_AS_UNICODE(v); 2465 while (size-- > 0) 2466 *p++ = (unsigned char)*s++; 2467 return (PyObject *)v; 2468 2469 onError: 2470 Py_XDECREF(v); 2471 return NULL; 2472} 2473 2474/* create or adjust a UnicodeEncodeError */ 2475static void make_encode_exception(PyObject **exceptionObject, 2476 const char *encoding, 2477 const Py_UNICODE *unicode, Py_ssize_t size, 2478 Py_ssize_t startpos, Py_ssize_t endpos, 2479 const char *reason) 2480{ 2481 if (*exceptionObject == NULL) { 2482 *exceptionObject = PyUnicodeEncodeError_Create( 2483 encoding, unicode, size, startpos, endpos, reason); 2484 } 2485 else { 2486 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2487 goto onError; 2488 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2489 goto onError; 2490 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2491 goto onError; 2492 return; 2493 onError: 2494 Py_DECREF(*exceptionObject); 2495 *exceptionObject = NULL; 2496 } 2497} 2498 2499/* raises a UnicodeEncodeError */ 2500static void raise_encode_exception(PyObject **exceptionObject, 2501 const char *encoding, 2502 const Py_UNICODE *unicode, Py_ssize_t size, 2503 Py_ssize_t startpos, Py_ssize_t endpos, 2504 const char *reason) 2505{ 2506 make_encode_exception(exceptionObject, 2507 encoding, unicode, size, startpos, endpos, reason); 2508 if (*exceptionObject != NULL) 2509 PyCodec_StrictErrors(*exceptionObject); 2510} 2511 2512/* error handling callback helper: 2513 build arguments, call the callback and check the arguments, 2514 put the result into newpos and return the replacement string, which 2515 has to be freed by the caller */ 2516static PyObject *unicode_encode_call_errorhandler(const char *errors, 2517 PyObject **errorHandler, 2518 const char *encoding, const char *reason, 2519 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 2520 Py_ssize_t startpos, Py_ssize_t endpos, 2521 Py_ssize_t *newpos) 2522{ 2523 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 2524 2525 PyObject *restuple; 2526 PyObject *resunicode; 2527 2528 if (*errorHandler == NULL) { 2529 *errorHandler = PyCodec_LookupError(errors); 2530 if (*errorHandler == NULL) 2531 return NULL; 2532 } 2533 2534 make_encode_exception(exceptionObject, 2535 encoding, unicode, size, startpos, endpos, reason); 2536 if (*exceptionObject == NULL) 2537 return NULL; 2538 2539 restuple = PyObject_CallFunctionObjArgs( 2540 *errorHandler, *exceptionObject, NULL); 2541 if (restuple == NULL) 2542 return NULL; 2543 if (!PyTuple_Check(restuple)) { 2544 PyErr_Format(PyExc_TypeError, &argparse[4]); 2545 Py_DECREF(restuple); 2546 return NULL; 2547 } 2548 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 2549 &resunicode, newpos)) { 2550 Py_DECREF(restuple); 2551 return NULL; 2552 } 2553 if (*newpos<0) 2554 *newpos = size+*newpos; 2555 if (*newpos<0 || *newpos>size) { 2556 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 2557 Py_DECREF(restuple); 2558 return NULL; 2559 } 2560 Py_INCREF(resunicode); 2561 Py_DECREF(restuple); 2562 return resunicode; 2563} 2564 2565static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 2566 Py_ssize_t size, 2567 const char *errors, 2568 int limit) 2569{ 2570 /* output object */ 2571 PyObject *res; 2572 /* pointers to the beginning and end+1 of input */ 2573 const Py_UNICODE *startp = p; 2574 const Py_UNICODE *endp = p + size; 2575 /* pointer to the beginning of the unencodable characters */ 2576 /* const Py_UNICODE *badp = NULL; */ 2577 /* pointer into the output */ 2578 char *str; 2579 /* current output position */ 2580 Py_ssize_t respos = 0; 2581 Py_ssize_t ressize; 2582 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 2583 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 2584 PyObject *errorHandler = NULL; 2585 PyObject *exc = NULL; 2586 /* the following variable is used for caching string comparisons 2587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 2588 int known_errorHandler = -1; 2589 2590 /* allocate enough for a simple encoding without 2591 replacements, if we need more, we'll resize */ 2592 res = PyString_FromStringAndSize(NULL, size); 2593 if (res == NULL) 2594 goto onError; 2595 if (size == 0) 2596 return res; 2597 str = PyString_AS_STRING(res); 2598 ressize = size; 2599 2600 while (p<endp) { 2601 Py_UNICODE c = *p; 2602 2603 /* can we encode this? */ 2604 if (c<limit) { 2605 /* no overflow check, because we know that the space is enough */ 2606 *str++ = (char)c; 2607 ++p; 2608 } 2609 else { 2610 Py_ssize_t unicodepos = p-startp; 2611 Py_ssize_t requiredsize; 2612 PyObject *repunicode; 2613 Py_ssize_t repsize; 2614 Py_ssize_t newpos; 2615 Py_ssize_t respos; 2616 Py_UNICODE *uni2; 2617 /* startpos for collecting unencodable chars */ 2618 const Py_UNICODE *collstart = p; 2619 const Py_UNICODE *collend = p; 2620 /* find all unecodable characters */ 2621 while ((collend < endp) && ((*collend)>=limit)) 2622 ++collend; 2623 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 2624 if (known_errorHandler==-1) { 2625 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2626 known_errorHandler = 1; 2627 else if (!strcmp(errors, "replace")) 2628 known_errorHandler = 2; 2629 else if (!strcmp(errors, "ignore")) 2630 known_errorHandler = 3; 2631 else if (!strcmp(errors, "xmlcharrefreplace")) 2632 known_errorHandler = 4; 2633 else 2634 known_errorHandler = 0; 2635 } 2636 switch (known_errorHandler) { 2637 case 1: /* strict */ 2638 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 2639 goto onError; 2640 case 2: /* replace */ 2641 while (collstart++<collend) 2642 *str++ = '?'; /* fall through */ 2643 case 3: /* ignore */ 2644 p = collend; 2645 break; 2646 case 4: /* xmlcharrefreplace */ 2647 respos = str-PyString_AS_STRING(res); 2648 /* determine replacement size (temporarily (mis)uses p) */ 2649 for (p = collstart, repsize = 0; p < collend; ++p) { 2650 if (*p<10) 2651 repsize += 2+1+1; 2652 else if (*p<100) 2653 repsize += 2+2+1; 2654 else if (*p<1000) 2655 repsize += 2+3+1; 2656 else if (*p<10000) 2657 repsize += 2+4+1; 2658#ifndef Py_UNICODE_WIDE 2659 else 2660 repsize += 2+5+1; 2661#else 2662 else if (*p<100000) 2663 repsize += 2+5+1; 2664 else if (*p<1000000) 2665 repsize += 2+6+1; 2666 else 2667 repsize += 2+7+1; 2668#endif 2669 } 2670 requiredsize = respos+repsize+(endp-collend); 2671 if (requiredsize > ressize) { 2672 if (requiredsize<2*ressize) 2673 requiredsize = 2*ressize; 2674 if (_PyString_Resize(&res, requiredsize)) 2675 goto onError; 2676 str = PyString_AS_STRING(res) + respos; 2677 ressize = requiredsize; 2678 } 2679 /* generate replacement (temporarily (mis)uses p) */ 2680 for (p = collstart; p < collend; ++p) { 2681 str += sprintf(str, "&#%d;", (int)*p); 2682 } 2683 p = collend; 2684 break; 2685 default: 2686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2687 encoding, reason, startp, size, &exc, 2688 collstart-startp, collend-startp, &newpos); 2689 if (repunicode == NULL) 2690 goto onError; 2691 /* need more space? (at least enough for what we 2692 have+the replacement+the rest of the string, so 2693 we won't have to check space for encodable characters) */ 2694 respos = str-PyString_AS_STRING(res); 2695 repsize = PyUnicode_GET_SIZE(repunicode); 2696 requiredsize = respos+repsize+(endp-collend); 2697 if (requiredsize > ressize) { 2698 if (requiredsize<2*ressize) 2699 requiredsize = 2*ressize; 2700 if (_PyString_Resize(&res, requiredsize)) { 2701 Py_DECREF(repunicode); 2702 goto onError; 2703 } 2704 str = PyString_AS_STRING(res) + respos; 2705 ressize = requiredsize; 2706 } 2707 /* check if there is anything unencodable in the replacement 2708 and copy it to the output */ 2709 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 2710 c = *uni2; 2711 if (c >= limit) { 2712 raise_encode_exception(&exc, encoding, startp, size, 2713 unicodepos, unicodepos+1, reason); 2714 Py_DECREF(repunicode); 2715 goto onError; 2716 } 2717 *str = (char)c; 2718 } 2719 p = startp + newpos; 2720 Py_DECREF(repunicode); 2721 } 2722 } 2723 } 2724 /* Resize if we allocated to much */ 2725 respos = str-PyString_AS_STRING(res); 2726 if (respos<ressize) 2727 /* If this falls res will be NULL */ 2728 _PyString_Resize(&res, respos); 2729 Py_XDECREF(errorHandler); 2730 Py_XDECREF(exc); 2731 return res; 2732 2733 onError: 2734 Py_XDECREF(res); 2735 Py_XDECREF(errorHandler); 2736 Py_XDECREF(exc); 2737 return NULL; 2738} 2739 2740PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2741 Py_ssize_t size, 2742 const char *errors) 2743{ 2744 return unicode_encode_ucs1(p, size, errors, 256); 2745} 2746 2747PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2748{ 2749 if (!PyUnicode_Check(unicode)) { 2750 PyErr_BadArgument(); 2751 return NULL; 2752 } 2753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2754 PyUnicode_GET_SIZE(unicode), 2755 NULL); 2756} 2757 2758/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2759 2760PyObject *PyUnicode_DecodeASCII(const char *s, 2761 Py_ssize_t size, 2762 const char *errors) 2763{ 2764 const char *starts = s; 2765 PyUnicodeObject *v; 2766 Py_UNICODE *p; 2767 Py_ssize_t startinpos; 2768 Py_ssize_t endinpos; 2769 Py_ssize_t outpos; 2770 const char *e; 2771 PyObject *errorHandler = NULL; 2772 PyObject *exc = NULL; 2773 2774 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2775 if (size == 1 && *(unsigned char*)s < 128) { 2776 Py_UNICODE r = *(unsigned char*)s; 2777 return PyUnicode_FromUnicode(&r, 1); 2778 } 2779 2780 v = _PyUnicode_New(size); 2781 if (v == NULL) 2782 goto onError; 2783 if (size == 0) 2784 return (PyObject *)v; 2785 p = PyUnicode_AS_UNICODE(v); 2786 e = s + size; 2787 while (s < e) { 2788 register unsigned char c = (unsigned char)*s; 2789 if (c < 128) { 2790 *p++ = c; 2791 ++s; 2792 } 2793 else { 2794 startinpos = s-starts; 2795 endinpos = startinpos + 1; 2796 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 2797 if (unicode_decode_call_errorhandler( 2798 errors, &errorHandler, 2799 "ascii", "ordinal not in range(128)", 2800 starts, size, &startinpos, &endinpos, &exc, &s, 2801 (PyObject **)&v, &outpos, &p)) 2802 goto onError; 2803 } 2804 } 2805 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2807 goto onError; 2808 Py_XDECREF(errorHandler); 2809 Py_XDECREF(exc); 2810 return (PyObject *)v; 2811 2812 onError: 2813 Py_XDECREF(v); 2814 Py_XDECREF(errorHandler); 2815 Py_XDECREF(exc); 2816 return NULL; 2817} 2818 2819PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2820 Py_ssize_t size, 2821 const char *errors) 2822{ 2823 return unicode_encode_ucs1(p, size, errors, 128); 2824} 2825 2826PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2827{ 2828 if (!PyUnicode_Check(unicode)) { 2829 PyErr_BadArgument(); 2830 return NULL; 2831 } 2832 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2833 PyUnicode_GET_SIZE(unicode), 2834 NULL); 2835} 2836 2837#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 2838 2839/* --- MBCS codecs for Windows -------------------------------------------- */ 2840 2841#if SIZEOF_INT < SIZEOF_SSIZE_T 2842#define NEED_RETRY 2843#endif 2844 2845/* XXX This code is limited to "true" double-byte encodings, as 2846 a) it assumes an incomplete character consists of a single byte, and 2847 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 2848 encodings, see IsDBCSLeadByteEx documentation. */ 2849 2850static int is_dbcs_lead_byte(const char *s, int offset) 2851{ 2852 const char *curr = s + offset; 2853 2854 if (IsDBCSLeadByte(*curr)) { 2855 const char *prev = CharPrev(s, curr); 2856 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 2857 } 2858 return 0; 2859} 2860 2861/* 2862 * Decode MBCS string into unicode object. If 'final' is set, converts 2863 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 2864 */ 2865static int decode_mbcs(PyUnicodeObject **v, 2866 const char *s, /* MBCS string */ 2867 int size, /* sizeof MBCS string */ 2868 int final) 2869{ 2870 Py_UNICODE *p; 2871 Py_ssize_t n = 0; 2872 int usize = 0; 2873 2874 assert(size >= 0); 2875 2876 /* Skip trailing lead-byte unless 'final' is set */ 2877 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 2878 --size; 2879 2880 /* First get the size of the result */ 2881 if (size > 0) { 2882 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2883 if (usize == 0) { 2884 PyErr_SetFromWindowsErrWithFilename(0, NULL); 2885 return -1; 2886 } 2887 } 2888 2889 if (*v == NULL) { 2890 /* Create unicode object */ 2891 *v = _PyUnicode_New(usize); 2892 if (*v == NULL) 2893 return -1; 2894 } 2895 else { 2896 /* Extend unicode object */ 2897 n = PyUnicode_GET_SIZE(*v); 2898 if (_PyUnicode_Resize(v, n + usize) < 0) 2899 return -1; 2900 } 2901 2902 /* Do the conversion */ 2903 if (size > 0) { 2904 p = PyUnicode_AS_UNICODE(*v) + n; 2905 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2906 PyErr_SetFromWindowsErrWithFilename(0, NULL); 2907 return -1; 2908 } 2909 } 2910 2911 return size; 2912} 2913 2914PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 2915 Py_ssize_t size, 2916 const char *errors, 2917 Py_ssize_t *consumed) 2918{ 2919 PyUnicodeObject *v = NULL; 2920 int done; 2921 2922 if (consumed) 2923 *consumed = 0; 2924 2925#ifdef NEED_RETRY 2926 retry: 2927 if (size > INT_MAX) 2928 done = decode_mbcs(&v, s, INT_MAX, 0); 2929 else 2930#endif 2931 done = decode_mbcs(&v, s, (int)size, !consumed); 2932 2933 if (done < 0) { 2934 Py_XDECREF(v); 2935 return NULL; 2936 } 2937 2938 if (consumed) 2939 *consumed += done; 2940 2941#ifdef NEED_RETRY 2942 if (size > INT_MAX) { 2943 s += done; 2944 size -= done; 2945 goto retry; 2946 } 2947#endif 2948 2949 return (PyObject *)v; 2950} 2951 2952PyObject *PyUnicode_DecodeMBCS(const char *s, 2953 Py_ssize_t size, 2954 const char *errors) 2955{ 2956 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 2957} 2958 2959/* 2960 * Convert unicode into string object (MBCS). 2961 * Returns 0 if succeed, -1 otherwise. 2962 */ 2963static int encode_mbcs(PyObject **repr, 2964 const Py_UNICODE *p, /* unicode */ 2965 int size) /* size of unicode */ 2966{ 2967 int mbcssize = 0; 2968 Py_ssize_t n = 0; 2969 2970 assert(size >= 0); 2971 2972 /* First get the size of the result */ 2973 if (size > 0) { 2974 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2975 if (mbcssize == 0) { 2976 PyErr_SetFromWindowsErrWithFilename(0, NULL); 2977 return -1; 2978 } 2979 } 2980 2981 if (*repr == NULL) { 2982 /* Create string object */ 2983 *repr = PyString_FromStringAndSize(NULL, mbcssize); 2984 if (*repr == NULL) 2985 return -1; 2986 } 2987 else { 2988 /* Extend string object */ 2989 n = PyString_Size(*repr); 2990 if (_PyString_Resize(repr, n + mbcssize) < 0) 2991 return -1; 2992 } 2993 2994 /* Do the conversion */ 2995 if (size > 0) { 2996 char *s = PyString_AS_STRING(*repr) + n; 2997 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2998 PyErr_SetFromWindowsErrWithFilename(0, NULL); 2999 return -1; 3000 } 3001 } 3002 3003 return 0; 3004} 3005 3006PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 3007 Py_ssize_t size, 3008 const char *errors) 3009{ 3010 PyObject *repr = NULL; 3011 int ret; 3012 3013#ifdef NEED_RETRY 3014 retry: 3015 if (size > INT_MAX) 3016 ret = encode_mbcs(&repr, p, INT_MAX); 3017 else 3018#endif 3019 ret = encode_mbcs(&repr, p, (int)size); 3020 3021 if (ret < 0) { 3022 Py_XDECREF(repr); 3023 return NULL; 3024 } 3025 3026#ifdef NEED_RETRY 3027 if (size > INT_MAX) { 3028 p += INT_MAX; 3029 size -= INT_MAX; 3030 goto retry; 3031 } 3032#endif 3033 3034 return repr; 3035} 3036 3037PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 3038{ 3039 if (!PyUnicode_Check(unicode)) { 3040 PyErr_BadArgument(); 3041 return NULL; 3042 } 3043 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 3044 PyUnicode_GET_SIZE(unicode), 3045 NULL); 3046} 3047 3048#undef NEED_RETRY 3049 3050#endif /* MS_WINDOWS */ 3051 3052/* --- Character Mapping Codec -------------------------------------------- */ 3053 3054PyObject *PyUnicode_DecodeCharmap(const char *s, 3055 Py_ssize_t size, 3056 PyObject *mapping, 3057 const char *errors) 3058{ 3059 const char *starts = s; 3060 Py_ssize_t startinpos; 3061 Py_ssize_t endinpos; 3062 Py_ssize_t outpos; 3063 const char *e; 3064 PyUnicodeObject *v; 3065 Py_UNICODE *p; 3066 Py_ssize_t extrachars = 0; 3067 PyObject *errorHandler = NULL; 3068 PyObject *exc = NULL; 3069 Py_UNICODE *mapstring = NULL; 3070 Py_ssize_t maplen = 0; 3071 3072 /* Default to Latin-1 */ 3073 if (mapping == NULL) 3074 return PyUnicode_DecodeLatin1(s, size, errors); 3075 3076 v = _PyUnicode_New(size); 3077 if (v == NULL) 3078 goto onError; 3079 if (size == 0) 3080 return (PyObject *)v; 3081 p = PyUnicode_AS_UNICODE(v); 3082 e = s + size; 3083 if (PyUnicode_CheckExact(mapping)) { 3084 mapstring = PyUnicode_AS_UNICODE(mapping); 3085 maplen = PyUnicode_GET_SIZE(mapping); 3086 while (s < e) { 3087 unsigned char ch = *s; 3088 Py_UNICODE x = 0xfffe; /* illegal value */ 3089 3090 if (ch < maplen) 3091 x = mapstring[ch]; 3092 3093 if (x == 0xfffe) { 3094 /* undefined mapping */ 3095 outpos = p-PyUnicode_AS_UNICODE(v); 3096 startinpos = s-starts; 3097 endinpos = startinpos+1; 3098 if (unicode_decode_call_errorhandler( 3099 errors, &errorHandler, 3100 "charmap", "character maps to <undefined>", 3101 starts, size, &startinpos, &endinpos, &exc, &s, 3102 (PyObject **)&v, &outpos, &p)) { 3103 goto onError; 3104 } 3105 continue; 3106 } 3107 *p++ = x; 3108 ++s; 3109 } 3110 } 3111 else { 3112 while (s < e) { 3113 unsigned char ch = *s; 3114 PyObject *w, *x; 3115 3116 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 3117 w = PyInt_FromLong((long)ch); 3118 if (w == NULL) 3119 goto onError; 3120 x = PyObject_GetItem(mapping, w); 3121 Py_DECREF(w); 3122 if (x == NULL) { 3123 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3124 /* No mapping found means: mapping is undefined. */ 3125 PyErr_Clear(); 3126 x = Py_None; 3127 Py_INCREF(x); 3128 } else 3129 goto onError; 3130 } 3131 3132 /* Apply mapping */ 3133 if (PyInt_Check(x)) { 3134 long value = PyInt_AS_LONG(x); 3135 if (value < 0 || value > 65535) { 3136 PyErr_SetString(PyExc_TypeError, 3137 "character mapping must be in range(65536)"); 3138 Py_DECREF(x); 3139 goto onError; 3140 } 3141 *p++ = (Py_UNICODE)value; 3142 } 3143 else if (x == Py_None) { 3144 /* undefined mapping */ 3145 outpos = p-PyUnicode_AS_UNICODE(v); 3146 startinpos = s-starts; 3147 endinpos = startinpos+1; 3148 if (unicode_decode_call_errorhandler( 3149 errors, &errorHandler, 3150 "charmap", "character maps to <undefined>", 3151 starts, size, &startinpos, &endinpos, &exc, &s, 3152 (PyObject **)&v, &outpos, &p)) { 3153 Py_DECREF(x); 3154 goto onError; 3155 } 3156 Py_DECREF(x); 3157 continue; 3158 } 3159 else if (PyUnicode_Check(x)) { 3160 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 3161 3162 if (targetsize == 1) 3163 /* 1-1 mapping */ 3164 *p++ = *PyUnicode_AS_UNICODE(x); 3165 3166 else if (targetsize > 1) { 3167 /* 1-n mapping */ 3168 if (targetsize > extrachars) { 3169 /* resize first */ 3170 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 3171 Py_ssize_t needed = (targetsize - extrachars) + \ 3172 (targetsize << 2); 3173 extrachars += needed; 3174 /* XXX overflow detection missing */ 3175 if (_PyUnicode_Resize(&v, 3176 PyUnicode_GET_SIZE(v) + needed) < 0) { 3177 Py_DECREF(x); 3178 goto onError; 3179 } 3180 p = PyUnicode_AS_UNICODE(v) + oldpos; 3181 } 3182 Py_UNICODE_COPY(p, 3183 PyUnicode_AS_UNICODE(x), 3184 targetsize); 3185 p += targetsize; 3186 extrachars -= targetsize; 3187 } 3188 /* 1-0 mapping: skip the character */ 3189 } 3190 else { 3191 /* wrong return value */ 3192 PyErr_SetString(PyExc_TypeError, 3193 "character mapping must return integer, None or unicode"); 3194 Py_DECREF(x); 3195 goto onError; 3196 } 3197 Py_DECREF(x); 3198 ++s; 3199 } 3200 } 3201 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3202 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3203 goto onError; 3204 Py_XDECREF(errorHandler); 3205 Py_XDECREF(exc); 3206 return (PyObject *)v; 3207 3208 onError: 3209 Py_XDECREF(errorHandler); 3210 Py_XDECREF(exc); 3211 Py_XDECREF(v); 3212 return NULL; 3213} 3214 3215/* Charmap encoding: the lookup table */ 3216 3217struct encoding_map{ 3218 PyObject_HEAD 3219 unsigned char level1[32]; 3220 int count2, count3; 3221 unsigned char level23[1]; 3222}; 3223 3224static PyObject* 3225encoding_map_size(PyObject *obj, PyObject* args) 3226{ 3227 struct encoding_map *map = (struct encoding_map*)obj; 3228 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 3229 128*map->count3); 3230} 3231 3232static PyMethodDef encoding_map_methods[] = { 3233 {"size", encoding_map_size, METH_NOARGS, 3234 PyDoc_STR("Return the size (in bytes) of this object") }, 3235 { 0 } 3236}; 3237 3238static void 3239encoding_map_dealloc(PyObject* o) 3240{ 3241 PyObject_FREE(o); 3242} 3243 3244static PyTypeObject EncodingMapType = { 3245 PyObject_HEAD_INIT(NULL) 3246 0, /*ob_size*/ 3247 "EncodingMap", /*tp_name*/ 3248 sizeof(struct encoding_map), /*tp_basicsize*/ 3249 0, /*tp_itemsize*/ 3250 /* methods */ 3251 encoding_map_dealloc, /*tp_dealloc*/ 3252 0, /*tp_print*/ 3253 0, /*tp_getattr*/ 3254 0, /*tp_setattr*/ 3255 0, /*tp_compare*/ 3256 0, /*tp_repr*/ 3257 0, /*tp_as_number*/ 3258 0, /*tp_as_sequence*/ 3259 0, /*tp_as_mapping*/ 3260 0, /*tp_hash*/ 3261 0, /*tp_call*/ 3262 0, /*tp_str*/ 3263 0, /*tp_getattro*/ 3264 0, /*tp_setattro*/ 3265 0, /*tp_as_buffer*/ 3266 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 3267 0, /*tp_doc*/ 3268 0, /*tp_traverse*/ 3269 0, /*tp_clear*/ 3270 0, /*tp_richcompare*/ 3271 0, /*tp_weaklistoffset*/ 3272 0, /*tp_iter*/ 3273 0, /*tp_iternext*/ 3274 encoding_map_methods, /*tp_methods*/ 3275 0, /*tp_members*/ 3276 0, /*tp_getset*/ 3277 0, /*tp_base*/ 3278 0, /*tp_dict*/ 3279 0, /*tp_descr_get*/ 3280 0, /*tp_descr_set*/ 3281 0, /*tp_dictoffset*/ 3282 0, /*tp_init*/ 3283 0, /*tp_alloc*/ 3284 0, /*tp_new*/ 3285 0, /*tp_free*/ 3286 0, /*tp_is_gc*/ 3287}; 3288 3289PyObject* 3290PyUnicode_BuildEncodingMap(PyObject* string) 3291{ 3292 Py_UNICODE *decode; 3293 PyObject *result; 3294 struct encoding_map *mresult; 3295 int i; 3296 int need_dict = 0; 3297 unsigned char level1[32]; 3298 unsigned char level2[512]; 3299 unsigned char *mlevel1, *mlevel2, *mlevel3; 3300 int count2 = 0, count3 = 0; 3301 3302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 3303 PyErr_BadArgument(); 3304 return NULL; 3305 } 3306 decode = PyUnicode_AS_UNICODE(string); 3307 memset(level1, 0xFF, sizeof level1); 3308 memset(level2, 0xFF, sizeof level2); 3309 3310 /* If there isn't a one-to-one mapping of NULL to \0, 3311 or if there are non-BMP characters, we need to use 3312 a mapping dictionary. */ 3313 if (decode[0] != 0) 3314 need_dict = 1; 3315 for (i = 1; i < 256; i++) { 3316 int l1, l2; 3317 if (decode[i] == 0 3318 #ifdef Py_UNICODE_WIDE 3319 || decode[i] > 0xFFFF 3320 #endif 3321 ) { 3322 need_dict = 1; 3323 break; 3324 } 3325 if (decode[i] == 0xFFFE) 3326 /* unmapped character */ 3327 continue; 3328 l1 = decode[i] >> 11; 3329 l2 = decode[i] >> 7; 3330 if (level1[l1] == 0xFF) 3331 level1[l1] = count2++; 3332 if (level2[l2] == 0xFF) 3333 level2[l2] = count3++; 3334 } 3335 3336 if (count2 >= 0xFF || count3 >= 0xFF) 3337 need_dict = 1; 3338 3339 if (need_dict) { 3340 PyObject *result = PyDict_New(); 3341 PyObject *key, *value; 3342 if (!result) 3343 return NULL; 3344 for (i = 0; i < 256; i++) { 3345 key = value = NULL; 3346 key = PyInt_FromLong(decode[i]); 3347 value = PyInt_FromLong(i); 3348 if (!key || !value) 3349 goto failed1; 3350 if (PyDict_SetItem(result, key, value) == -1) 3351 goto failed1; 3352 Py_DECREF(key); 3353 Py_DECREF(value); 3354 } 3355 return result; 3356 failed1: 3357 Py_XDECREF(key); 3358 Py_XDECREF(value); 3359 Py_DECREF(result); 3360 return NULL; 3361 } 3362 3363 /* Create a three-level trie */ 3364 result = PyObject_MALLOC(sizeof(struct encoding_map) + 3365 16*count2 + 128*count3 - 1); 3366 if (!result) 3367 return PyErr_NoMemory(); 3368 PyObject_Init(result, &EncodingMapType); 3369 mresult = (struct encoding_map*)result; 3370 mresult->count2 = count2; 3371 mresult->count3 = count3; 3372 mlevel1 = mresult->level1; 3373 mlevel2 = mresult->level23; 3374 mlevel3 = mresult->level23 + 16*count2; 3375 memcpy(mlevel1, level1, 32); 3376 memset(mlevel2, 0xFF, 16*count2); 3377 memset(mlevel3, 0, 128*count3); 3378 count3 = 0; 3379 for (i = 1; i < 256; i++) { 3380 int o1, o2, o3, i2, i3; 3381 if (decode[i] == 0xFFFE) 3382 /* unmapped character */ 3383 continue; 3384 o1 = decode[i]>>11; 3385 o2 = (decode[i]>>7) & 0xF; 3386 i2 = 16*mlevel1[o1] + o2; 3387 if (mlevel2[i2] == 0xFF) 3388 mlevel2[i2] = count3++; 3389 o3 = decode[i] & 0x7F; 3390 i3 = 128*mlevel2[i2] + o3; 3391 mlevel3[i3] = i; 3392 } 3393 return result; 3394} 3395 3396static int 3397encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 3398{ 3399 struct encoding_map *map = (struct encoding_map*)mapping; 3400 int l1 = c>>11; 3401 int l2 = (c>>7) & 0xF; 3402 int l3 = c & 0x7F; 3403 int i; 3404 3405#ifdef Py_UNICODE_WIDE 3406 if (c > 0xFFFF) { 3407 return -1; 3408 } 3409#endif 3410 if (c == 0) 3411 return 0; 3412 /* level 1*/ 3413 i = map->level1[l1]; 3414 if (i == 0xFF) { 3415 return -1; 3416 } 3417 /* level 2*/ 3418 i = map->level23[16*i+l2]; 3419 if (i == 0xFF) { 3420 return -1; 3421 } 3422 /* level 3 */ 3423 i = map->level23[16*map->count2 + 128*i + l3]; 3424 if (i == 0) { 3425 return -1; 3426 } 3427 return i; 3428} 3429 3430/* Lookup the character ch in the mapping. If the character 3431 can't be found, Py_None is returned (or NULL, if another 3432 error occurred). */ 3433static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 3434{ 3435 PyObject *w = PyInt_FromLong((long)c); 3436 PyObject *x; 3437 3438 if (w == NULL) 3439 return NULL; 3440 x = PyObject_GetItem(mapping, w); 3441 Py_DECREF(w); 3442 if (x == NULL) { 3443 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3444 /* No mapping found means: mapping is undefined. */ 3445 PyErr_Clear(); 3446 x = Py_None; 3447 Py_INCREF(x); 3448 return x; 3449 } else 3450 return NULL; 3451 } 3452 else if (x == Py_None) 3453 return x; 3454 else if (PyInt_Check(x)) { 3455 long value = PyInt_AS_LONG(x); 3456 if (value < 0 || value > 255) { 3457 PyErr_SetString(PyExc_TypeError, 3458 "character mapping must be in range(256)"); 3459 Py_DECREF(x); 3460 return NULL; 3461 } 3462 return x; 3463 } 3464 else if (PyString_Check(x)) 3465 return x; 3466 else { 3467 /* wrong return value */ 3468 PyErr_SetString(PyExc_TypeError, 3469 "character mapping must return integer, None or str"); 3470 Py_DECREF(x); 3471 return NULL; 3472 } 3473} 3474 3475static int 3476charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 3477{ 3478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 3479 /* exponentially overallocate to minimize reallocations */ 3480 if (requiredsize < 2*outsize) 3481 requiredsize = 2*outsize; 3482 if (_PyString_Resize(outobj, requiredsize)) { 3483 return 0; 3484 } 3485 return 1; 3486} 3487 3488typedef enum charmapencode_result { 3489 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 3490}charmapencode_result; 3491/* lookup the character, put the result in the output string and adjust 3492 various state variables. Reallocate the output string if not enough 3493 space is available. Return a new reference to the object that 3494 was put in the output buffer, or Py_None, if the mapping was undefined 3495 (in which case no character was written) or NULL, if a 3496 reallocation error occurred. The caller must decref the result */ 3497static 3498charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 3499 PyObject **outobj, Py_ssize_t *outpos) 3500{ 3501 PyObject *rep; 3502 char *outstart; 3503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 3504 3505 if (mapping->ob_type == &EncodingMapType) { 3506 int res = encoding_map_lookup(c, mapping); 3507 Py_ssize_t requiredsize = *outpos+1; 3508 if (res == -1) 3509 return enc_FAILED; 3510 if (outsize<requiredsize) 3511 if (!charmapencode_resize(outobj, outpos, requiredsize)) 3512 return enc_EXCEPTION; 3513 outstart = PyString_AS_STRING(*outobj); 3514 outstart[(*outpos)++] = (char)res; 3515 return enc_SUCCESS; 3516 } 3517 3518 rep = charmapencode_lookup(c, mapping); 3519 if (rep==NULL) 3520 return enc_EXCEPTION; 3521 else if (rep==Py_None) { 3522 Py_DECREF(rep); 3523 return enc_FAILED; 3524 } else { 3525 if (PyInt_Check(rep)) { 3526 Py_ssize_t requiredsize = *outpos+1; 3527 if (outsize<requiredsize) 3528 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 3529 Py_DECREF(rep); 3530 return enc_EXCEPTION; 3531 } 3532 outstart = PyString_AS_STRING(*outobj); 3533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 3534 } 3535 else { 3536 const char *repchars = PyString_AS_STRING(rep); 3537 Py_ssize_t repsize = PyString_GET_SIZE(rep); 3538 Py_ssize_t requiredsize = *outpos+repsize; 3539 if (outsize<requiredsize) 3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 3541 Py_DECREF(rep); 3542 return enc_EXCEPTION; 3543 } 3544 outstart = PyString_AS_STRING(*outobj); 3545 memcpy(outstart + *outpos, repchars, repsize); 3546 *outpos += repsize; 3547 } 3548 } 3549 Py_DECREF(rep); 3550 return enc_SUCCESS; 3551} 3552 3553/* handle an error in PyUnicode_EncodeCharmap 3554 Return 0 on success, -1 on error */ 3555static 3556int charmap_encoding_error( 3557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 3558 PyObject **exceptionObject, 3559 int *known_errorHandler, PyObject **errorHandler, const char *errors, 3560 PyObject **res, Py_ssize_t *respos) 3561{ 3562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 3563 Py_ssize_t repsize; 3564 Py_ssize_t newpos; 3565 Py_UNICODE *uni2; 3566 /* startpos for collecting unencodable chars */ 3567 Py_ssize_t collstartpos = *inpos; 3568 Py_ssize_t collendpos = *inpos+1; 3569 Py_ssize_t collpos; 3570 char *encoding = "charmap"; 3571 char *reason = "character maps to <undefined>"; 3572 charmapencode_result x; 3573 3574 /* find all unencodable characters */ 3575 while (collendpos < size) { 3576 PyObject *rep; 3577 if (mapping->ob_type == &EncodingMapType) { 3578 int res = encoding_map_lookup(p[collendpos], mapping); 3579 if (res != -1) 3580 break; 3581 ++collendpos; 3582 continue; 3583 } 3584 3585 rep = charmapencode_lookup(p[collendpos], mapping); 3586 if (rep==NULL) 3587 return -1; 3588 else if (rep!=Py_None) { 3589 Py_DECREF(rep); 3590 break; 3591 } 3592 Py_DECREF(rep); 3593 ++collendpos; 3594 } 3595 /* cache callback name lookup 3596 * (if not done yet, i.e. it's the first error) */ 3597 if (*known_errorHandler==-1) { 3598 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3599 *known_errorHandler = 1; 3600 else if (!strcmp(errors, "replace")) 3601 *known_errorHandler = 2; 3602 else if (!strcmp(errors, "ignore")) 3603 *known_errorHandler = 3; 3604 else if (!strcmp(errors, "xmlcharrefreplace")) 3605 *known_errorHandler = 4; 3606 else 3607 *known_errorHandler = 0; 3608 } 3609 switch (*known_errorHandler) { 3610 case 1: /* strict */ 3611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3612 return -1; 3613 case 2: /* replace */ 3614 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 3615 x = charmapencode_output('?', mapping, res, respos); 3616 if (x==enc_EXCEPTION) { 3617 return -1; 3618 } 3619 else if (x==enc_FAILED) { 3620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3621 return -1; 3622 } 3623 } 3624 /* fall through */ 3625 case 3: /* ignore */ 3626 *inpos = collendpos; 3627 break; 3628 case 4: /* xmlcharrefreplace */ 3629 /* generate replacement (temporarily (mis)uses p) */ 3630 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 3631 char buffer[2+29+1+1]; 3632 char *cp; 3633 sprintf(buffer, "&#%d;", (int)p[collpos]); 3634 for (cp = buffer; *cp; ++cp) { 3635 x = charmapencode_output(*cp, mapping, res, respos); 3636 if (x==enc_EXCEPTION) 3637 return -1; 3638 else if (x==enc_FAILED) { 3639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3640 return -1; 3641 } 3642 } 3643 } 3644 *inpos = collendpos; 3645 break; 3646 default: 3647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 3648 encoding, reason, p, size, exceptionObject, 3649 collstartpos, collendpos, &newpos); 3650 if (repunicode == NULL) 3651 return -1; 3652 /* generate replacement */ 3653 repsize = PyUnicode_GET_SIZE(repunicode); 3654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 3655 x = charmapencode_output(*uni2, mapping, res, respos); 3656 if (x==enc_EXCEPTION) { 3657 return -1; 3658 } 3659 else if (x==enc_FAILED) { 3660 Py_DECREF(repunicode); 3661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3662 return -1; 3663 } 3664 } 3665 *inpos = newpos; 3666 Py_DECREF(repunicode); 3667 } 3668 return 0; 3669} 3670 3671PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 3672 Py_ssize_t size, 3673 PyObject *mapping, 3674 const char *errors) 3675{ 3676 /* output object */ 3677 PyObject *res = NULL; 3678 /* current input position */ 3679 Py_ssize_t inpos = 0; 3680 /* current output position */ 3681 Py_ssize_t respos = 0; 3682 PyObject *errorHandler = NULL; 3683 PyObject *exc = NULL; 3684 /* the following variable is used for caching string comparisons 3685 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3686 * 3=ignore, 4=xmlcharrefreplace */ 3687 int known_errorHandler = -1; 3688 3689 /* Default to Latin-1 */ 3690 if (mapping == NULL) 3691 return PyUnicode_EncodeLatin1(p, size, errors); 3692 3693 /* allocate enough for a simple encoding without 3694 replacements, if we need more, we'll resize */ 3695 res = PyString_FromStringAndSize(NULL, size); 3696 if (res == NULL) 3697 goto onError; 3698 if (size == 0) 3699 return res; 3700 3701 while (inpos<size) { 3702 /* try to encode it */ 3703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 3704 if (x==enc_EXCEPTION) /* error */ 3705 goto onError; 3706 if (x==enc_FAILED) { /* unencodable character */ 3707 if (charmap_encoding_error(p, size, &inpos, mapping, 3708 &exc, 3709 &known_errorHandler, &errorHandler, errors, 3710 &res, &respos)) { 3711 goto onError; 3712 } 3713 } 3714 else 3715 /* done with this character => adjust input position */ 3716 ++inpos; 3717 } 3718 3719 /* Resize if we allocated to much */ 3720 if (respos<PyString_GET_SIZE(res)) { 3721 if (_PyString_Resize(&res, respos)) 3722 goto onError; 3723 } 3724 Py_XDECREF(exc); 3725 Py_XDECREF(errorHandler); 3726 return res; 3727 3728 onError: 3729 Py_XDECREF(res); 3730 Py_XDECREF(exc); 3731 Py_XDECREF(errorHandler); 3732 return NULL; 3733} 3734 3735PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 3736 PyObject *mapping) 3737{ 3738 if (!PyUnicode_Check(unicode) || mapping == NULL) { 3739 PyErr_BadArgument(); 3740 return NULL; 3741 } 3742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 3743 PyUnicode_GET_SIZE(unicode), 3744 mapping, 3745 NULL); 3746} 3747 3748/* create or adjust a UnicodeTranslateError */ 3749static void make_translate_exception(PyObject **exceptionObject, 3750 const Py_UNICODE *unicode, Py_ssize_t size, 3751 Py_ssize_t startpos, Py_ssize_t endpos, 3752 const char *reason) 3753{ 3754 if (*exceptionObject == NULL) { 3755 *exceptionObject = PyUnicodeTranslateError_Create( 3756 unicode, size, startpos, endpos, reason); 3757 } 3758 else { 3759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 3760 goto onError; 3761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 3762 goto onError; 3763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 3764 goto onError; 3765 return; 3766 onError: 3767 Py_DECREF(*exceptionObject); 3768 *exceptionObject = NULL; 3769 } 3770} 3771 3772/* raises a UnicodeTranslateError */ 3773static void raise_translate_exception(PyObject **exceptionObject, 3774 const Py_UNICODE *unicode, Py_ssize_t size, 3775 Py_ssize_t startpos, Py_ssize_t endpos, 3776 const char *reason) 3777{ 3778 make_translate_exception(exceptionObject, 3779 unicode, size, startpos, endpos, reason); 3780 if (*exceptionObject != NULL) 3781 PyCodec_StrictErrors(*exceptionObject); 3782} 3783 3784/* error handling callback helper: 3785 build arguments, call the callback and check the arguments, 3786 put the result into newpos and return the replacement string, which 3787 has to be freed by the caller */ 3788static PyObject *unicode_translate_call_errorhandler(const char *errors, 3789 PyObject **errorHandler, 3790 const char *reason, 3791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3792 Py_ssize_t startpos, Py_ssize_t endpos, 3793 Py_ssize_t *newpos) 3794{ 3795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 3796 3797 Py_ssize_t i_newpos; 3798 PyObject *restuple; 3799 PyObject *resunicode; 3800 3801 if (*errorHandler == NULL) { 3802 *errorHandler = PyCodec_LookupError(errors); 3803 if (*errorHandler == NULL) 3804 return NULL; 3805 } 3806 3807 make_translate_exception(exceptionObject, 3808 unicode, size, startpos, endpos, reason); 3809 if (*exceptionObject == NULL) 3810 return NULL; 3811 3812 restuple = PyObject_CallFunctionObjArgs( 3813 *errorHandler, *exceptionObject, NULL); 3814 if (restuple == NULL) 3815 return NULL; 3816 if (!PyTuple_Check(restuple)) { 3817 PyErr_Format(PyExc_TypeError, &argparse[4]); 3818 Py_DECREF(restuple); 3819 return NULL; 3820 } 3821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3822 &resunicode, &i_newpos)) { 3823 Py_DECREF(restuple); 3824 return NULL; 3825 } 3826 if (i_newpos<0) 3827 *newpos = size+i_newpos; 3828 else 3829 *newpos = i_newpos; 3830 if (*newpos<0 || *newpos>size) { 3831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3832 Py_DECREF(restuple); 3833 return NULL; 3834 } 3835 Py_INCREF(resunicode); 3836 Py_DECREF(restuple); 3837 return resunicode; 3838} 3839 3840/* Lookup the character ch in the mapping and put the result in result, 3841 which must be decrefed by the caller. 3842 Return 0 on success, -1 on error */ 3843static 3844int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 3845{ 3846 PyObject *w = PyInt_FromLong((long)c); 3847 PyObject *x; 3848 3849 if (w == NULL) 3850 return -1; 3851 x = PyObject_GetItem(mapping, w); 3852 Py_DECREF(w); 3853 if (x == NULL) { 3854 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3855 /* No mapping found means: use 1:1 mapping. */ 3856 PyErr_Clear(); 3857 *result = NULL; 3858 return 0; 3859 } else 3860 return -1; 3861 } 3862 else if (x == Py_None) { 3863 *result = x; 3864 return 0; 3865 } 3866 else if (PyInt_Check(x)) { 3867 long value = PyInt_AS_LONG(x); 3868 long max = PyUnicode_GetMax(); 3869 if (value < 0 || value > max) { 3870 PyErr_Format(PyExc_TypeError, 3871 "character mapping must be in range(0x%lx)", max+1); 3872 Py_DECREF(x); 3873 return -1; 3874 } 3875 *result = x; 3876 return 0; 3877 } 3878 else if (PyUnicode_Check(x)) { 3879 *result = x; 3880 return 0; 3881 } 3882 else { 3883 /* wrong return value */ 3884 PyErr_SetString(PyExc_TypeError, 3885 "character mapping must return integer, None or unicode"); 3886 Py_DECREF(x); 3887 return -1; 3888 } 3889} 3890/* ensure that *outobj is at least requiredsize characters long, 3891if not reallocate and adjust various state variables. 3892Return 0 on success, -1 on error */ 3893static 3894int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 3895 Py_ssize_t requiredsize) 3896{ 3897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 3898 if (requiredsize > oldsize) { 3899 /* remember old output position */ 3900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 3901 /* exponentially overallocate to minimize reallocations */ 3902 if (requiredsize < 2 * oldsize) 3903 requiredsize = 2 * oldsize; 3904 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 3905 return -1; 3906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 3907 } 3908 return 0; 3909} 3910/* lookup the character, put the result in the output string and adjust 3911 various state variables. Return a new reference to the object that 3912 was put in the output buffer in *result, or Py_None, if the mapping was 3913 undefined (in which case no character was written). 3914 The called must decref result. 3915 Return 0 on success, -1 on error. */ 3916static 3917int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 3918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 3919 PyObject **res) 3920{ 3921 if (charmaptranslate_lookup(*curinp, mapping, res)) 3922 return -1; 3923 if (*res==NULL) { 3924 /* not found => default to 1:1 mapping */ 3925 *(*outp)++ = *curinp; 3926 } 3927 else if (*res==Py_None) 3928 ; 3929 else if (PyInt_Check(*res)) { 3930 /* no overflow check, because we know that the space is enough */ 3931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 3932 } 3933 else if (PyUnicode_Check(*res)) { 3934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 3935 if (repsize==1) { 3936 /* no overflow check, because we know that the space is enough */ 3937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 3938 } 3939 else if (repsize!=0) { 3940 /* more than one character */ 3941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 3942 (insize - (curinp-startinp)) + 3943 repsize - 1; 3944 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 3945 return -1; 3946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 3947 *outp += repsize; 3948 } 3949 } 3950 else 3951 return -1; 3952 return 0; 3953} 3954 3955PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 3956 Py_ssize_t size, 3957 PyObject *mapping, 3958 const char *errors) 3959{ 3960 /* output object */ 3961 PyObject *res = NULL; 3962 /* pointers to the beginning and end+1 of input */ 3963 const Py_UNICODE *startp = p; 3964 const Py_UNICODE *endp = p + size; 3965 /* pointer into the output */ 3966 Py_UNICODE *str; 3967 /* current output position */ 3968 Py_ssize_t respos = 0; 3969 char *reason = "character maps to <undefined>"; 3970 PyObject *errorHandler = NULL; 3971 PyObject *exc = NULL; 3972 /* the following variable is used for caching string comparisons 3973 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3974 * 3=ignore, 4=xmlcharrefreplace */ 3975 int known_errorHandler = -1; 3976 3977 if (mapping == NULL) { 3978 PyErr_BadArgument(); 3979 return NULL; 3980 } 3981 3982 /* allocate enough for a simple 1:1 translation without 3983 replacements, if we need more, we'll resize */ 3984 res = PyUnicode_FromUnicode(NULL, size); 3985 if (res == NULL) 3986 goto onError; 3987 if (size == 0) 3988 return res; 3989 str = PyUnicode_AS_UNICODE(res); 3990 3991 while (p<endp) { 3992 /* try to encode it */ 3993 PyObject *x = NULL; 3994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 3995 Py_XDECREF(x); 3996 goto onError; 3997 } 3998 Py_XDECREF(x); 3999 if (x!=Py_None) /* it worked => adjust input pointer */ 4000 ++p; 4001 else { /* untranslatable character */ 4002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4003 Py_ssize_t repsize; 4004 Py_ssize_t newpos; 4005 Py_UNICODE *uni2; 4006 /* startpos for collecting untranslatable chars */ 4007 const Py_UNICODE *collstart = p; 4008 const Py_UNICODE *collend = p+1; 4009 const Py_UNICODE *coll; 4010 4011 /* find all untranslatable characters */ 4012 while (collend < endp) { 4013 if (charmaptranslate_lookup(*collend, mapping, &x)) 4014 goto onError; 4015 Py_XDECREF(x); 4016 if (x!=Py_None) 4017 break; 4018 ++collend; 4019 } 4020 /* cache callback name lookup 4021 * (if not done yet, i.e. it's the first error) */ 4022 if (known_errorHandler==-1) { 4023 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4024 known_errorHandler = 1; 4025 else if (!strcmp(errors, "replace")) 4026 known_errorHandler = 2; 4027 else if (!strcmp(errors, "ignore")) 4028 known_errorHandler = 3; 4029 else if (!strcmp(errors, "xmlcharrefreplace")) 4030 known_errorHandler = 4; 4031 else 4032 known_errorHandler = 0; 4033 } 4034 switch (known_errorHandler) { 4035 case 1: /* strict */ 4036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 4037 goto onError; 4038 case 2: /* replace */ 4039 /* No need to check for space, this is a 1:1 replacement */ 4040 for (coll = collstart; coll<collend; ++coll) 4041 *str++ = '?'; 4042 /* fall through */ 4043 case 3: /* ignore */ 4044 p = collend; 4045 break; 4046 case 4: /* xmlcharrefreplace */ 4047 /* generate replacement (temporarily (mis)uses p) */ 4048 for (p = collstart; p < collend; ++p) { 4049 char buffer[2+29+1+1]; 4050 char *cp; 4051 sprintf(buffer, "&#%d;", (int)*p); 4052 if (charmaptranslate_makespace(&res, &str, 4053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 4054 goto onError; 4055 for (cp = buffer; *cp; ++cp) 4056 *str++ = *cp; 4057 } 4058 p = collend; 4059 break; 4060 default: 4061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 4062 reason, startp, size, &exc, 4063 collstart-startp, collend-startp, &newpos); 4064 if (repunicode == NULL) 4065 goto onError; 4066 /* generate replacement */ 4067 repsize = PyUnicode_GET_SIZE(repunicode); 4068 if (charmaptranslate_makespace(&res, &str, 4069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 4070 Py_DECREF(repunicode); 4071 goto onError; 4072 } 4073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 4074 *str++ = *uni2; 4075 p = startp + newpos; 4076 Py_DECREF(repunicode); 4077 } 4078 } 4079 } 4080 /* Resize if we allocated to much */ 4081 respos = str-PyUnicode_AS_UNICODE(res); 4082 if (respos<PyUnicode_GET_SIZE(res)) { 4083 if (_PyUnicode_Resize(&res, respos) < 0) 4084 goto onError; 4085 } 4086 Py_XDECREF(exc); 4087 Py_XDECREF(errorHandler); 4088 return res; 4089 4090 onError: 4091 Py_XDECREF(res); 4092 Py_XDECREF(exc); 4093 Py_XDECREF(errorHandler); 4094 return NULL; 4095} 4096 4097PyObject *PyUnicode_Translate(PyObject *str, 4098 PyObject *mapping, 4099 const char *errors) 4100{ 4101 PyObject *result; 4102 4103 str = PyUnicode_FromObject(str); 4104 if (str == NULL) 4105 goto onError; 4106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 4107 PyUnicode_GET_SIZE(str), 4108 mapping, 4109 errors); 4110 Py_DECREF(str); 4111 return result; 4112 4113 onError: 4114 Py_XDECREF(str); 4115 return NULL; 4116} 4117 4118/* --- Decimal Encoder ---------------------------------------------------- */ 4119 4120int PyUnicode_EncodeDecimal(Py_UNICODE *s, 4121 Py_ssize_t length, 4122 char *output, 4123 const char *errors) 4124{ 4125 Py_UNICODE *p, *end; 4126 PyObject *errorHandler = NULL; 4127 PyObject *exc = NULL; 4128 const char *encoding = "decimal"; 4129 const char *reason = "invalid decimal Unicode string"; 4130 /* the following variable is used for caching string comparisons 4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4132 int known_errorHandler = -1; 4133 4134 if (output == NULL) { 4135 PyErr_BadArgument(); 4136 return -1; 4137 } 4138 4139 p = s; 4140 end = s + length; 4141 while (p < end) { 4142 register Py_UNICODE ch = *p; 4143 int decimal; 4144 PyObject *repunicode; 4145 Py_ssize_t repsize; 4146 Py_ssize_t newpos; 4147 Py_UNICODE *uni2; 4148 Py_UNICODE *collstart; 4149 Py_UNICODE *collend; 4150 4151 if (Py_UNICODE_ISSPACE(ch)) { 4152 *output++ = ' '; 4153 ++p; 4154 continue; 4155 } 4156 decimal = Py_UNICODE_TODECIMAL(ch); 4157 if (decimal >= 0) { 4158 *output++ = '0' + decimal; 4159 ++p; 4160 continue; 4161 } 4162 if (0 < ch && ch < 256) { 4163 *output++ = (char)ch; 4164 ++p; 4165 continue; 4166 } 4167 /* All other characters are considered unencodable */ 4168 collstart = p; 4169 collend = p+1; 4170 while (collend < end) { 4171 if ((0 < *collend && *collend < 256) || 4172 !Py_UNICODE_ISSPACE(*collend) || 4173 Py_UNICODE_TODECIMAL(*collend)) 4174 break; 4175 } 4176 /* cache callback name lookup 4177 * (if not done yet, i.e. it's the first error) */ 4178 if (known_errorHandler==-1) { 4179 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4180 known_errorHandler = 1; 4181 else if (!strcmp(errors, "replace")) 4182 known_errorHandler = 2; 4183 else if (!strcmp(errors, "ignore")) 4184 known_errorHandler = 3; 4185 else if (!strcmp(errors, "xmlcharrefreplace")) 4186 known_errorHandler = 4; 4187 else 4188 known_errorHandler = 0; 4189 } 4190 switch (known_errorHandler) { 4191 case 1: /* strict */ 4192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 4193 goto onError; 4194 case 2: /* replace */ 4195 for (p = collstart; p < collend; ++p) 4196 *output++ = '?'; 4197 /* fall through */ 4198 case 3: /* ignore */ 4199 p = collend; 4200 break; 4201 case 4: /* xmlcharrefreplace */ 4202 /* generate replacement (temporarily (mis)uses p) */ 4203 for (p = collstart; p < collend; ++p) 4204 output += sprintf(output, "&#%d;", (int)*p); 4205 p = collend; 4206 break; 4207 default: 4208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4209 encoding, reason, s, length, &exc, 4210 collstart-s, collend-s, &newpos); 4211 if (repunicode == NULL) 4212 goto onError; 4213 /* generate replacement */ 4214 repsize = PyUnicode_GET_SIZE(repunicode); 4215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4216 Py_UNICODE ch = *uni2; 4217 if (Py_UNICODE_ISSPACE(ch)) 4218 *output++ = ' '; 4219 else { 4220 decimal = Py_UNICODE_TODECIMAL(ch); 4221 if (decimal >= 0) 4222 *output++ = '0' + decimal; 4223 else if (0 < ch && ch < 256) 4224 *output++ = (char)ch; 4225 else { 4226 Py_DECREF(repunicode); 4227 raise_encode_exception(&exc, encoding, 4228 s, length, collstart-s, collend-s, reason); 4229 goto onError; 4230 } 4231 } 4232 } 4233 p = s + newpos; 4234 Py_DECREF(repunicode); 4235 } 4236 } 4237 /* 0-terminate the output string */ 4238 *output++ = '\0'; 4239 Py_XDECREF(exc); 4240 Py_XDECREF(errorHandler); 4241 return 0; 4242 4243 onError: 4244 Py_XDECREF(exc); 4245 Py_XDECREF(errorHandler); 4246 return -1; 4247} 4248 4249/* --- Helpers ------------------------------------------------------------ */ 4250 4251#define STRINGLIB_CHAR Py_UNICODE 4252 4253#define STRINGLIB_LEN PyUnicode_GET_SIZE 4254#define STRINGLIB_NEW PyUnicode_FromUnicode 4255#define STRINGLIB_STR PyUnicode_AS_UNICODE 4256 4257Py_LOCAL_INLINE(int) 4258STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) 4259{ 4260 if (str[0] != other[0]) 4261 return 1; 4262 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); 4263} 4264 4265#define STRINGLIB_EMPTY unicode_empty 4266 4267#include "stringlib/fastsearch.h" 4268 4269#include "stringlib/count.h" 4270#include "stringlib/find.h" 4271#include "stringlib/partition.h" 4272 4273/* helper macro to fixup start/end slice values */ 4274#define FIX_START_END(obj) \ 4275 if (start < 0) \ 4276 start += (obj)->length; \ 4277 if (start < 0) \ 4278 start = 0; \ 4279 if (end > (obj)->length) \ 4280 end = (obj)->length; \ 4281 if (end < 0) \ 4282 end += (obj)->length; \ 4283 if (end < 0) \ 4284 end = 0; 4285 4286Py_ssize_t PyUnicode_Count(PyObject *str, 4287 PyObject *substr, 4288 Py_ssize_t start, 4289 Py_ssize_t end) 4290{ 4291 Py_ssize_t result; 4292 PyUnicodeObject* str_obj; 4293 PyUnicodeObject* sub_obj; 4294 4295 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 4296 if (!str_obj) 4297 return -1; 4298 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 4299 if (!sub_obj) { 4300 Py_DECREF(str_obj); 4301 return -1; 4302 } 4303 4304 FIX_START_END(str_obj); 4305 4306 result = stringlib_count( 4307 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 4308 ); 4309 4310 Py_DECREF(sub_obj); 4311 Py_DECREF(str_obj); 4312 4313 return result; 4314} 4315 4316Py_ssize_t PyUnicode_Find(PyObject *str, 4317 PyObject *sub, 4318 Py_ssize_t start, 4319 Py_ssize_t end, 4320 int direction) 4321{ 4322 Py_ssize_t result; 4323 4324 str = PyUnicode_FromObject(str); 4325 if (!str) 4326 return -2; 4327 sub = PyUnicode_FromObject(sub); 4328 if (!sub) { 4329 Py_DECREF(str); 4330 return -2; 4331 } 4332 4333 if (direction > 0) 4334 result = stringlib_find_slice( 4335 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 4336 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 4337 start, end 4338 ); 4339 else 4340 result = stringlib_rfind_slice( 4341 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 4342 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 4343 start, end 4344 ); 4345 4346 Py_DECREF(str); 4347 Py_DECREF(sub); 4348 4349 return result; 4350} 4351 4352static 4353int tailmatch(PyUnicodeObject *self, 4354 PyUnicodeObject *substring, 4355 Py_ssize_t start, 4356 Py_ssize_t end, 4357 int direction) 4358{ 4359 if (substring->length == 0) 4360 return 1; 4361 4362 FIX_START_END(self); 4363 4364 end -= substring->length; 4365 if (end < start) 4366 return 0; 4367 4368 if (direction > 0) { 4369 if (Py_UNICODE_MATCH(self, end, substring)) 4370 return 1; 4371 } else { 4372 if (Py_UNICODE_MATCH(self, start, substring)) 4373 return 1; 4374 } 4375 4376 return 0; 4377} 4378 4379Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 4380 PyObject *substr, 4381 Py_ssize_t start, 4382 Py_ssize_t end, 4383 int direction) 4384{ 4385 Py_ssize_t result; 4386 4387 str = PyUnicode_FromObject(str); 4388 if (str == NULL) 4389 return -1; 4390 substr = PyUnicode_FromObject(substr); 4391 if (substr == NULL) { 4392 Py_DECREF(str); 4393 return -1; 4394 } 4395 4396 result = tailmatch((PyUnicodeObject *)str, 4397 (PyUnicodeObject *)substr, 4398 start, end, direction); 4399 Py_DECREF(str); 4400 Py_DECREF(substr); 4401 return result; 4402} 4403 4404/* Apply fixfct filter to the Unicode object self and return a 4405 reference to the modified object */ 4406 4407static 4408PyObject *fixup(PyUnicodeObject *self, 4409 int (*fixfct)(PyUnicodeObject *s)) 4410{ 4411 4412 PyUnicodeObject *u; 4413 4414 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 4415 if (u == NULL) 4416 return NULL; 4417 4418 Py_UNICODE_COPY(u->str, self->str, self->length); 4419 4420 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 4421 /* fixfct should return TRUE if it modified the buffer. If 4422 FALSE, return a reference to the original buffer instead 4423 (to save space, not time) */ 4424 Py_INCREF(self); 4425 Py_DECREF(u); 4426 return (PyObject*) self; 4427 } 4428 return (PyObject*) u; 4429} 4430 4431static 4432int fixupper(PyUnicodeObject *self) 4433{ 4434 Py_ssize_t len = self->length; 4435 Py_UNICODE *s = self->str; 4436 int status = 0; 4437 4438 while (len-- > 0) { 4439 register Py_UNICODE ch; 4440 4441 ch = Py_UNICODE_TOUPPER(*s); 4442 if (ch != *s) { 4443 status = 1; 4444 *s = ch; 4445 } 4446 s++; 4447 } 4448 4449 return status; 4450} 4451 4452static 4453int fixlower(PyUnicodeObject *self) 4454{ 4455 Py_ssize_t len = self->length; 4456 Py_UNICODE *s = self->str; 4457 int status = 0; 4458 4459 while (len-- > 0) { 4460 register Py_UNICODE ch; 4461 4462 ch = Py_UNICODE_TOLOWER(*s); 4463 if (ch != *s) { 4464 status = 1; 4465 *s = ch; 4466 } 4467 s++; 4468 } 4469 4470 return status; 4471} 4472 4473static 4474int fixswapcase(PyUnicodeObject *self) 4475{ 4476 Py_ssize_t len = self->length; 4477 Py_UNICODE *s = self->str; 4478 int status = 0; 4479 4480 while (len-- > 0) { 4481 if (Py_UNICODE_ISUPPER(*s)) { 4482 *s = Py_UNICODE_TOLOWER(*s); 4483 status = 1; 4484 } else if (Py_UNICODE_ISLOWER(*s)) { 4485 *s = Py_UNICODE_TOUPPER(*s); 4486 status = 1; 4487 } 4488 s++; 4489 } 4490 4491 return status; 4492} 4493 4494static 4495int fixcapitalize(PyUnicodeObject *self) 4496{ 4497 Py_ssize_t len = self->length; 4498 Py_UNICODE *s = self->str; 4499 int status = 0; 4500 4501 if (len == 0) 4502 return 0; 4503 if (Py_UNICODE_ISLOWER(*s)) { 4504 *s = Py_UNICODE_TOUPPER(*s); 4505 status = 1; 4506 } 4507 s++; 4508 while (--len > 0) { 4509 if (Py_UNICODE_ISUPPER(*s)) { 4510 *s = Py_UNICODE_TOLOWER(*s); 4511 status = 1; 4512 } 4513 s++; 4514 } 4515 return status; 4516} 4517 4518static 4519int fixtitle(PyUnicodeObject *self) 4520{ 4521 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4522 register Py_UNICODE *e; 4523 int previous_is_cased; 4524 4525 /* Shortcut for single character strings */ 4526 if (PyUnicode_GET_SIZE(self) == 1) { 4527 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 4528 if (*p != ch) { 4529 *p = ch; 4530 return 1; 4531 } 4532 else 4533 return 0; 4534 } 4535 4536 e = p + PyUnicode_GET_SIZE(self); 4537 previous_is_cased = 0; 4538 for (; p < e; p++) { 4539 register const Py_UNICODE ch = *p; 4540 4541 if (previous_is_cased) 4542 *p = Py_UNICODE_TOLOWER(ch); 4543 else 4544 *p = Py_UNICODE_TOTITLE(ch); 4545 4546 if (Py_UNICODE_ISLOWER(ch) || 4547 Py_UNICODE_ISUPPER(ch) || 4548 Py_UNICODE_ISTITLE(ch)) 4549 previous_is_cased = 1; 4550 else 4551 previous_is_cased = 0; 4552 } 4553 return 1; 4554} 4555 4556PyObject * 4557PyUnicode_Join(PyObject *separator, PyObject *seq) 4558{ 4559 PyObject *internal_separator = NULL; 4560 const Py_UNICODE blank = ' '; 4561 const Py_UNICODE *sep = ␣ 4562 Py_ssize_t seplen = 1; 4563 PyUnicodeObject *res = NULL; /* the result */ 4564 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 4565 Py_ssize_t res_used; /* # used bytes */ 4566 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 4567 PyObject *fseq; /* PySequence_Fast(seq) */ 4568 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 4569 PyObject *item; 4570 Py_ssize_t i; 4571 4572 fseq = PySequence_Fast(seq, ""); 4573 if (fseq == NULL) { 4574 return NULL; 4575 } 4576 4577 /* Grrrr. A codec may be invoked to convert str objects to 4578 * Unicode, and so it's possible to call back into Python code 4579 * during PyUnicode_FromObject(), and so it's possible for a sick 4580 * codec to change the size of fseq (if seq is a list). Therefore 4581 * we have to keep refetching the size -- can't assume seqlen 4582 * is invariant. 4583 */ 4584 seqlen = PySequence_Fast_GET_SIZE(fseq); 4585 /* If empty sequence, return u"". */ 4586 if (seqlen == 0) { 4587 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 4588 goto Done; 4589 } 4590 /* If singleton sequence with an exact Unicode, return that. */ 4591 if (seqlen == 1) { 4592 item = PySequence_Fast_GET_ITEM(fseq, 0); 4593 if (PyUnicode_CheckExact(item)) { 4594 Py_INCREF(item); 4595 res = (PyUnicodeObject *)item; 4596 goto Done; 4597 } 4598 } 4599 4600 /* At least two items to join, or one that isn't exact Unicode. */ 4601 if (seqlen > 1) { 4602 /* Set up sep and seplen -- they're needed. */ 4603 if (separator == NULL) { 4604 sep = ␣ 4605 seplen = 1; 4606 } 4607 else { 4608 internal_separator = PyUnicode_FromObject(separator); 4609 if (internal_separator == NULL) 4610 goto onError; 4611 sep = PyUnicode_AS_UNICODE(internal_separator); 4612 seplen = PyUnicode_GET_SIZE(internal_separator); 4613 /* In case PyUnicode_FromObject() mutated seq. */ 4614 seqlen = PySequence_Fast_GET_SIZE(fseq); 4615 } 4616 } 4617 4618 /* Get space. */ 4619 res = _PyUnicode_New(res_alloc); 4620 if (res == NULL) 4621 goto onError; 4622 res_p = PyUnicode_AS_UNICODE(res); 4623 res_used = 0; 4624 4625 for (i = 0; i < seqlen; ++i) { 4626 Py_ssize_t itemlen; 4627 Py_ssize_t new_res_used; 4628 4629 item = PySequence_Fast_GET_ITEM(fseq, i); 4630 /* Convert item to Unicode. */ 4631 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { 4632 PyErr_Format(PyExc_TypeError, 4633 "sequence item %zd: expected string or Unicode," 4634 " %.80s found", 4635 i, item->ob_type->tp_name); 4636 goto onError; 4637 } 4638 item = PyUnicode_FromObject(item); 4639 if (item == NULL) 4640 goto onError; 4641 /* We own a reference to item from here on. */ 4642 4643 /* In case PyUnicode_FromObject() mutated seq. */ 4644 seqlen = PySequence_Fast_GET_SIZE(fseq); 4645 4646 /* Make sure we have enough space for the separator and the item. */ 4647 itemlen = PyUnicode_GET_SIZE(item); 4648 new_res_used = res_used + itemlen; 4649 if (new_res_used < 0) 4650 goto Overflow; 4651 if (i < seqlen - 1) { 4652 new_res_used += seplen; 4653 if (new_res_used < 0) 4654 goto Overflow; 4655 } 4656 if (new_res_used > res_alloc) { 4657 /* double allocated size until it's big enough */ 4658 do { 4659 res_alloc += res_alloc; 4660 if (res_alloc <= 0) 4661 goto Overflow; 4662 } while (new_res_used > res_alloc); 4663 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 4664 Py_DECREF(item); 4665 goto onError; 4666 } 4667 res_p = PyUnicode_AS_UNICODE(res) + res_used; 4668 } 4669 4670 /* Copy item, and maybe the separator. */ 4671 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 4672 res_p += itemlen; 4673 if (i < seqlen - 1) { 4674 Py_UNICODE_COPY(res_p, sep, seplen); 4675 res_p += seplen; 4676 } 4677 Py_DECREF(item); 4678 res_used = new_res_used; 4679 } 4680 4681 /* Shrink res to match the used area; this probably can't fail, 4682 * but it's cheap to check. 4683 */ 4684 if (_PyUnicode_Resize(&res, res_used) < 0) 4685 goto onError; 4686 4687 Done: 4688 Py_XDECREF(internal_separator); 4689 Py_DECREF(fseq); 4690 return (PyObject *)res; 4691 4692 Overflow: 4693 PyErr_SetString(PyExc_OverflowError, 4694 "join() result is too long for a Python string"); 4695 Py_DECREF(item); 4696 /* fall through */ 4697 4698 onError: 4699 Py_XDECREF(internal_separator); 4700 Py_DECREF(fseq); 4701 Py_XDECREF(res); 4702 return NULL; 4703} 4704 4705static 4706PyUnicodeObject *pad(PyUnicodeObject *self, 4707 Py_ssize_t left, 4708 Py_ssize_t right, 4709 Py_UNICODE fill) 4710{ 4711 PyUnicodeObject *u; 4712 4713 if (left < 0) 4714 left = 0; 4715 if (right < 0) 4716 right = 0; 4717 4718 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 4719 Py_INCREF(self); 4720 return self; 4721 } 4722 4723 u = _PyUnicode_New(left + self->length + right); 4724 if (u) { 4725 if (left) 4726 Py_UNICODE_FILL(u->str, fill, left); 4727 Py_UNICODE_COPY(u->str + left, self->str, self->length); 4728 if (right) 4729 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 4730 } 4731 4732 return u; 4733} 4734 4735#define SPLIT_APPEND(data, left, right) \ 4736 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 4737 if (!str) \ 4738 goto onError; \ 4739 if (PyList_Append(list, str)) { \ 4740 Py_DECREF(str); \ 4741 goto onError; \ 4742 } \ 4743 else \ 4744 Py_DECREF(str); 4745 4746static 4747PyObject *split_whitespace(PyUnicodeObject *self, 4748 PyObject *list, 4749 Py_ssize_t maxcount) 4750{ 4751 register Py_ssize_t i; 4752 register Py_ssize_t j; 4753 Py_ssize_t len = self->length; 4754 PyObject *str; 4755 4756 for (i = j = 0; i < len; ) { 4757 /* find a token */ 4758 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4759 i++; 4760 j = i; 4761 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 4762 i++; 4763 if (j < i) { 4764 if (maxcount-- <= 0) 4765 break; 4766 SPLIT_APPEND(self->str, j, i); 4767 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4768 i++; 4769 j = i; 4770 } 4771 } 4772 if (j < len) { 4773 SPLIT_APPEND(self->str, j, len); 4774 } 4775 return list; 4776 4777 onError: 4778 Py_DECREF(list); 4779 return NULL; 4780} 4781 4782PyObject *PyUnicode_Splitlines(PyObject *string, 4783 int keepends) 4784{ 4785 register Py_ssize_t i; 4786 register Py_ssize_t j; 4787 Py_ssize_t len; 4788 PyObject *list; 4789 PyObject *str; 4790 Py_UNICODE *data; 4791 4792 string = PyUnicode_FromObject(string); 4793 if (string == NULL) 4794 return NULL; 4795 data = PyUnicode_AS_UNICODE(string); 4796 len = PyUnicode_GET_SIZE(string); 4797 4798 list = PyList_New(0); 4799 if (!list) 4800 goto onError; 4801 4802 for (i = j = 0; i < len; ) { 4803 Py_ssize_t eol; 4804 4805 /* Find a line and append it */ 4806 while (i < len && !BLOOM_LINEBREAK(data[i])) 4807 i++; 4808 4809 /* Skip the line break reading CRLF as one line break */ 4810 eol = i; 4811 if (i < len) { 4812 if (data[i] == '\r' && i + 1 < len && 4813 data[i+1] == '\n') 4814 i += 2; 4815 else 4816 i++; 4817 if (keepends) 4818 eol = i; 4819 } 4820 SPLIT_APPEND(data, j, eol); 4821 j = i; 4822 } 4823 if (j < len) { 4824 SPLIT_APPEND(data, j, len); 4825 } 4826 4827 Py_DECREF(string); 4828 return list; 4829 4830 onError: 4831 Py_XDECREF(list); 4832 Py_DECREF(string); 4833 return NULL; 4834} 4835 4836static 4837PyObject *split_char(PyUnicodeObject *self, 4838 PyObject *list, 4839 Py_UNICODE ch, 4840 Py_ssize_t maxcount) 4841{ 4842 register Py_ssize_t i; 4843 register Py_ssize_t j; 4844 Py_ssize_t len = self->length; 4845 PyObject *str; 4846 4847 for (i = j = 0; i < len; ) { 4848 if (self->str[i] == ch) { 4849 if (maxcount-- <= 0) 4850 break; 4851 SPLIT_APPEND(self->str, j, i); 4852 i = j = i + 1; 4853 } else 4854 i++; 4855 } 4856 if (j <= len) { 4857 SPLIT_APPEND(self->str, j, len); 4858 } 4859 return list; 4860 4861 onError: 4862 Py_DECREF(list); 4863 return NULL; 4864} 4865 4866static 4867PyObject *split_substring(PyUnicodeObject *self, 4868 PyObject *list, 4869 PyUnicodeObject *substring, 4870 Py_ssize_t maxcount) 4871{ 4872 register Py_ssize_t i; 4873 register Py_ssize_t j; 4874 Py_ssize_t len = self->length; 4875 Py_ssize_t sublen = substring->length; 4876 PyObject *str; 4877 4878 for (i = j = 0; i <= len - sublen; ) { 4879 if (Py_UNICODE_MATCH(self, i, substring)) { 4880 if (maxcount-- <= 0) 4881 break; 4882 SPLIT_APPEND(self->str, j, i); 4883 i = j = i + sublen; 4884 } else 4885 i++; 4886 } 4887 if (j <= len) { 4888 SPLIT_APPEND(self->str, j, len); 4889 } 4890 return list; 4891 4892 onError: 4893 Py_DECREF(list); 4894 return NULL; 4895} 4896 4897static 4898PyObject *rsplit_whitespace(PyUnicodeObject *self, 4899 PyObject *list, 4900 Py_ssize_t maxcount) 4901{ 4902 register Py_ssize_t i; 4903 register Py_ssize_t j; 4904 Py_ssize_t len = self->length; 4905 PyObject *str; 4906 4907 for (i = j = len - 1; i >= 0; ) { 4908 /* find a token */ 4909 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 4910 i--; 4911 j = i; 4912 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) 4913 i--; 4914 if (j > i) { 4915 if (maxcount-- <= 0) 4916 break; 4917 SPLIT_APPEND(self->str, i + 1, j + 1); 4918 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 4919 i--; 4920 j = i; 4921 } 4922 } 4923 if (j >= 0) { 4924 SPLIT_APPEND(self->str, 0, j + 1); 4925 } 4926 if (PyList_Reverse(list) < 0) 4927 goto onError; 4928 return list; 4929 4930 onError: 4931 Py_DECREF(list); 4932 return NULL; 4933} 4934 4935static 4936PyObject *rsplit_char(PyUnicodeObject *self, 4937 PyObject *list, 4938 Py_UNICODE ch, 4939 Py_ssize_t maxcount) 4940{ 4941 register Py_ssize_t i; 4942 register Py_ssize_t j; 4943 Py_ssize_t len = self->length; 4944 PyObject *str; 4945 4946 for (i = j = len - 1; i >= 0; ) { 4947 if (self->str[i] == ch) { 4948 if (maxcount-- <= 0) 4949 break; 4950 SPLIT_APPEND(self->str, i + 1, j + 1); 4951 j = i = i - 1; 4952 } else 4953 i--; 4954 } 4955 if (j >= -1) { 4956 SPLIT_APPEND(self->str, 0, j + 1); 4957 } 4958 if (PyList_Reverse(list) < 0) 4959 goto onError; 4960 return list; 4961 4962 onError: 4963 Py_DECREF(list); 4964 return NULL; 4965} 4966 4967static 4968PyObject *rsplit_substring(PyUnicodeObject *self, 4969 PyObject *list, 4970 PyUnicodeObject *substring, 4971 Py_ssize_t maxcount) 4972{ 4973 register Py_ssize_t i; 4974 register Py_ssize_t j; 4975 Py_ssize_t len = self->length; 4976 Py_ssize_t sublen = substring->length; 4977 PyObject *str; 4978 4979 for (i = len - sublen, j = len; i >= 0; ) { 4980 if (Py_UNICODE_MATCH(self, i, substring)) { 4981 if (maxcount-- <= 0) 4982 break; 4983 SPLIT_APPEND(self->str, i + sublen, j); 4984 j = i; 4985 i -= sublen; 4986 } else 4987 i--; 4988 } 4989 if (j >= 0) { 4990 SPLIT_APPEND(self->str, 0, j); 4991 } 4992 if (PyList_Reverse(list) < 0) 4993 goto onError; 4994 return list; 4995 4996 onError: 4997 Py_DECREF(list); 4998 return NULL; 4999} 5000 5001#undef SPLIT_APPEND 5002 5003static 5004PyObject *split(PyUnicodeObject *self, 5005 PyUnicodeObject *substring, 5006 Py_ssize_t maxcount) 5007{ 5008 PyObject *list; 5009 5010 if (maxcount < 0) 5011 maxcount = PY_SSIZE_T_MAX; 5012 5013 list = PyList_New(0); 5014 if (!list) 5015 return NULL; 5016 5017 if (substring == NULL) 5018 return split_whitespace(self,list,maxcount); 5019 5020 else if (substring->length == 1) 5021 return split_char(self,list,substring->str[0],maxcount); 5022 5023 else if (substring->length == 0) { 5024 Py_DECREF(list); 5025 PyErr_SetString(PyExc_ValueError, "empty separator"); 5026 return NULL; 5027 } 5028 else 5029 return split_substring(self,list,substring,maxcount); 5030} 5031 5032static 5033PyObject *rsplit(PyUnicodeObject *self, 5034 PyUnicodeObject *substring, 5035 Py_ssize_t maxcount) 5036{ 5037 PyObject *list; 5038 5039 if (maxcount < 0) 5040 maxcount = PY_SSIZE_T_MAX; 5041 5042 list = PyList_New(0); 5043 if (!list) 5044 return NULL; 5045 5046 if (substring == NULL) 5047 return rsplit_whitespace(self,list,maxcount); 5048 5049 else if (substring->length == 1) 5050 return rsplit_char(self,list,substring->str[0],maxcount); 5051 5052 else if (substring->length == 0) { 5053 Py_DECREF(list); 5054 PyErr_SetString(PyExc_ValueError, "empty separator"); 5055 return NULL; 5056 } 5057 else 5058 return rsplit_substring(self,list,substring,maxcount); 5059} 5060 5061static 5062PyObject *replace(PyUnicodeObject *self, 5063 PyUnicodeObject *str1, 5064 PyUnicodeObject *str2, 5065 Py_ssize_t maxcount) 5066{ 5067 PyUnicodeObject *u; 5068 5069 if (maxcount < 0) 5070 maxcount = PY_SSIZE_T_MAX; 5071 5072 if (str1->length == str2->length) { 5073 /* same length */ 5074 Py_ssize_t i; 5075 if (str1->length == 1) { 5076 /* replace characters */ 5077 Py_UNICODE u1, u2; 5078 if (!findchar(self->str, self->length, str1->str[0])) 5079 goto nothing; 5080 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5081 if (!u) 5082 return NULL; 5083 Py_UNICODE_COPY(u->str, self->str, self->length); 5084 u1 = str1->str[0]; 5085 u2 = str2->str[0]; 5086 for (i = 0; i < u->length; i++) 5087 if (u->str[i] == u1) { 5088 if (--maxcount < 0) 5089 break; 5090 u->str[i] = u2; 5091 } 5092 } else { 5093 i = fastsearch( 5094 self->str, self->length, str1->str, str1->length, FAST_SEARCH 5095 ); 5096 if (i < 0) 5097 goto nothing; 5098 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5099 if (!u) 5100 return NULL; 5101 Py_UNICODE_COPY(u->str, self->str, self->length); 5102 while (i <= self->length - str1->length) 5103 if (Py_UNICODE_MATCH(self, i, str1)) { 5104 if (--maxcount < 0) 5105 break; 5106 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5107 i += str1->length; 5108 } else 5109 i++; 5110 } 5111 } else { 5112 5113 Py_ssize_t n, i, j, e; 5114 Py_ssize_t product, new_size, delta; 5115 Py_UNICODE *p; 5116 5117 /* replace strings */ 5118 n = stringlib_count(self->str, self->length, str1->str, str1->length); 5119 if (n > maxcount) 5120 n = maxcount; 5121 if (n == 0) 5122 goto nothing; 5123 /* new_size = self->length + n * (str2->length - str1->length)); */ 5124 delta = (str2->length - str1->length); 5125 if (delta == 0) { 5126 new_size = self->length; 5127 } else { 5128 product = n * (str2->length - str1->length); 5129 if ((product / (str2->length - str1->length)) != n) { 5130 PyErr_SetString(PyExc_OverflowError, 5131 "replace string is too long"); 5132 return NULL; 5133 } 5134 new_size = self->length + product; 5135 if (new_size < 0) { 5136 PyErr_SetString(PyExc_OverflowError, 5137 "replace string is too long"); 5138 return NULL; 5139 } 5140 } 5141 u = _PyUnicode_New(new_size); 5142 if (!u) 5143 return NULL; 5144 i = 0; 5145 p = u->str; 5146 e = self->length - str1->length; 5147 if (str1->length > 0) { 5148 while (n-- > 0) { 5149 /* look for next match */ 5150 j = i; 5151 while (j <= e) { 5152 if (Py_UNICODE_MATCH(self, j, str1)) 5153 break; 5154 j++; 5155 } 5156 if (j > i) { 5157 if (j > e) 5158 break; 5159 /* copy unchanged part [i:j] */ 5160 Py_UNICODE_COPY(p, self->str+i, j-i); 5161 p += j - i; 5162 } 5163 /* copy substitution string */ 5164 if (str2->length > 0) { 5165 Py_UNICODE_COPY(p, str2->str, str2->length); 5166 p += str2->length; 5167 } 5168 i = j + str1->length; 5169 } 5170 if (i < self->length) 5171 /* copy tail [i:] */ 5172 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5173 } else { 5174 /* interleave */ 5175 while (n > 0) { 5176 Py_UNICODE_COPY(p, str2->str, str2->length); 5177 p += str2->length; 5178 if (--n <= 0) 5179 break; 5180 *p++ = self->str[i++]; 5181 } 5182 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5183 } 5184 } 5185 return (PyObject *) u; 5186 5187nothing: 5188 /* nothing to replace; return original string (when possible) */ 5189 if (PyUnicode_CheckExact(self)) { 5190 Py_INCREF(self); 5191 return (PyObject *) self; 5192 } 5193 return PyUnicode_FromUnicode(self->str, self->length); 5194} 5195 5196/* --- Unicode Object Methods --------------------------------------------- */ 5197 5198PyDoc_STRVAR(title__doc__, 5199"S.title() -> unicode\n\ 5200\n\ 5201Return a titlecased version of S, i.e. words start with title case\n\ 5202characters, all remaining cased characters have lower case."); 5203 5204static PyObject* 5205unicode_title(PyUnicodeObject *self) 5206{ 5207 return fixup(self, fixtitle); 5208} 5209 5210PyDoc_STRVAR(capitalize__doc__, 5211"S.capitalize() -> unicode\n\ 5212\n\ 5213Return a capitalized version of S, i.e. make the first character\n\ 5214have upper case."); 5215 5216static PyObject* 5217unicode_capitalize(PyUnicodeObject *self) 5218{ 5219 return fixup(self, fixcapitalize); 5220} 5221 5222#if 0 5223PyDoc_STRVAR(capwords__doc__, 5224"S.capwords() -> unicode\n\ 5225\n\ 5226Apply .capitalize() to all words in S and return the result with\n\ 5227normalized whitespace (all whitespace strings are replaced by ' ')."); 5228 5229static PyObject* 5230unicode_capwords(PyUnicodeObject *self) 5231{ 5232 PyObject *list; 5233 PyObject *item; 5234 Py_ssize_t i; 5235 5236 /* Split into words */ 5237 list = split(self, NULL, -1); 5238 if (!list) 5239 return NULL; 5240 5241 /* Capitalize each word */ 5242 for (i = 0; i < PyList_GET_SIZE(list); i++) { 5243 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 5244 fixcapitalize); 5245 if (item == NULL) 5246 goto onError; 5247 Py_DECREF(PyList_GET_ITEM(list, i)); 5248 PyList_SET_ITEM(list, i, item); 5249 } 5250 5251 /* Join the words to form a new string */ 5252 item = PyUnicode_Join(NULL, list); 5253 5254onError: 5255 Py_DECREF(list); 5256 return (PyObject *)item; 5257} 5258#endif 5259 5260/* Argument converter. Coerces to a single unicode character */ 5261 5262static int 5263convert_uc(PyObject *obj, void *addr) 5264{ 5265 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 5266 PyObject *uniobj; 5267 Py_UNICODE *unistr; 5268 5269 uniobj = PyUnicode_FromObject(obj); 5270 if (uniobj == NULL) { 5271 PyErr_SetString(PyExc_TypeError, 5272 "The fill character cannot be converted to Unicode"); 5273 return 0; 5274 } 5275 if (PyUnicode_GET_SIZE(uniobj) != 1) { 5276 PyErr_SetString(PyExc_TypeError, 5277 "The fill character must be exactly one character long"); 5278 Py_DECREF(uniobj); 5279 return 0; 5280 } 5281 unistr = PyUnicode_AS_UNICODE(uniobj); 5282 *fillcharloc = unistr[0]; 5283 Py_DECREF(uniobj); 5284 return 1; 5285} 5286 5287PyDoc_STRVAR(center__doc__, 5288"S.center(width[, fillchar]) -> unicode\n\ 5289\n\ 5290Return S centered in a Unicode string of length width. Padding is\n\ 5291done using the specified fill character (default is a space)"); 5292 5293static PyObject * 5294unicode_center(PyUnicodeObject *self, PyObject *args) 5295{ 5296 Py_ssize_t marg, left; 5297 Py_ssize_t width; 5298 Py_UNICODE fillchar = ' '; 5299 5300 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 5301 return NULL; 5302 5303 if (self->length >= width && PyUnicode_CheckExact(self)) { 5304 Py_INCREF(self); 5305 return (PyObject*) self; 5306 } 5307 5308 marg = width - self->length; 5309 left = marg / 2 + (marg & width & 1); 5310 5311 return (PyObject*) pad(self, left, marg - left, fillchar); 5312} 5313 5314#if 0 5315 5316/* This code should go into some future Unicode collation support 5317 module. The basic comparison should compare ordinals on a naive 5318 basis (this is what Java does and thus JPython too). */ 5319 5320/* speedy UTF-16 code point order comparison */ 5321/* gleaned from: */ 5322/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 5323 5324static short utf16Fixup[32] = 5325{ 5326 0, 0, 0, 0, 0, 0, 0, 0, 5327 0, 0, 0, 0, 0, 0, 0, 0, 5328 0, 0, 0, 0, 0, 0, 0, 0, 5329 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 5330}; 5331 5332static int 5333unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 5334{ 5335 Py_ssize_t len1, len2; 5336 5337 Py_UNICODE *s1 = str1->str; 5338 Py_UNICODE *s2 = str2->str; 5339 5340 len1 = str1->length; 5341 len2 = str2->length; 5342 5343 while (len1 > 0 && len2 > 0) { 5344 Py_UNICODE c1, c2; 5345 5346 c1 = *s1++; 5347 c2 = *s2++; 5348 5349 if (c1 > (1<<11) * 26) 5350 c1 += utf16Fixup[c1>>11]; 5351 if (c2 > (1<<11) * 26) 5352 c2 += utf16Fixup[c2>>11]; 5353 /* now c1 and c2 are in UTF-32-compatible order */ 5354 5355 if (c1 != c2) 5356 return (c1 < c2) ? -1 : 1; 5357 5358 len1--; len2--; 5359 } 5360 5361 return (len1 < len2) ? -1 : (len1 != len2); 5362} 5363 5364#else 5365 5366static int 5367unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 5368{ 5369 register Py_ssize_t len1, len2; 5370 5371 Py_UNICODE *s1 = str1->str; 5372 Py_UNICODE *s2 = str2->str; 5373 5374 len1 = str1->length; 5375 len2 = str2->length; 5376 5377 while (len1 > 0 && len2 > 0) { 5378 Py_UNICODE c1, c2; 5379 5380 c1 = *s1++; 5381 c2 = *s2++; 5382 5383 if (c1 != c2) 5384 return (c1 < c2) ? -1 : 1; 5385 5386 len1--; len2--; 5387 } 5388 5389 return (len1 < len2) ? -1 : (len1 != len2); 5390} 5391 5392#endif 5393 5394int PyUnicode_Compare(PyObject *left, 5395 PyObject *right) 5396{ 5397 PyUnicodeObject *u = NULL, *v = NULL; 5398 int result; 5399 5400 /* Coerce the two arguments */ 5401 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 5402 if (u == NULL) 5403 goto onError; 5404 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 5405 if (v == NULL) 5406 goto onError; 5407 5408 /* Shortcut for empty or interned objects */ 5409 if (v == u) { 5410 Py_DECREF(u); 5411 Py_DECREF(v); 5412 return 0; 5413 } 5414 5415 result = unicode_compare(u, v); 5416 5417 Py_DECREF(u); 5418 Py_DECREF(v); 5419 return result; 5420 5421onError: 5422 Py_XDECREF(u); 5423 Py_XDECREF(v); 5424 return -1; 5425} 5426 5427PyObject *PyUnicode_RichCompare(PyObject *left, 5428 PyObject *right, 5429 int op) 5430{ 5431 int result; 5432 5433 result = PyUnicode_Compare(left, right); 5434 if (result == -1 && PyErr_Occurred()) 5435 goto onError; 5436 5437 /* Convert the return value to a Boolean */ 5438 switch (op) { 5439 case Py_EQ: 5440 result = (result == 0); 5441 break; 5442 case Py_NE: 5443 result = (result != 0); 5444 break; 5445 case Py_LE: 5446 result = (result <= 0); 5447 break; 5448 case Py_GE: 5449 result = (result >= 0); 5450 break; 5451 case Py_LT: 5452 result = (result == -1); 5453 break; 5454 case Py_GT: 5455 result = (result == 1); 5456 break; 5457 } 5458 return PyBool_FromLong(result); 5459 5460 onError: 5461 5462 /* Standard case 5463 5464 Type errors mean that PyUnicode_FromObject() could not convert 5465 one of the arguments (usually the right hand side) to Unicode, 5466 ie. we can't handle the comparison request. However, it is 5467 possible that the other object knows a comparison method, which 5468 is why we return Py_NotImplemented to give the other object a 5469 chance. 5470 5471 */ 5472 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 5473 PyErr_Clear(); 5474 Py_INCREF(Py_NotImplemented); 5475 return Py_NotImplemented; 5476 } 5477 if (op != Py_EQ && op != Py_NE) 5478 return NULL; 5479 5480 /* Equality comparison. 5481 5482 This is a special case: we silence any PyExc_UnicodeDecodeError 5483 and instead turn it into a PyErr_UnicodeWarning. 5484 5485 */ 5486 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 5487 return NULL; 5488 PyErr_Clear(); 5489 if (PyErr_Warn(PyExc_UnicodeWarning, 5490 (op == Py_EQ) ? 5491 "Unicode equal comparison " 5492 "failed to convert both arguments to Unicode - " 5493 "interpreting them as being unequal" : 5494 "Unicode unequal comparison " 5495 "failed to convert both arguments to Unicode - " 5496 "interpreting them as being unequal" 5497 ) < 0) 5498 return NULL; 5499 result = (op == Py_NE); 5500 return PyBool_FromLong(result); 5501} 5502 5503int PyUnicode_Contains(PyObject *container, 5504 PyObject *element) 5505{ 5506 PyObject *str, *sub; 5507 int result; 5508 5509 /* Coerce the two arguments */ 5510 sub = PyUnicode_FromObject(element); 5511 if (!sub) { 5512 PyErr_SetString(PyExc_TypeError, 5513 "'in <string>' requires string as left operand"); 5514 return -1; 5515 } 5516 5517 str = PyUnicode_FromObject(container); 5518 if (!str) { 5519 Py_DECREF(sub); 5520 return -1; 5521 } 5522 5523 result = stringlib_contains_obj(str, sub); 5524 5525 Py_DECREF(str); 5526 Py_DECREF(sub); 5527 5528 return result; 5529} 5530 5531/* Concat to string or Unicode object giving a new Unicode object. */ 5532 5533PyObject *PyUnicode_Concat(PyObject *left, 5534 PyObject *right) 5535{ 5536 PyUnicodeObject *u = NULL, *v = NULL, *w; 5537 5538 if (PyBytes_Check(left) || PyBytes_Check(right)) 5539 return PyBytes_Concat(left, right); 5540 5541 /* Coerce the two arguments */ 5542 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 5543 if (u == NULL) 5544 goto onError; 5545 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 5546 if (v == NULL) 5547 goto onError; 5548 5549 /* Shortcuts */ 5550 if (v == unicode_empty) { 5551 Py_DECREF(v); 5552 return (PyObject *)u; 5553 } 5554 if (u == unicode_empty) { 5555 Py_DECREF(u); 5556 return (PyObject *)v; 5557 } 5558 5559 /* Concat the two Unicode strings */ 5560 w = _PyUnicode_New(u->length + v->length); 5561 if (w == NULL) 5562 goto onError; 5563 Py_UNICODE_COPY(w->str, u->str, u->length); 5564 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 5565 5566 Py_DECREF(u); 5567 Py_DECREF(v); 5568 return (PyObject *)w; 5569 5570onError: 5571 Py_XDECREF(u); 5572 Py_XDECREF(v); 5573 return NULL; 5574} 5575 5576PyDoc_STRVAR(count__doc__, 5577"S.count(sub[, start[, end]]) -> int\n\ 5578\n\ 5579Return the number of non-overlapping occurrences of substring sub in\n\ 5580Unicode string S[start:end]. Optional arguments start and end are\n\ 5581interpreted as in slice notation."); 5582 5583static PyObject * 5584unicode_count(PyUnicodeObject *self, PyObject *args) 5585{ 5586 PyUnicodeObject *substring; 5587 Py_ssize_t start = 0; 5588 Py_ssize_t end = PY_SSIZE_T_MAX; 5589 PyObject *result; 5590 5591 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 5592 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5593 return NULL; 5594 5595 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5596 (PyObject *)substring); 5597 if (substring == NULL) 5598 return NULL; 5599 5600 FIX_START_END(self); 5601 5602 result = PyInt_FromSsize_t( 5603 stringlib_count(self->str + start, end - start, 5604 substring->str, substring->length) 5605 ); 5606 5607 Py_DECREF(substring); 5608 5609 return result; 5610} 5611 5612PyDoc_STRVAR(encode__doc__, 5613"S.encode([encoding[,errors]]) -> string or unicode\n\ 5614\n\ 5615Encodes S using the codec registered for encoding. encoding defaults\n\ 5616to the default encoding. errors may be given to set a different error\n\ 5617handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 5618a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 5619'xmlcharrefreplace' as well as any other name registered with\n\ 5620codecs.register_error that can handle UnicodeEncodeErrors."); 5621 5622static PyObject * 5623unicode_encode(PyUnicodeObject *self, PyObject *args) 5624{ 5625 char *encoding = NULL; 5626 char *errors = NULL; 5627 PyObject *v; 5628 5629 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 5630 return NULL; 5631 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 5632 if (v == NULL) 5633 goto onError; 5634 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 5635 PyErr_Format(PyExc_TypeError, 5636 "encoder did not return a string/unicode object " 5637 "(type=%.400s)", 5638 v->ob_type->tp_name); 5639 Py_DECREF(v); 5640 return NULL; 5641 } 5642 return v; 5643 5644 onError: 5645 return NULL; 5646} 5647 5648PyDoc_STRVAR(decode__doc__, 5649"S.decode([encoding[,errors]]) -> string or unicode\n\ 5650\n\ 5651Decodes S using the codec registered for encoding. encoding defaults\n\ 5652to the default encoding. errors may be given to set a different error\n\ 5653handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 5654a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ 5655as well as any other name registerd with codecs.register_error that is\n\ 5656able to handle UnicodeDecodeErrors."); 5657 5658static PyObject * 5659unicode_decode(PyUnicodeObject *self, PyObject *args) 5660{ 5661 char *encoding = NULL; 5662 char *errors = NULL; 5663 PyObject *v; 5664 5665 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) 5666 return NULL; 5667 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); 5668 if (v == NULL) 5669 goto onError; 5670 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 5671 PyErr_Format(PyExc_TypeError, 5672 "decoder did not return a string/unicode object " 5673 "(type=%.400s)", 5674 v->ob_type->tp_name); 5675 Py_DECREF(v); 5676 return NULL; 5677 } 5678 return v; 5679 5680 onError: 5681 return NULL; 5682} 5683 5684PyDoc_STRVAR(expandtabs__doc__, 5685"S.expandtabs([tabsize]) -> unicode\n\ 5686\n\ 5687Return a copy of S where all tab characters are expanded using spaces.\n\ 5688If tabsize is not given, a tab size of 8 characters is assumed."); 5689 5690static PyObject* 5691unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 5692{ 5693 Py_UNICODE *e; 5694 Py_UNICODE *p; 5695 Py_UNICODE *q; 5696 Py_ssize_t i, j; 5697 PyUnicodeObject *u; 5698 int tabsize = 8; 5699 5700 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 5701 return NULL; 5702 5703 /* First pass: determine size of output string */ 5704 i = j = 0; 5705 e = self->str + self->length; 5706 for (p = self->str; p < e; p++) 5707 if (*p == '\t') { 5708 if (tabsize > 0) 5709 j += tabsize - (j % tabsize); 5710 } 5711 else { 5712 j++; 5713 if (*p == '\n' || *p == '\r') { 5714 i += j; 5715 j = 0; 5716 } 5717 } 5718 5719 /* Second pass: create output string and fill it */ 5720 u = _PyUnicode_New(i + j); 5721 if (!u) 5722 return NULL; 5723 5724 j = 0; 5725 q = u->str; 5726 5727 for (p = self->str; p < e; p++) 5728 if (*p == '\t') { 5729 if (tabsize > 0) { 5730 i = tabsize - (j % tabsize); 5731 j += i; 5732 while (i--) 5733 *q++ = ' '; 5734 } 5735 } 5736 else { 5737 j++; 5738 *q++ = *p; 5739 if (*p == '\n' || *p == '\r') 5740 j = 0; 5741 } 5742 5743 return (PyObject*) u; 5744} 5745 5746PyDoc_STRVAR(find__doc__, 5747"S.find(sub [,start [,end]]) -> int\n\ 5748\n\ 5749Return the lowest index in S where substring sub is found,\n\ 5750such that sub is contained within s[start,end]. Optional\n\ 5751arguments start and end are interpreted as in slice notation.\n\ 5752\n\ 5753Return -1 on failure."); 5754 5755static PyObject * 5756unicode_find(PyUnicodeObject *self, PyObject *args) 5757{ 5758 PyObject *substring; 5759 Py_ssize_t start = 0; 5760 Py_ssize_t end = PY_SSIZE_T_MAX; 5761 Py_ssize_t result; 5762 5763 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 5764 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5765 return NULL; 5766 substring = PyUnicode_FromObject(substring); 5767 if (!substring) 5768 return NULL; 5769 5770 result = stringlib_find_slice( 5771 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 5772 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 5773 start, end 5774 ); 5775 5776 Py_DECREF(substring); 5777 5778 return PyInt_FromSsize_t(result); 5779} 5780 5781static PyObject * 5782unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 5783{ 5784 if (index < 0 || index >= self->length) { 5785 PyErr_SetString(PyExc_IndexError, "string index out of range"); 5786 return NULL; 5787 } 5788 5789 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 5790} 5791 5792static long 5793unicode_hash(PyUnicodeObject *self) 5794{ 5795 /* Since Unicode objects compare equal to their ASCII string 5796 counterparts, they should use the individual character values 5797 as basis for their hash value. This is needed to assure that 5798 strings and Unicode objects behave in the same way as 5799 dictionary keys. */ 5800 5801 register Py_ssize_t len; 5802 register Py_UNICODE *p; 5803 register long x; 5804 5805 if (self->hash != -1) 5806 return self->hash; 5807 len = PyUnicode_GET_SIZE(self); 5808 p = PyUnicode_AS_UNICODE(self); 5809 x = *p << 7; 5810 while (--len >= 0) 5811 x = (1000003*x) ^ *p++; 5812 x ^= PyUnicode_GET_SIZE(self); 5813 if (x == -1) 5814 x = -2; 5815 self->hash = x; 5816 return x; 5817} 5818 5819PyDoc_STRVAR(index__doc__, 5820"S.index(sub [,start [,end]]) -> int\n\ 5821\n\ 5822Like S.find() but raise ValueError when the substring is not found."); 5823 5824static PyObject * 5825unicode_index(PyUnicodeObject *self, PyObject *args) 5826{ 5827 Py_ssize_t result; 5828 PyObject *substring; 5829 Py_ssize_t start = 0; 5830 Py_ssize_t end = PY_SSIZE_T_MAX; 5831 5832 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 5833 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5834 return NULL; 5835 substring = PyUnicode_FromObject(substring); 5836 if (!substring) 5837 return NULL; 5838 5839 result = stringlib_find_slice( 5840 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 5841 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 5842 start, end 5843 ); 5844 5845 Py_DECREF(substring); 5846 5847 if (result < 0) { 5848 PyErr_SetString(PyExc_ValueError, "substring not found"); 5849 return NULL; 5850 } 5851 5852 return PyInt_FromSsize_t(result); 5853} 5854 5855PyDoc_STRVAR(islower__doc__, 5856"S.islower() -> bool\n\ 5857\n\ 5858Return True if all cased characters in S are lowercase and there is\n\ 5859at least one cased character in S, False otherwise."); 5860 5861static PyObject* 5862unicode_islower(PyUnicodeObject *self) 5863{ 5864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5865 register const Py_UNICODE *e; 5866 int cased; 5867 5868 /* Shortcut for single character strings */ 5869 if (PyUnicode_GET_SIZE(self) == 1) 5870 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 5871 5872 /* Special case for empty strings */ 5873 if (PyUnicode_GET_SIZE(self) == 0) 5874 return PyBool_FromLong(0); 5875 5876 e = p + PyUnicode_GET_SIZE(self); 5877 cased = 0; 5878 for (; p < e; p++) { 5879 register const Py_UNICODE ch = *p; 5880 5881 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 5882 return PyBool_FromLong(0); 5883 else if (!cased && Py_UNICODE_ISLOWER(ch)) 5884 cased = 1; 5885 } 5886 return PyBool_FromLong(cased); 5887} 5888 5889PyDoc_STRVAR(isupper__doc__, 5890"S.isupper() -> bool\n\ 5891\n\ 5892Return True if all cased characters in S are uppercase and there is\n\ 5893at least one cased character in S, False otherwise."); 5894 5895static PyObject* 5896unicode_isupper(PyUnicodeObject *self) 5897{ 5898 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5899 register const Py_UNICODE *e; 5900 int cased; 5901 5902 /* Shortcut for single character strings */ 5903 if (PyUnicode_GET_SIZE(self) == 1) 5904 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 5905 5906 /* Special case for empty strings */ 5907 if (PyUnicode_GET_SIZE(self) == 0) 5908 return PyBool_FromLong(0); 5909 5910 e = p + PyUnicode_GET_SIZE(self); 5911 cased = 0; 5912 for (; p < e; p++) { 5913 register const Py_UNICODE ch = *p; 5914 5915 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 5916 return PyBool_FromLong(0); 5917 else if (!cased && Py_UNICODE_ISUPPER(ch)) 5918 cased = 1; 5919 } 5920 return PyBool_FromLong(cased); 5921} 5922 5923PyDoc_STRVAR(istitle__doc__, 5924"S.istitle() -> bool\n\ 5925\n\ 5926Return True if S is a titlecased string and there is at least one\n\ 5927character in S, i.e. upper- and titlecase characters may only\n\ 5928follow uncased characters and lowercase characters only cased ones.\n\ 5929Return False otherwise."); 5930 5931static PyObject* 5932unicode_istitle(PyUnicodeObject *self) 5933{ 5934 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5935 register const Py_UNICODE *e; 5936 int cased, previous_is_cased; 5937 5938 /* Shortcut for single character strings */ 5939 if (PyUnicode_GET_SIZE(self) == 1) 5940 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 5941 (Py_UNICODE_ISUPPER(*p) != 0)); 5942 5943 /* Special case for empty strings */ 5944 if (PyUnicode_GET_SIZE(self) == 0) 5945 return PyBool_FromLong(0); 5946 5947 e = p + PyUnicode_GET_SIZE(self); 5948 cased = 0; 5949 previous_is_cased = 0; 5950 for (; p < e; p++) { 5951 register const Py_UNICODE ch = *p; 5952 5953 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 5954 if (previous_is_cased) 5955 return PyBool_FromLong(0); 5956 previous_is_cased = 1; 5957 cased = 1; 5958 } 5959 else if (Py_UNICODE_ISLOWER(ch)) { 5960 if (!previous_is_cased) 5961 return PyBool_FromLong(0); 5962 previous_is_cased = 1; 5963 cased = 1; 5964 } 5965 else 5966 previous_is_cased = 0; 5967 } 5968 return PyBool_FromLong(cased); 5969} 5970 5971PyDoc_STRVAR(isspace__doc__, 5972"S.isspace() -> bool\n\ 5973\n\ 5974Return True if all characters in S are whitespace\n\ 5975and there is at least one character in S, False otherwise."); 5976 5977static PyObject* 5978unicode_isspace(PyUnicodeObject *self) 5979{ 5980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5981 register const Py_UNICODE *e; 5982 5983 /* Shortcut for single character strings */ 5984 if (PyUnicode_GET_SIZE(self) == 1 && 5985 Py_UNICODE_ISSPACE(*p)) 5986 return PyBool_FromLong(1); 5987 5988 /* Special case for empty strings */ 5989 if (PyUnicode_GET_SIZE(self) == 0) 5990 return PyBool_FromLong(0); 5991 5992 e = p + PyUnicode_GET_SIZE(self); 5993 for (; p < e; p++) { 5994 if (!Py_UNICODE_ISSPACE(*p)) 5995 return PyBool_FromLong(0); 5996 } 5997 return PyBool_FromLong(1); 5998} 5999 6000PyDoc_STRVAR(isalpha__doc__, 6001"S.isalpha() -> bool\n\ 6002\n\ 6003Return True if all characters in S are alphabetic\n\ 6004and there is at least one character in S, False otherwise."); 6005 6006static PyObject* 6007unicode_isalpha(PyUnicodeObject *self) 6008{ 6009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6010 register const Py_UNICODE *e; 6011 6012 /* Shortcut for single character strings */ 6013 if (PyUnicode_GET_SIZE(self) == 1 && 6014 Py_UNICODE_ISALPHA(*p)) 6015 return PyBool_FromLong(1); 6016 6017 /* Special case for empty strings */ 6018 if (PyUnicode_GET_SIZE(self) == 0) 6019 return PyBool_FromLong(0); 6020 6021 e = p + PyUnicode_GET_SIZE(self); 6022 for (; p < e; p++) { 6023 if (!Py_UNICODE_ISALPHA(*p)) 6024 return PyBool_FromLong(0); 6025 } 6026 return PyBool_FromLong(1); 6027} 6028 6029PyDoc_STRVAR(isalnum__doc__, 6030"S.isalnum() -> bool\n\ 6031\n\ 6032Return True if all characters in S are alphanumeric\n\ 6033and there is at least one character in S, False otherwise."); 6034 6035static PyObject* 6036unicode_isalnum(PyUnicodeObject *self) 6037{ 6038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6039 register const Py_UNICODE *e; 6040 6041 /* Shortcut for single character strings */ 6042 if (PyUnicode_GET_SIZE(self) == 1 && 6043 Py_UNICODE_ISALNUM(*p)) 6044 return PyBool_FromLong(1); 6045 6046 /* Special case for empty strings */ 6047 if (PyUnicode_GET_SIZE(self) == 0) 6048 return PyBool_FromLong(0); 6049 6050 e = p + PyUnicode_GET_SIZE(self); 6051 for (; p < e; p++) { 6052 if (!Py_UNICODE_ISALNUM(*p)) 6053 return PyBool_FromLong(0); 6054 } 6055 return PyBool_FromLong(1); 6056} 6057 6058PyDoc_STRVAR(isdecimal__doc__, 6059"S.isdecimal() -> bool\n\ 6060\n\ 6061Return True if there are only decimal characters in S,\n\ 6062False otherwise."); 6063 6064static PyObject* 6065unicode_isdecimal(PyUnicodeObject *self) 6066{ 6067 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6068 register const Py_UNICODE *e; 6069 6070 /* Shortcut for single character strings */ 6071 if (PyUnicode_GET_SIZE(self) == 1 && 6072 Py_UNICODE_ISDECIMAL(*p)) 6073 return PyBool_FromLong(1); 6074 6075 /* Special case for empty strings */ 6076 if (PyUnicode_GET_SIZE(self) == 0) 6077 return PyBool_FromLong(0); 6078 6079 e = p + PyUnicode_GET_SIZE(self); 6080 for (; p < e; p++) { 6081 if (!Py_UNICODE_ISDECIMAL(*p)) 6082 return PyBool_FromLong(0); 6083 } 6084 return PyBool_FromLong(1); 6085} 6086 6087PyDoc_STRVAR(isdigit__doc__, 6088"S.isdigit() -> bool\n\ 6089\n\ 6090Return True if all characters in S are digits\n\ 6091and there is at least one character in S, False otherwise."); 6092 6093static PyObject* 6094unicode_isdigit(PyUnicodeObject *self) 6095{ 6096 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6097 register const Py_UNICODE *e; 6098 6099 /* Shortcut for single character strings */ 6100 if (PyUnicode_GET_SIZE(self) == 1 && 6101 Py_UNICODE_ISDIGIT(*p)) 6102 return PyBool_FromLong(1); 6103 6104 /* Special case for empty strings */ 6105 if (PyUnicode_GET_SIZE(self) == 0) 6106 return PyBool_FromLong(0); 6107 6108 e = p + PyUnicode_GET_SIZE(self); 6109 for (; p < e; p++) { 6110 if (!Py_UNICODE_ISDIGIT(*p)) 6111 return PyBool_FromLong(0); 6112 } 6113 return PyBool_FromLong(1); 6114} 6115 6116PyDoc_STRVAR(isnumeric__doc__, 6117"S.isnumeric() -> bool\n\ 6118\n\ 6119Return True if there are only numeric characters in S,\n\ 6120False otherwise."); 6121 6122static PyObject* 6123unicode_isnumeric(PyUnicodeObject *self) 6124{ 6125 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6126 register const Py_UNICODE *e; 6127 6128 /* Shortcut for single character strings */ 6129 if (PyUnicode_GET_SIZE(self) == 1 && 6130 Py_UNICODE_ISNUMERIC(*p)) 6131 return PyBool_FromLong(1); 6132 6133 /* Special case for empty strings */ 6134 if (PyUnicode_GET_SIZE(self) == 0) 6135 return PyBool_FromLong(0); 6136 6137 e = p + PyUnicode_GET_SIZE(self); 6138 for (; p < e; p++) { 6139 if (!Py_UNICODE_ISNUMERIC(*p)) 6140 return PyBool_FromLong(0); 6141 } 6142 return PyBool_FromLong(1); 6143} 6144 6145PyDoc_STRVAR(join__doc__, 6146"S.join(sequence) -> unicode\n\ 6147\n\ 6148Return a string which is the concatenation of the strings in the\n\ 6149sequence. The separator between elements is S."); 6150 6151static PyObject* 6152unicode_join(PyObject *self, PyObject *data) 6153{ 6154 return PyUnicode_Join(self, data); 6155} 6156 6157static Py_ssize_t 6158unicode_length(PyUnicodeObject *self) 6159{ 6160 return self->length; 6161} 6162 6163PyDoc_STRVAR(ljust__doc__, 6164"S.ljust(width[, fillchar]) -> int\n\ 6165\n\ 6166Return S left justified in a Unicode string of length width. Padding is\n\ 6167done using the specified fill character (default is a space)."); 6168 6169static PyObject * 6170unicode_ljust(PyUnicodeObject *self, PyObject *args) 6171{ 6172 Py_ssize_t width; 6173 Py_UNICODE fillchar = ' '; 6174 6175 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 6176 return NULL; 6177 6178 if (self->length >= width && PyUnicode_CheckExact(self)) { 6179 Py_INCREF(self); 6180 return (PyObject*) self; 6181 } 6182 6183 return (PyObject*) pad(self, 0, width - self->length, fillchar); 6184} 6185 6186PyDoc_STRVAR(lower__doc__, 6187"S.lower() -> unicode\n\ 6188\n\ 6189Return a copy of the string S converted to lowercase."); 6190 6191static PyObject* 6192unicode_lower(PyUnicodeObject *self) 6193{ 6194 return fixup(self, fixlower); 6195} 6196 6197#define LEFTSTRIP 0 6198#define RIGHTSTRIP 1 6199#define BOTHSTRIP 2 6200 6201/* Arrays indexed by above */ 6202static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 6203 6204#define STRIPNAME(i) (stripformat[i]+3) 6205 6206/* externally visible for str.strip(unicode) */ 6207PyObject * 6208_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 6209{ 6210 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 6211 Py_ssize_t len = PyUnicode_GET_SIZE(self); 6212 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 6213 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 6214 Py_ssize_t i, j; 6215 6216 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 6217 6218 i = 0; 6219 if (striptype != RIGHTSTRIP) { 6220 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 6221 i++; 6222 } 6223 } 6224 6225 j = len; 6226 if (striptype != LEFTSTRIP) { 6227 do { 6228 j--; 6229 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 6230 j++; 6231 } 6232 6233 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 6234 Py_INCREF(self); 6235 return (PyObject*)self; 6236 } 6237 else 6238 return PyUnicode_FromUnicode(s+i, j-i); 6239} 6240 6241 6242static PyObject * 6243do_strip(PyUnicodeObject *self, int striptype) 6244{ 6245 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 6246 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 6247 6248 i = 0; 6249 if (striptype != RIGHTSTRIP) { 6250 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 6251 i++; 6252 } 6253 } 6254 6255 j = len; 6256 if (striptype != LEFTSTRIP) { 6257 do { 6258 j--; 6259 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 6260 j++; 6261 } 6262 6263 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 6264 Py_INCREF(self); 6265 return (PyObject*)self; 6266 } 6267 else 6268 return PyUnicode_FromUnicode(s+i, j-i); 6269} 6270 6271 6272static PyObject * 6273do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 6274{ 6275 PyObject *sep = NULL; 6276 6277 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 6278 return NULL; 6279 6280 if (sep != NULL && sep != Py_None) { 6281 if (PyUnicode_Check(sep)) 6282 return _PyUnicode_XStrip(self, striptype, sep); 6283 else if (PyString_Check(sep)) { 6284 PyObject *res; 6285 sep = PyUnicode_FromObject(sep); 6286 if (sep==NULL) 6287 return NULL; 6288 res = _PyUnicode_XStrip(self, striptype, sep); 6289 Py_DECREF(sep); 6290 return res; 6291 } 6292 else { 6293 PyErr_Format(PyExc_TypeError, 6294 "%s arg must be None, unicode or str", 6295 STRIPNAME(striptype)); 6296 return NULL; 6297 } 6298 } 6299 6300 return do_strip(self, striptype); 6301} 6302 6303 6304PyDoc_STRVAR(strip__doc__, 6305"S.strip([chars]) -> unicode\n\ 6306\n\ 6307Return a copy of the string S with leading and trailing\n\ 6308whitespace removed.\n\ 6309If chars is given and not None, remove characters in chars instead.\n\ 6310If chars is a str, it will be converted to unicode before stripping"); 6311 6312static PyObject * 6313unicode_strip(PyUnicodeObject *self, PyObject *args) 6314{ 6315 if (PyTuple_GET_SIZE(args) == 0) 6316 return do_strip(self, BOTHSTRIP); /* Common case */ 6317 else 6318 return do_argstrip(self, BOTHSTRIP, args); 6319} 6320 6321 6322PyDoc_STRVAR(lstrip__doc__, 6323"S.lstrip([chars]) -> unicode\n\ 6324\n\ 6325Return a copy of the string S with leading whitespace removed.\n\ 6326If chars is given and not None, remove characters in chars instead.\n\ 6327If chars is a str, it will be converted to unicode before stripping"); 6328 6329static PyObject * 6330unicode_lstrip(PyUnicodeObject *self, PyObject *args) 6331{ 6332 if (PyTuple_GET_SIZE(args) == 0) 6333 return do_strip(self, LEFTSTRIP); /* Common case */ 6334 else 6335 return do_argstrip(self, LEFTSTRIP, args); 6336} 6337 6338 6339PyDoc_STRVAR(rstrip__doc__, 6340"S.rstrip([chars]) -> unicode\n\ 6341\n\ 6342Return a copy of the string S with trailing whitespace removed.\n\ 6343If chars is given and not None, remove characters in chars instead.\n\ 6344If chars is a str, it will be converted to unicode before stripping"); 6345 6346static PyObject * 6347unicode_rstrip(PyUnicodeObject *self, PyObject *args) 6348{ 6349 if (PyTuple_GET_SIZE(args) == 0) 6350 return do_strip(self, RIGHTSTRIP); /* Common case */ 6351 else 6352 return do_argstrip(self, RIGHTSTRIP, args); 6353} 6354 6355 6356static PyObject* 6357unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 6358{ 6359 PyUnicodeObject *u; 6360 Py_UNICODE *p; 6361 Py_ssize_t nchars; 6362 size_t nbytes; 6363 6364 if (len < 0) 6365 len = 0; 6366 6367 if (len == 1 && PyUnicode_CheckExact(str)) { 6368 /* no repeat, return original string */ 6369 Py_INCREF(str); 6370 return (PyObject*) str; 6371 } 6372 6373 /* ensure # of chars needed doesn't overflow int and # of bytes 6374 * needed doesn't overflow size_t 6375 */ 6376 nchars = len * str->length; 6377 if (len && nchars / len != str->length) { 6378 PyErr_SetString(PyExc_OverflowError, 6379 "repeated string is too long"); 6380 return NULL; 6381 } 6382 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 6383 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 6384 PyErr_SetString(PyExc_OverflowError, 6385 "repeated string is too long"); 6386 return NULL; 6387 } 6388 u = _PyUnicode_New(nchars); 6389 if (!u) 6390 return NULL; 6391 6392 p = u->str; 6393 6394 if (str->length == 1 && len > 0) { 6395 Py_UNICODE_FILL(p, str->str[0], len); 6396 } else { 6397 Py_ssize_t done = 0; /* number of characters copied this far */ 6398 if (done < nchars) { 6399 Py_UNICODE_COPY(p, str->str, str->length); 6400 done = str->length; 6401 } 6402 while (done < nchars) { 6403 int n = (done <= nchars-done) ? done : nchars-done; 6404 Py_UNICODE_COPY(p+done, p, n); 6405 done += n; 6406 } 6407 } 6408 6409 return (PyObject*) u; 6410} 6411 6412PyObject *PyUnicode_Replace(PyObject *obj, 6413 PyObject *subobj, 6414 PyObject *replobj, 6415 Py_ssize_t maxcount) 6416{ 6417 PyObject *self; 6418 PyObject *str1; 6419 PyObject *str2; 6420 PyObject *result; 6421 6422 self = PyUnicode_FromObject(obj); 6423 if (self == NULL) 6424 return NULL; 6425 str1 = PyUnicode_FromObject(subobj); 6426 if (str1 == NULL) { 6427 Py_DECREF(self); 6428 return NULL; 6429 } 6430 str2 = PyUnicode_FromObject(replobj); 6431 if (str2 == NULL) { 6432 Py_DECREF(self); 6433 Py_DECREF(str1); 6434 return NULL; 6435 } 6436 result = replace((PyUnicodeObject *)self, 6437 (PyUnicodeObject *)str1, 6438 (PyUnicodeObject *)str2, 6439 maxcount); 6440 Py_DECREF(self); 6441 Py_DECREF(str1); 6442 Py_DECREF(str2); 6443 return result; 6444} 6445 6446PyDoc_STRVAR(replace__doc__, 6447"S.replace (old, new[, maxsplit]) -> unicode\n\ 6448\n\ 6449Return a copy of S with all occurrences of substring\n\ 6450old replaced by new. If the optional argument maxsplit is\n\ 6451given, only the first maxsplit occurrences are replaced."); 6452 6453static PyObject* 6454unicode_replace(PyUnicodeObject *self, PyObject *args) 6455{ 6456 PyUnicodeObject *str1; 6457 PyUnicodeObject *str2; 6458 Py_ssize_t maxcount = -1; 6459 PyObject *result; 6460 6461 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 6462 return NULL; 6463 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 6464 if (str1 == NULL) 6465 return NULL; 6466 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 6467 if (str2 == NULL) { 6468 Py_DECREF(str1); 6469 return NULL; 6470 } 6471 6472 result = replace(self, str1, str2, maxcount); 6473 6474 Py_DECREF(str1); 6475 Py_DECREF(str2); 6476 return result; 6477} 6478 6479static 6480PyObject *unicode_repr(PyObject *unicode) 6481{ 6482 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 6483 PyUnicode_GET_SIZE(unicode), 6484 1); 6485} 6486 6487PyDoc_STRVAR(rfind__doc__, 6488"S.rfind(sub [,start [,end]]) -> int\n\ 6489\n\ 6490Return the highest index in S where substring sub is found,\n\ 6491such that sub is contained within s[start,end]. Optional\n\ 6492arguments start and end are interpreted as in slice notation.\n\ 6493\n\ 6494Return -1 on failure."); 6495 6496static PyObject * 6497unicode_rfind(PyUnicodeObject *self, PyObject *args) 6498{ 6499 PyObject *substring; 6500 Py_ssize_t start = 0; 6501 Py_ssize_t end = PY_SSIZE_T_MAX; 6502 Py_ssize_t result; 6503 6504 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 6505 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6506 return NULL; 6507 substring = PyUnicode_FromObject(substring); 6508 if (!substring) 6509 return NULL; 6510 6511 result = stringlib_rfind_slice( 6512 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6513 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6514 start, end 6515 ); 6516 6517 Py_DECREF(substring); 6518 6519 return PyInt_FromSsize_t(result); 6520} 6521 6522PyDoc_STRVAR(rindex__doc__, 6523"S.rindex(sub [,start [,end]]) -> int\n\ 6524\n\ 6525Like S.rfind() but raise ValueError when the substring is not found."); 6526 6527static PyObject * 6528unicode_rindex(PyUnicodeObject *self, PyObject *args) 6529{ 6530 PyObject *substring; 6531 Py_ssize_t start = 0; 6532 Py_ssize_t end = PY_SSIZE_T_MAX; 6533 Py_ssize_t result; 6534 6535 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 6536 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6537 return NULL; 6538 substring = PyUnicode_FromObject(substring); 6539 if (!substring) 6540 return NULL; 6541 6542 result = stringlib_rfind_slice( 6543 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6544 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6545 start, end 6546 ); 6547 6548 Py_DECREF(substring); 6549 6550 if (result < 0) { 6551 PyErr_SetString(PyExc_ValueError, "substring not found"); 6552 return NULL; 6553 } 6554 return PyInt_FromSsize_t(result); 6555} 6556 6557PyDoc_STRVAR(rjust__doc__, 6558"S.rjust(width[, fillchar]) -> unicode\n\ 6559\n\ 6560Return S right justified in a Unicode string of length width. Padding is\n\ 6561done using the specified fill character (default is a space)."); 6562 6563static PyObject * 6564unicode_rjust(PyUnicodeObject *self, PyObject *args) 6565{ 6566 Py_ssize_t width; 6567 Py_UNICODE fillchar = ' '; 6568 6569 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 6570 return NULL; 6571 6572 if (self->length >= width && PyUnicode_CheckExact(self)) { 6573 Py_INCREF(self); 6574 return (PyObject*) self; 6575 } 6576 6577 return (PyObject*) pad(self, width - self->length, 0, fillchar); 6578} 6579 6580static PyObject* 6581unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) 6582{ 6583 /* standard clamping */ 6584 if (start < 0) 6585 start = 0; 6586 if (end < 0) 6587 end = 0; 6588 if (end > self->length) 6589 end = self->length; 6590 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 6591 /* full slice, return original string */ 6592 Py_INCREF(self); 6593 return (PyObject*) self; 6594 } 6595 if (start > end) 6596 start = end; 6597 /* copy slice */ 6598 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 6599 end - start); 6600} 6601 6602PyObject *PyUnicode_Split(PyObject *s, 6603 PyObject *sep, 6604 Py_ssize_t maxsplit) 6605{ 6606 PyObject *result; 6607 6608 s = PyUnicode_FromObject(s); 6609 if (s == NULL) 6610 return NULL; 6611 if (sep != NULL) { 6612 sep = PyUnicode_FromObject(sep); 6613 if (sep == NULL) { 6614 Py_DECREF(s); 6615 return NULL; 6616 } 6617 } 6618 6619 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 6620 6621 Py_DECREF(s); 6622 Py_XDECREF(sep); 6623 return result; 6624} 6625 6626PyDoc_STRVAR(split__doc__, 6627"S.split([sep [,maxsplit]]) -> list of strings\n\ 6628\n\ 6629Return a list of the words in S, using sep as the\n\ 6630delimiter string. If maxsplit is given, at most maxsplit\n\ 6631splits are done. If sep is not specified or is None,\n\ 6632any whitespace string is a separator."); 6633 6634static PyObject* 6635unicode_split(PyUnicodeObject *self, PyObject *args) 6636{ 6637 PyObject *substring = Py_None; 6638 Py_ssize_t maxcount = -1; 6639 6640 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 6641 return NULL; 6642 6643 if (substring == Py_None) 6644 return split(self, NULL, maxcount); 6645 else if (PyUnicode_Check(substring)) 6646 return split(self, (PyUnicodeObject *)substring, maxcount); 6647 else 6648 return PyUnicode_Split((PyObject *)self, substring, maxcount); 6649} 6650 6651PyObject * 6652PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 6653{ 6654 PyObject* str_obj; 6655 PyObject* sep_obj; 6656 PyObject* out; 6657 6658 str_obj = PyUnicode_FromObject(str_in); 6659 if (!str_obj) 6660 return NULL; 6661 sep_obj = PyUnicode_FromObject(sep_in); 6662 if (!sep_obj) { 6663 Py_DECREF(str_obj); 6664 return NULL; 6665 } 6666 6667 out = stringlib_partition( 6668 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 6669 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 6670 ); 6671 6672 Py_DECREF(sep_obj); 6673 Py_DECREF(str_obj); 6674 6675 return out; 6676} 6677 6678 6679PyObject * 6680PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 6681{ 6682 PyObject* str_obj; 6683 PyObject* sep_obj; 6684 PyObject* out; 6685 6686 str_obj = PyUnicode_FromObject(str_in); 6687 if (!str_obj) 6688 return NULL; 6689 sep_obj = PyUnicode_FromObject(sep_in); 6690 if (!sep_obj) { 6691 Py_DECREF(str_obj); 6692 return NULL; 6693 } 6694 6695 out = stringlib_rpartition( 6696 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 6697 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 6698 ); 6699 6700 Py_DECREF(sep_obj); 6701 Py_DECREF(str_obj); 6702 6703 return out; 6704} 6705 6706PyDoc_STRVAR(partition__doc__, 6707"S.partition(sep) -> (head, sep, tail)\n\ 6708\n\ 6709Searches for the separator sep in S, and returns the part before it,\n\ 6710the separator itself, and the part after it. If the separator is not\n\ 6711found, returns S and two empty strings."); 6712 6713static PyObject* 6714unicode_partition(PyUnicodeObject *self, PyObject *separator) 6715{ 6716 return PyUnicode_Partition((PyObject *)self, separator); 6717} 6718 6719PyDoc_STRVAR(rpartition__doc__, 6720"S.rpartition(sep) -> (tail, sep, head)\n\ 6721\n\ 6722Searches for the separator sep in S, starting at the end of S, and returns\n\ 6723the part before it, the separator itself, and the part after it. If the\n\ 6724separator is not found, returns two empty strings and S."); 6725 6726static PyObject* 6727unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 6728{ 6729 return PyUnicode_RPartition((PyObject *)self, separator); 6730} 6731 6732PyObject *PyUnicode_RSplit(PyObject *s, 6733 PyObject *sep, 6734 Py_ssize_t maxsplit) 6735{ 6736 PyObject *result; 6737 6738 s = PyUnicode_FromObject(s); 6739 if (s == NULL) 6740 return NULL; 6741 if (sep != NULL) { 6742 sep = PyUnicode_FromObject(sep); 6743 if (sep == NULL) { 6744 Py_DECREF(s); 6745 return NULL; 6746 } 6747 } 6748 6749 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 6750 6751 Py_DECREF(s); 6752 Py_XDECREF(sep); 6753 return result; 6754} 6755 6756PyDoc_STRVAR(rsplit__doc__, 6757"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 6758\n\ 6759Return a list of the words in S, using sep as the\n\ 6760delimiter string, starting at the end of the string and\n\ 6761working to the front. If maxsplit is given, at most maxsplit\n\ 6762splits are done. If sep is not specified, any whitespace string\n\ 6763is a separator."); 6764 6765static PyObject* 6766unicode_rsplit(PyUnicodeObject *self, PyObject *args) 6767{ 6768 PyObject *substring = Py_None; 6769 Py_ssize_t maxcount = -1; 6770 6771 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 6772 return NULL; 6773 6774 if (substring == Py_None) 6775 return rsplit(self, NULL, maxcount); 6776 else if (PyUnicode_Check(substring)) 6777 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 6778 else 6779 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 6780} 6781 6782PyDoc_STRVAR(splitlines__doc__, 6783"S.splitlines([keepends]]) -> list of strings\n\ 6784\n\ 6785Return a list of the lines in S, breaking at line boundaries.\n\ 6786Line breaks are not included in the resulting list unless keepends\n\ 6787is given and true."); 6788 6789static PyObject* 6790unicode_splitlines(PyUnicodeObject *self, PyObject *args) 6791{ 6792 int keepends = 0; 6793 6794 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 6795 return NULL; 6796 6797 return PyUnicode_Splitlines((PyObject *)self, keepends); 6798} 6799 6800static 6801PyObject *unicode_str(PyUnicodeObject *self) 6802{ 6803 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 6804} 6805 6806PyDoc_STRVAR(swapcase__doc__, 6807"S.swapcase() -> unicode\n\ 6808\n\ 6809Return a copy of S with uppercase characters converted to lowercase\n\ 6810and vice versa."); 6811 6812static PyObject* 6813unicode_swapcase(PyUnicodeObject *self) 6814{ 6815 return fixup(self, fixswapcase); 6816} 6817 6818PyDoc_STRVAR(translate__doc__, 6819"S.translate(table) -> unicode\n\ 6820\n\ 6821Return a copy of the string S, where all characters have been mapped\n\ 6822through the given translation table, which must be a mapping of\n\ 6823Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 6824Unmapped characters are left untouched. Characters mapped to None\n\ 6825are deleted."); 6826 6827static PyObject* 6828unicode_translate(PyUnicodeObject *self, PyObject *table) 6829{ 6830 return PyUnicode_TranslateCharmap(self->str, 6831 self->length, 6832 table, 6833 "ignore"); 6834} 6835 6836PyDoc_STRVAR(upper__doc__, 6837"S.upper() -> unicode\n\ 6838\n\ 6839Return a copy of S converted to uppercase."); 6840 6841static PyObject* 6842unicode_upper(PyUnicodeObject *self) 6843{ 6844 return fixup(self, fixupper); 6845} 6846 6847PyDoc_STRVAR(zfill__doc__, 6848"S.zfill(width) -> unicode\n\ 6849\n\ 6850Pad a numeric string x with zeros on the left, to fill a field\n\ 6851of the specified width. The string x is never truncated."); 6852 6853static PyObject * 6854unicode_zfill(PyUnicodeObject *self, PyObject *args) 6855{ 6856 Py_ssize_t fill; 6857 PyUnicodeObject *u; 6858 6859 Py_ssize_t width; 6860 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 6861 return NULL; 6862 6863 if (self->length >= width) { 6864 if (PyUnicode_CheckExact(self)) { 6865 Py_INCREF(self); 6866 return (PyObject*) self; 6867 } 6868 else 6869 return PyUnicode_FromUnicode( 6870 PyUnicode_AS_UNICODE(self), 6871 PyUnicode_GET_SIZE(self) 6872 ); 6873 } 6874 6875 fill = width - self->length; 6876 6877 u = pad(self, fill, 0, '0'); 6878 6879 if (u == NULL) 6880 return NULL; 6881 6882 if (u->str[fill] == '+' || u->str[fill] == '-') { 6883 /* move sign to beginning of string */ 6884 u->str[0] = u->str[fill]; 6885 u->str[fill] = '0'; 6886 } 6887 6888 return (PyObject*) u; 6889} 6890 6891#if 0 6892static PyObject* 6893unicode_freelistsize(PyUnicodeObject *self) 6894{ 6895 return PyInt_FromLong(unicode_freelist_size); 6896} 6897#endif 6898 6899PyDoc_STRVAR(startswith__doc__, 6900"S.startswith(prefix[, start[, end]]) -> bool\n\ 6901\n\ 6902Return True if S starts with the specified prefix, False otherwise.\n\ 6903With optional start, test S beginning at that position.\n\ 6904With optional end, stop comparing S at that position.\n\ 6905prefix can also be a tuple of strings to try."); 6906 6907static PyObject * 6908unicode_startswith(PyUnicodeObject *self, 6909 PyObject *args) 6910{ 6911 PyObject *subobj; 6912 PyUnicodeObject *substring; 6913 Py_ssize_t start = 0; 6914 Py_ssize_t end = PY_SSIZE_T_MAX; 6915 int result; 6916 6917 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 6918 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6919 return NULL; 6920 if (PyTuple_Check(subobj)) { 6921 Py_ssize_t i; 6922 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 6923 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6924 PyTuple_GET_ITEM(subobj, i)); 6925 if (substring == NULL) 6926 return NULL; 6927 result = tailmatch(self, substring, start, end, -1); 6928 Py_DECREF(substring); 6929 if (result) { 6930 Py_RETURN_TRUE; 6931 } 6932 } 6933 /* nothing matched */ 6934 Py_RETURN_FALSE; 6935 } 6936 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 6937 if (substring == NULL) 6938 return NULL; 6939 result = tailmatch(self, substring, start, end, -1); 6940 Py_DECREF(substring); 6941 return PyBool_FromLong(result); 6942} 6943 6944 6945PyDoc_STRVAR(endswith__doc__, 6946"S.endswith(suffix[, start[, end]]) -> bool\n\ 6947\n\ 6948Return True if S ends with the specified suffix, False otherwise.\n\ 6949With optional start, test S beginning at that position.\n\ 6950With optional end, stop comparing S at that position.\n\ 6951suffix can also be a tuple of strings to try."); 6952 6953static PyObject * 6954unicode_endswith(PyUnicodeObject *self, 6955 PyObject *args) 6956{ 6957 PyObject *subobj; 6958 PyUnicodeObject *substring; 6959 Py_ssize_t start = 0; 6960 Py_ssize_t end = PY_SSIZE_T_MAX; 6961 int result; 6962 6963 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 6964 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6965 return NULL; 6966 if (PyTuple_Check(subobj)) { 6967 Py_ssize_t i; 6968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 6969 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6970 PyTuple_GET_ITEM(subobj, i)); 6971 if (substring == NULL) 6972 return NULL; 6973 result = tailmatch(self, substring, start, end, +1); 6974 Py_DECREF(substring); 6975 if (result) { 6976 Py_RETURN_TRUE; 6977 } 6978 } 6979 Py_RETURN_FALSE; 6980 } 6981 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 6982 if (substring == NULL) 6983 return NULL; 6984 6985 result = tailmatch(self, substring, start, end, +1); 6986 Py_DECREF(substring); 6987 return PyBool_FromLong(result); 6988} 6989 6990 6991 6992static PyObject * 6993unicode_getnewargs(PyUnicodeObject *v) 6994{ 6995 return Py_BuildValue("(u#)", v->str, v->length); 6996} 6997 6998 6999static PyMethodDef unicode_methods[] = { 7000 7001 /* Order is according to common usage: often used methods should 7002 appear first, since lookup is done sequentially. */ 7003 7004 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 7005 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7006 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 7007 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 7008 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 7009 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 7010 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 7011 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 7012 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 7013 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 7014 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 7015 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 7016 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 7017 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 7018 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7019 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 7020 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, 7021/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 7022 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 7023 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 7024 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 7025 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 7026 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 7027 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 7028 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 7029 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 7030 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 7031 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 7032 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 7033 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 7034 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 7035 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 7036 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 7037 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 7038 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 7039 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 7040 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 7041 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 7042 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 7043 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 7044#if 0 7045 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 7046#endif 7047 7048#if 0 7049 /* This one is just used for debugging the implementation. */ 7050 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 7051#endif 7052 7053 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 7054 {NULL, NULL} 7055}; 7056 7057static PyObject * 7058unicode_mod(PyObject *v, PyObject *w) 7059{ 7060 if (!PyUnicode_Check(v)) { 7061 Py_INCREF(Py_NotImplemented); 7062 return Py_NotImplemented; 7063 } 7064 return PyUnicode_Format(v, w); 7065} 7066 7067static PyNumberMethods unicode_as_number = { 7068 0, /*nb_add*/ 7069 0, /*nb_subtract*/ 7070 0, /*nb_multiply*/ 7071 unicode_mod, /*nb_remainder*/ 7072}; 7073 7074static PySequenceMethods unicode_as_sequence = { 7075 (lenfunc) unicode_length, /* sq_length */ 7076 PyUnicode_Concat, /* sq_concat */ 7077 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 7078 (ssizeargfunc) unicode_getitem, /* sq_item */ 7079 (ssizessizeargfunc) unicode_slice, /* sq_slice */ 7080 0, /* sq_ass_item */ 7081 0, /* sq_ass_slice */ 7082 PyUnicode_Contains, /* sq_contains */ 7083}; 7084 7085static PyObject* 7086unicode_subscript(PyUnicodeObject* self, PyObject* item) 7087{ 7088 if (PyIndex_Check(item)) { 7089 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 7090 if (i == -1 && PyErr_Occurred()) 7091 return NULL; 7092 if (i < 0) 7093 i += PyUnicode_GET_SIZE(self); 7094 return unicode_getitem(self, i); 7095 } else if (PySlice_Check(item)) { 7096 Py_ssize_t start, stop, step, slicelength, cur, i; 7097 Py_UNICODE* source_buf; 7098 Py_UNICODE* result_buf; 7099 PyObject* result; 7100 7101 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 7102 &start, &stop, &step, &slicelength) < 0) { 7103 return NULL; 7104 } 7105 7106 if (slicelength <= 0) { 7107 return PyUnicode_FromUnicode(NULL, 0); 7108 } else { 7109 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 7110 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* 7111 sizeof(Py_UNICODE)); 7112 7113 if (result_buf == NULL) 7114 return PyErr_NoMemory(); 7115 7116 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 7117 result_buf[i] = source_buf[cur]; 7118 } 7119 7120 result = PyUnicode_FromUnicode(result_buf, slicelength); 7121 PyMem_FREE(result_buf); 7122 return result; 7123 } 7124 } else { 7125 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 7126 return NULL; 7127 } 7128} 7129 7130static PyMappingMethods unicode_as_mapping = { 7131 (lenfunc)unicode_length, /* mp_length */ 7132 (binaryfunc)unicode_subscript, /* mp_subscript */ 7133 (objobjargproc)0, /* mp_ass_subscript */ 7134}; 7135 7136static Py_ssize_t 7137unicode_buffer_getreadbuf(PyUnicodeObject *self, 7138 Py_ssize_t index, 7139 const void **ptr) 7140{ 7141 if (index != 0) { 7142 PyErr_SetString(PyExc_SystemError, 7143 "accessing non-existent unicode segment"); 7144 return -1; 7145 } 7146 *ptr = (void *) self->str; 7147 return PyUnicode_GET_DATA_SIZE(self); 7148} 7149 7150static Py_ssize_t 7151unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index, 7152 const void **ptr) 7153{ 7154 PyErr_SetString(PyExc_TypeError, 7155 "cannot use unicode as modifiable buffer"); 7156 return -1; 7157} 7158 7159static int 7160unicode_buffer_getsegcount(PyUnicodeObject *self, 7161 Py_ssize_t *lenp) 7162{ 7163 if (lenp) 7164 *lenp = PyUnicode_GET_DATA_SIZE(self); 7165 return 1; 7166} 7167 7168static Py_ssize_t 7169unicode_buffer_getcharbuf(PyUnicodeObject *self, 7170 Py_ssize_t index, 7171 const void **ptr) 7172{ 7173 PyObject *str; 7174 7175 if (index != 0) { 7176 PyErr_SetString(PyExc_SystemError, 7177 "accessing non-existent unicode segment"); 7178 return -1; 7179 } 7180 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 7181 if (str == NULL) 7182 return -1; 7183 *ptr = (void *) PyString_AS_STRING(str); 7184 return PyString_GET_SIZE(str); 7185} 7186 7187/* Helpers for PyUnicode_Format() */ 7188 7189static PyObject * 7190getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 7191{ 7192 Py_ssize_t argidx = *p_argidx; 7193 if (argidx < arglen) { 7194 (*p_argidx)++; 7195 if (arglen < 0) 7196 return args; 7197 else 7198 return PyTuple_GetItem(args, argidx); 7199 } 7200 PyErr_SetString(PyExc_TypeError, 7201 "not enough arguments for format string"); 7202 return NULL; 7203} 7204 7205#define F_LJUST (1<<0) 7206#define F_SIGN (1<<1) 7207#define F_BLANK (1<<2) 7208#define F_ALT (1<<3) 7209#define F_ZERO (1<<4) 7210 7211static Py_ssize_t 7212strtounicode(Py_UNICODE *buffer, const char *charbuffer) 7213{ 7214 register Py_ssize_t i; 7215 Py_ssize_t len = strlen(charbuffer); 7216 for (i = len - 1; i >= 0; i--) 7217 buffer[i] = (Py_UNICODE) charbuffer[i]; 7218 7219 return len; 7220} 7221 7222static int 7223doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 7224{ 7225 Py_ssize_t result; 7226 7227 PyOS_ascii_formatd((char *)buffer, len, format, x); 7228 result = strtounicode(buffer, (char *)buffer); 7229 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 7230} 7231 7232static int 7233longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 7234{ 7235 Py_ssize_t result; 7236 7237 PyOS_snprintf((char *)buffer, len, format, x); 7238 result = strtounicode(buffer, (char *)buffer); 7239 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 7240} 7241 7242/* XXX To save some code duplication, formatfloat/long/int could have been 7243 shared with stringobject.c, converting from 8-bit to Unicode after the 7244 formatting is done. */ 7245 7246static int 7247formatfloat(Py_UNICODE *buf, 7248 size_t buflen, 7249 int flags, 7250 int prec, 7251 int type, 7252 PyObject *v) 7253{ 7254 /* fmt = '%#.' + `prec` + `type` 7255 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 7256 char fmt[20]; 7257 double x; 7258 7259 x = PyFloat_AsDouble(v); 7260 if (x == -1.0 && PyErr_Occurred()) 7261 return -1; 7262 if (prec < 0) 7263 prec = 6; 7264 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 7265 type = 'g'; 7266 /* Worst case length calc to ensure no buffer overrun: 7267 7268 'g' formats: 7269 fmt = %#.<prec>g 7270 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 7271 for any double rep.) 7272 len = 1 + prec + 1 + 2 + 5 = 9 + prec 7273 7274 'f' formats: 7275 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 7276 len = 1 + 50 + 1 + prec = 52 + prec 7277 7278 If prec=0 the effective precision is 1 (the leading digit is 7279 always given), therefore increase the length by one. 7280 7281 */ 7282 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) || 7283 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 7284 PyErr_SetString(PyExc_OverflowError, 7285 "formatted float is too long (precision too large?)"); 7286 return -1; 7287 } 7288 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 7289 (flags&F_ALT) ? "#" : "", 7290 prec, type); 7291 return doubletounicode(buf, buflen, fmt, x); 7292} 7293 7294static PyObject* 7295formatlong(PyObject *val, int flags, int prec, int type) 7296{ 7297 char *buf; 7298 int i, len; 7299 PyObject *str; /* temporary string object. */ 7300 PyUnicodeObject *result; 7301 7302 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 7303 if (!str) 7304 return NULL; 7305 result = _PyUnicode_New(len); 7306 if (!result) { 7307 Py_DECREF(str); 7308 return NULL; 7309 } 7310 for (i = 0; i < len; i++) 7311 result->str[i] = buf[i]; 7312 result->str[len] = 0; 7313 Py_DECREF(str); 7314 return (PyObject*)result; 7315} 7316 7317static int 7318formatint(Py_UNICODE *buf, 7319 size_t buflen, 7320 int flags, 7321 int prec, 7322 int type, 7323 PyObject *v) 7324{ 7325 /* fmt = '%#.' + `prec` + 'l' + `type` 7326 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 7327 * + 1 + 1 7328 * = 24 7329 */ 7330 char fmt[64]; /* plenty big enough! */ 7331 char *sign; 7332 long x; 7333 7334 x = PyInt_AsLong(v); 7335 if (x == -1 && PyErr_Occurred()) 7336 return -1; 7337 if (x < 0 && type == 'u') { 7338 type = 'd'; 7339 } 7340 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 7341 sign = "-"; 7342 else 7343 sign = ""; 7344 if (prec < 0) 7345 prec = 1; 7346 7347 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 7348 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 7349 */ 7350 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 7351 PyErr_SetString(PyExc_OverflowError, 7352 "formatted integer is too long (precision too large?)"); 7353 return -1; 7354 } 7355 7356 if ((flags & F_ALT) && 7357 (type == 'x' || type == 'X')) { 7358 /* When converting under %#x or %#X, there are a number 7359 * of issues that cause pain: 7360 * - when 0 is being converted, the C standard leaves off 7361 * the '0x' or '0X', which is inconsistent with other 7362 * %#x/%#X conversions and inconsistent with Python's 7363 * hex() function 7364 * - there are platforms that violate the standard and 7365 * convert 0 with the '0x' or '0X' 7366 * (Metrowerks, Compaq Tru64) 7367 * - there are platforms that give '0x' when converting 7368 * under %#X, but convert 0 in accordance with the 7369 * standard (OS/2 EMX) 7370 * 7371 * We can achieve the desired consistency by inserting our 7372 * own '0x' or '0X' prefix, and substituting %x/%X in place 7373 * of %#x/%#X. 7374 * 7375 * Note that this is the same approach as used in 7376 * formatint() in stringobject.c 7377 */ 7378 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 7379 sign, type, prec, type); 7380 } 7381 else { 7382 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 7383 sign, (flags&F_ALT) ? "#" : "", 7384 prec, type); 7385 } 7386 if (sign[0]) 7387 return longtounicode(buf, buflen, fmt, -x); 7388 else 7389 return longtounicode(buf, buflen, fmt, x); 7390} 7391 7392static int 7393formatchar(Py_UNICODE *buf, 7394 size_t buflen, 7395 PyObject *v) 7396{ 7397 /* presume that the buffer is at least 2 characters long */ 7398 if (PyUnicode_Check(v)) { 7399 if (PyUnicode_GET_SIZE(v) != 1) 7400 goto onError; 7401 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 7402 } 7403 7404 else if (PyString_Check(v)) { 7405 if (PyString_GET_SIZE(v) != 1) 7406 goto onError; 7407 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 7408 } 7409 7410 else { 7411 /* Integer input truncated to a character */ 7412 long x; 7413 x = PyInt_AsLong(v); 7414 if (x == -1 && PyErr_Occurred()) 7415 goto onError; 7416#ifdef Py_UNICODE_WIDE 7417 if (x < 0 || x > 0x10ffff) { 7418 PyErr_SetString(PyExc_OverflowError, 7419 "%c arg not in range(0x110000) " 7420 "(wide Python build)"); 7421 return -1; 7422 } 7423#else 7424 if (x < 0 || x > 0xffff) { 7425 PyErr_SetString(PyExc_OverflowError, 7426 "%c arg not in range(0x10000) " 7427 "(narrow Python build)"); 7428 return -1; 7429 } 7430#endif 7431 buf[0] = (Py_UNICODE) x; 7432 } 7433 buf[1] = '\0'; 7434 return 1; 7435 7436 onError: 7437 PyErr_SetString(PyExc_TypeError, 7438 "%c requires int or char"); 7439 return -1; 7440} 7441 7442/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 7443 7444 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 7445 chars are formatted. XXX This is a magic number. Each formatting 7446 routine does bounds checking to ensure no overflow, but a better 7447 solution may be to malloc a buffer of appropriate size for each 7448 format. For now, the current solution is sufficient. 7449*/ 7450#define FORMATBUFLEN (size_t)120 7451 7452PyObject *PyUnicode_Format(PyObject *format, 7453 PyObject *args) 7454{ 7455 Py_UNICODE *fmt, *res; 7456 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 7457 int args_owned = 0; 7458 PyUnicodeObject *result = NULL; 7459 PyObject *dict = NULL; 7460 PyObject *uformat; 7461 7462 if (format == NULL || args == NULL) { 7463 PyErr_BadInternalCall(); 7464 return NULL; 7465 } 7466 uformat = PyUnicode_FromObject(format); 7467 if (uformat == NULL) 7468 return NULL; 7469 fmt = PyUnicode_AS_UNICODE(uformat); 7470 fmtcnt = PyUnicode_GET_SIZE(uformat); 7471 7472 reslen = rescnt = fmtcnt + 100; 7473 result = _PyUnicode_New(reslen); 7474 if (result == NULL) 7475 goto onError; 7476 res = PyUnicode_AS_UNICODE(result); 7477 7478 if (PyTuple_Check(args)) { 7479 arglen = PyTuple_Size(args); 7480 argidx = 0; 7481 } 7482 else { 7483 arglen = -1; 7484 argidx = -2; 7485 } 7486 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 7487 !PyObject_TypeCheck(args, &PyBaseString_Type)) 7488 dict = args; 7489 7490 while (--fmtcnt >= 0) { 7491 if (*fmt != '%') { 7492 if (--rescnt < 0) { 7493 rescnt = fmtcnt + 100; 7494 reslen += rescnt; 7495 if (_PyUnicode_Resize(&result, reslen) < 0) 7496 goto onError; 7497 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 7498 --rescnt; 7499 } 7500 *res++ = *fmt++; 7501 } 7502 else { 7503 /* Got a format specifier */ 7504 int flags = 0; 7505 Py_ssize_t width = -1; 7506 int prec = -1; 7507 Py_UNICODE c = '\0'; 7508 Py_UNICODE fill; 7509 PyObject *v = NULL; 7510 PyObject *temp = NULL; 7511 Py_UNICODE *pbuf; 7512 Py_UNICODE sign; 7513 Py_ssize_t len; 7514 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 7515 7516 fmt++; 7517 if (*fmt == '(') { 7518 Py_UNICODE *keystart; 7519 Py_ssize_t keylen; 7520 PyObject *key; 7521 int pcount = 1; 7522 7523 if (dict == NULL) { 7524 PyErr_SetString(PyExc_TypeError, 7525 "format requires a mapping"); 7526 goto onError; 7527 } 7528 ++fmt; 7529 --fmtcnt; 7530 keystart = fmt; 7531 /* Skip over balanced parentheses */ 7532 while (pcount > 0 && --fmtcnt >= 0) { 7533 if (*fmt == ')') 7534 --pcount; 7535 else if (*fmt == '(') 7536 ++pcount; 7537 fmt++; 7538 } 7539 keylen = fmt - keystart - 1; 7540 if (fmtcnt < 0 || pcount > 0) { 7541 PyErr_SetString(PyExc_ValueError, 7542 "incomplete format key"); 7543 goto onError; 7544 } 7545#if 0 7546 /* keys are converted to strings using UTF-8 and 7547 then looked up since Python uses strings to hold 7548 variables names etc. in its namespaces and we 7549 wouldn't want to break common idioms. */ 7550 key = PyUnicode_EncodeUTF8(keystart, 7551 keylen, 7552 NULL); 7553#else 7554 key = PyUnicode_FromUnicode(keystart, keylen); 7555#endif 7556 if (key == NULL) 7557 goto onError; 7558 if (args_owned) { 7559 Py_DECREF(args); 7560 args_owned = 0; 7561 } 7562 args = PyObject_GetItem(dict, key); 7563 Py_DECREF(key); 7564 if (args == NULL) { 7565 goto onError; 7566 } 7567 args_owned = 1; 7568 arglen = -1; 7569 argidx = -2; 7570 } 7571 while (--fmtcnt >= 0) { 7572 switch (c = *fmt++) { 7573 case '-': flags |= F_LJUST; continue; 7574 case '+': flags |= F_SIGN; continue; 7575 case ' ': flags |= F_BLANK; continue; 7576 case '#': flags |= F_ALT; continue; 7577 case '0': flags |= F_ZERO; continue; 7578 } 7579 break; 7580 } 7581 if (c == '*') { 7582 v = getnextarg(args, arglen, &argidx); 7583 if (v == NULL) 7584 goto onError; 7585 if (!PyInt_Check(v)) { 7586 PyErr_SetString(PyExc_TypeError, 7587 "* wants int"); 7588 goto onError; 7589 } 7590 width = PyInt_AsLong(v); 7591 if (width == -1 && PyErr_Occurred()) 7592 goto onError; 7593 if (width < 0) { 7594 flags |= F_LJUST; 7595 width = -width; 7596 } 7597 if (--fmtcnt >= 0) 7598 c = *fmt++; 7599 } 7600 else if (c >= '0' && c <= '9') { 7601 width = c - '0'; 7602 while (--fmtcnt >= 0) { 7603 c = *fmt++; 7604 if (c < '0' || c > '9') 7605 break; 7606 if ((width*10) / 10 != width) { 7607 PyErr_SetString(PyExc_ValueError, 7608 "width too big"); 7609 goto onError; 7610 } 7611 width = width*10 + (c - '0'); 7612 } 7613 } 7614 if (c == '.') { 7615 prec = 0; 7616 if (--fmtcnt >= 0) 7617 c = *fmt++; 7618 if (c == '*') { 7619 v = getnextarg(args, arglen, &argidx); 7620 if (v == NULL) 7621 goto onError; 7622 if (!PyInt_Check(v)) { 7623 PyErr_SetString(PyExc_TypeError, 7624 "* wants int"); 7625 goto onError; 7626 } 7627 prec = PyInt_AsLong(v); 7628 if (prec == -1 && PyErr_Occurred()) 7629 goto onError; 7630 if (prec < 0) 7631 prec = 0; 7632 if (--fmtcnt >= 0) 7633 c = *fmt++; 7634 } 7635 else if (c >= '0' && c <= '9') { 7636 prec = c - '0'; 7637 while (--fmtcnt >= 0) { 7638 c = Py_CHARMASK(*fmt++); 7639 if (c < '0' || c > '9') 7640 break; 7641 if ((prec*10) / 10 != prec) { 7642 PyErr_SetString(PyExc_ValueError, 7643 "prec too big"); 7644 goto onError; 7645 } 7646 prec = prec*10 + (c - '0'); 7647 } 7648 } 7649 } /* prec */ 7650 if (fmtcnt >= 0) { 7651 if (c == 'h' || c == 'l' || c == 'L') { 7652 if (--fmtcnt >= 0) 7653 c = *fmt++; 7654 } 7655 } 7656 if (fmtcnt < 0) { 7657 PyErr_SetString(PyExc_ValueError, 7658 "incomplete format"); 7659 goto onError; 7660 } 7661 if (c != '%') { 7662 v = getnextarg(args, arglen, &argidx); 7663 if (v == NULL) 7664 goto onError; 7665 } 7666 sign = 0; 7667 fill = ' '; 7668 switch (c) { 7669 7670 case '%': 7671 pbuf = formatbuf; 7672 /* presume that buffer length is at least 1 */ 7673 pbuf[0] = '%'; 7674 len = 1; 7675 break; 7676 7677 case 's': 7678 case 'r': 7679 if (PyUnicode_Check(v) && c == 's') { 7680 temp = v; 7681 Py_INCREF(temp); 7682 } 7683 else { 7684 PyObject *unicode; 7685 if (c == 's') 7686 temp = PyObject_Unicode(v); 7687 else 7688 temp = PyObject_Repr(v); 7689 if (temp == NULL) 7690 goto onError; 7691 if (PyUnicode_Check(temp)) 7692 /* nothing to do */; 7693 else if (PyString_Check(temp)) { 7694 /* convert to string to Unicode */ 7695 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 7696 PyString_GET_SIZE(temp), 7697 NULL, 7698 "strict"); 7699 Py_DECREF(temp); 7700 temp = unicode; 7701 if (temp == NULL) 7702 goto onError; 7703 } 7704 else { 7705 Py_DECREF(temp); 7706 PyErr_SetString(PyExc_TypeError, 7707 "%s argument has non-string str()"); 7708 goto onError; 7709 } 7710 } 7711 pbuf = PyUnicode_AS_UNICODE(temp); 7712 len = PyUnicode_GET_SIZE(temp); 7713 if (prec >= 0 && len > prec) 7714 len = prec; 7715 break; 7716 7717 case 'i': 7718 case 'd': 7719 case 'u': 7720 case 'o': 7721 case 'x': 7722 case 'X': 7723 if (c == 'i') 7724 c = 'd'; 7725 if (PyLong_Check(v)) { 7726 temp = formatlong(v, flags, prec, c); 7727 if (!temp) 7728 goto onError; 7729 pbuf = PyUnicode_AS_UNICODE(temp); 7730 len = PyUnicode_GET_SIZE(temp); 7731 sign = 1; 7732 } 7733 else { 7734 pbuf = formatbuf; 7735 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 7736 flags, prec, c, v); 7737 if (len < 0) 7738 goto onError; 7739 sign = 1; 7740 } 7741 if (flags & F_ZERO) 7742 fill = '0'; 7743 break; 7744 7745 case 'e': 7746 case 'E': 7747 case 'f': 7748 case 'F': 7749 case 'g': 7750 case 'G': 7751 if (c == 'F') 7752 c = 'f'; 7753 pbuf = formatbuf; 7754 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 7755 flags, prec, c, v); 7756 if (len < 0) 7757 goto onError; 7758 sign = 1; 7759 if (flags & F_ZERO) 7760 fill = '0'; 7761 break; 7762 7763 case 'c': 7764 pbuf = formatbuf; 7765 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 7766 if (len < 0) 7767 goto onError; 7768 break; 7769 7770 default: 7771 PyErr_Format(PyExc_ValueError, 7772 "unsupported format character '%c' (0x%x) " 7773 "at index %zd", 7774 (31<=c && c<=126) ? (char)c : '?', 7775 (int)c, 7776 (Py_ssize_t)(fmt - 1 - 7777 PyUnicode_AS_UNICODE(uformat))); 7778 goto onError; 7779 } 7780 if (sign) { 7781 if (*pbuf == '-' || *pbuf == '+') { 7782 sign = *pbuf++; 7783 len--; 7784 } 7785 else if (flags & F_SIGN) 7786 sign = '+'; 7787 else if (flags & F_BLANK) 7788 sign = ' '; 7789 else 7790 sign = 0; 7791 } 7792 if (width < len) 7793 width = len; 7794 if (rescnt - (sign != 0) < width) { 7795 reslen -= rescnt; 7796 rescnt = width + fmtcnt + 100; 7797 reslen += rescnt; 7798 if (reslen < 0) { 7799 Py_XDECREF(temp); 7800 PyErr_NoMemory(); 7801 goto onError; 7802 } 7803 if (_PyUnicode_Resize(&result, reslen) < 0) { 7804 Py_XDECREF(temp); 7805 goto onError; 7806 } 7807 res = PyUnicode_AS_UNICODE(result) 7808 + reslen - rescnt; 7809 } 7810 if (sign) { 7811 if (fill != ' ') 7812 *res++ = sign; 7813 rescnt--; 7814 if (width > len) 7815 width--; 7816 } 7817 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 7818 assert(pbuf[0] == '0'); 7819 assert(pbuf[1] == c); 7820 if (fill != ' ') { 7821 *res++ = *pbuf++; 7822 *res++ = *pbuf++; 7823 } 7824 rescnt -= 2; 7825 width -= 2; 7826 if (width < 0) 7827 width = 0; 7828 len -= 2; 7829 } 7830 if (width > len && !(flags & F_LJUST)) { 7831 do { 7832 --rescnt; 7833 *res++ = fill; 7834 } while (--width > len); 7835 } 7836 if (fill == ' ') { 7837 if (sign) 7838 *res++ = sign; 7839 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 7840 assert(pbuf[0] == '0'); 7841 assert(pbuf[1] == c); 7842 *res++ = *pbuf++; 7843 *res++ = *pbuf++; 7844 } 7845 } 7846 Py_UNICODE_COPY(res, pbuf, len); 7847 res += len; 7848 rescnt -= len; 7849 while (--width >= len) { 7850 --rescnt; 7851 *res++ = ' '; 7852 } 7853 if (dict && (argidx < arglen) && c != '%') { 7854 PyErr_SetString(PyExc_TypeError, 7855 "not all arguments converted during string formatting"); 7856 Py_XDECREF(temp); 7857 goto onError; 7858 } 7859 Py_XDECREF(temp); 7860 } /* '%' */ 7861 } /* until end */ 7862 if (argidx < arglen && !dict) { 7863 PyErr_SetString(PyExc_TypeError, 7864 "not all arguments converted during string formatting"); 7865 goto onError; 7866 } 7867 7868 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 7869 goto onError; 7870 if (args_owned) { 7871 Py_DECREF(args); 7872 } 7873 Py_DECREF(uformat); 7874 return (PyObject *)result; 7875 7876 onError: 7877 Py_XDECREF(result); 7878 Py_DECREF(uformat); 7879 if (args_owned) { 7880 Py_DECREF(args); 7881 } 7882 return NULL; 7883} 7884 7885static PyBufferProcs unicode_as_buffer = { 7886 (readbufferproc) unicode_buffer_getreadbuf, 7887 (writebufferproc) unicode_buffer_getwritebuf, 7888 (segcountproc) unicode_buffer_getsegcount, 7889 (charbufferproc) unicode_buffer_getcharbuf, 7890}; 7891 7892static PyObject * 7893unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 7894 7895static PyObject * 7896unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 7897{ 7898 PyObject *x = NULL; 7899 static char *kwlist[] = {"string", "encoding", "errors", 0}; 7900 char *encoding = NULL; 7901 char *errors = NULL; 7902 7903 if (type != &PyUnicode_Type) 7904 return unicode_subtype_new(type, args, kwds); 7905 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 7906 kwlist, &x, &encoding, &errors)) 7907 return NULL; 7908 if (x == NULL) 7909 return (PyObject *)_PyUnicode_New(0); 7910 if (encoding == NULL && errors == NULL) 7911 return PyObject_Unicode(x); 7912 else 7913 return PyUnicode_FromEncodedObject(x, encoding, errors); 7914} 7915 7916static PyObject * 7917unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 7918{ 7919 PyUnicodeObject *tmp, *pnew; 7920 Py_ssize_t n; 7921 7922 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 7923 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 7924 if (tmp == NULL) 7925 return NULL; 7926 assert(PyUnicode_Check(tmp)); 7927 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 7928 if (pnew == NULL) { 7929 Py_DECREF(tmp); 7930 return NULL; 7931 } 7932 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 7933 if (pnew->str == NULL) { 7934 _Py_ForgetReference((PyObject *)pnew); 7935 PyObject_Del(pnew); 7936 Py_DECREF(tmp); 7937 return PyErr_NoMemory(); 7938 } 7939 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 7940 pnew->length = n; 7941 pnew->hash = tmp->hash; 7942 Py_DECREF(tmp); 7943 return (PyObject *)pnew; 7944} 7945 7946PyDoc_STRVAR(unicode_doc, 7947"unicode(string [, encoding[, errors]]) -> object\n\ 7948\n\ 7949Create a new Unicode object from the given encoded string.\n\ 7950encoding defaults to the current default string encoding.\n\ 7951errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 7952 7953static PyObject *unicode_iter(PyObject *seq); 7954 7955PyTypeObject PyUnicode_Type = { 7956 PyObject_HEAD_INIT(&PyType_Type) 7957 0, /* ob_size */ 7958 "unicode", /* tp_name */ 7959 sizeof(PyUnicodeObject), /* tp_size */ 7960 0, /* tp_itemsize */ 7961 /* Slots */ 7962 (destructor)unicode_dealloc, /* tp_dealloc */ 7963 0, /* tp_print */ 7964 0, /* tp_getattr */ 7965 0, /* tp_setattr */ 7966 0, /* tp_compare */ 7967 unicode_repr, /* tp_repr */ 7968 &unicode_as_number, /* tp_as_number */ 7969 &unicode_as_sequence, /* tp_as_sequence */ 7970 &unicode_as_mapping, /* tp_as_mapping */ 7971 (hashfunc) unicode_hash, /* tp_hash*/ 7972 0, /* tp_call*/ 7973 (reprfunc) unicode_str, /* tp_str */ 7974 PyObject_GenericGetAttr, /* tp_getattro */ 7975 0, /* tp_setattro */ 7976 &unicode_as_buffer, /* tp_as_buffer */ 7977 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 7978 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 7979 unicode_doc, /* tp_doc */ 7980 0, /* tp_traverse */ 7981 0, /* tp_clear */ 7982 PyUnicode_RichCompare, /* tp_richcompare */ 7983 0, /* tp_weaklistoffset */ 7984 unicode_iter, /* tp_iter */ 7985 0, /* tp_iternext */ 7986 unicode_methods, /* tp_methods */ 7987 0, /* tp_members */ 7988 0, /* tp_getset */ 7989 &PyBaseString_Type, /* tp_base */ 7990 0, /* tp_dict */ 7991 0, /* tp_descr_get */ 7992 0, /* tp_descr_set */ 7993 0, /* tp_dictoffset */ 7994 0, /* tp_init */ 7995 0, /* tp_alloc */ 7996 unicode_new, /* tp_new */ 7997 PyObject_Del, /* tp_free */ 7998}; 7999 8000/* Initialize the Unicode implementation */ 8001 8002void _PyUnicode_Init(void) 8003{ 8004 int i; 8005 8006 /* XXX - move this array to unicodectype.c ? */ 8007 Py_UNICODE linebreak[] = { 8008 0x000A, /* LINE FEED */ 8009 0x000D, /* CARRIAGE RETURN */ 8010 0x001C, /* FILE SEPARATOR */ 8011 0x001D, /* GROUP SEPARATOR */ 8012 0x001E, /* RECORD SEPARATOR */ 8013 0x0085, /* NEXT LINE */ 8014 0x2028, /* LINE SEPARATOR */ 8015 0x2029, /* PARAGRAPH SEPARATOR */ 8016 }; 8017 8018 /* Init the implementation */ 8019 unicode_freelist = NULL; 8020 unicode_freelist_size = 0; 8021 unicode_empty = _PyUnicode_New(0); 8022 if (!unicode_empty) 8023 return; 8024 8025 strcpy(unicode_default_encoding, "ascii"); 8026 for (i = 0; i < 256; i++) 8027 unicode_latin1[i] = NULL; 8028 if (PyType_Ready(&PyUnicode_Type) < 0) 8029 Py_FatalError("Can't initialize 'unicode'"); 8030 8031 /* initialize the linebreak bloom filter */ 8032 bloom_linebreak = make_bloom_mask( 8033 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 8034 ); 8035 8036 PyType_Ready(&EncodingMapType); 8037} 8038 8039/* Finalize the Unicode implementation */ 8040 8041void 8042_PyUnicode_Fini(void) 8043{ 8044 PyUnicodeObject *u; 8045 int i; 8046 8047 Py_XDECREF(unicode_empty); 8048 unicode_empty = NULL; 8049 8050 for (i = 0; i < 256; i++) { 8051 if (unicode_latin1[i]) { 8052 Py_DECREF(unicode_latin1[i]); 8053 unicode_latin1[i] = NULL; 8054 } 8055 } 8056 8057 for (u = unicode_freelist; u != NULL;) { 8058 PyUnicodeObject *v = u; 8059 u = *(PyUnicodeObject **)u; 8060 if (v->str) 8061 PyMem_DEL(v->str); 8062 Py_XDECREF(v->defenc); 8063 PyObject_Del(v); 8064 } 8065 unicode_freelist = NULL; 8066 unicode_freelist_size = 0; 8067} 8068 8069 8070 8071/********************* Unicode Iterator **************************/ 8072 8073typedef struct { 8074 PyObject_HEAD 8075 Py_ssize_t it_index; 8076 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 8077} unicodeiterobject; 8078 8079static void 8080unicodeiter_dealloc(unicodeiterobject *it) 8081{ 8082 _PyObject_GC_UNTRACK(it); 8083 Py_XDECREF(it->it_seq); 8084 PyObject_GC_Del(it); 8085} 8086 8087static int 8088unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 8089{ 8090 Py_VISIT(it->it_seq); 8091 return 0; 8092} 8093 8094static PyObject * 8095unicodeiter_next(unicodeiterobject *it) 8096{ 8097 PyUnicodeObject *seq; 8098 PyObject *item; 8099 8100 assert(it != NULL); 8101 seq = it->it_seq; 8102 if (seq == NULL) 8103 return NULL; 8104 assert(PyUnicode_Check(seq)); 8105 8106 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 8107 item = PyUnicode_FromUnicode( 8108 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 8109 if (item != NULL) 8110 ++it->it_index; 8111 return item; 8112 } 8113 8114 Py_DECREF(seq); 8115 it->it_seq = NULL; 8116 return NULL; 8117} 8118 8119static PyObject * 8120unicodeiter_len(unicodeiterobject *it) 8121{ 8122 Py_ssize_t len = 0; 8123 if (it->it_seq) 8124 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 8125 return PyInt_FromSsize_t(len); 8126} 8127 8128PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 8129 8130static PyMethodDef unicodeiter_methods[] = { 8131 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 8132 length_hint_doc}, 8133 {NULL, NULL} /* sentinel */ 8134}; 8135 8136PyTypeObject PyUnicodeIter_Type = { 8137 PyObject_HEAD_INIT(&PyType_Type) 8138 0, /* ob_size */ 8139 "unicodeiterator", /* tp_name */ 8140 sizeof(unicodeiterobject), /* tp_basicsize */ 8141 0, /* tp_itemsize */ 8142 /* methods */ 8143 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 8144 0, /* tp_print */ 8145 0, /* tp_getattr */ 8146 0, /* tp_setattr */ 8147 0, /* tp_compare */ 8148 0, /* tp_repr */ 8149 0, /* tp_as_number */ 8150 0, /* tp_as_sequence */ 8151 0, /* tp_as_mapping */ 8152 0, /* tp_hash */ 8153 0, /* tp_call */ 8154 0, /* tp_str */ 8155 PyObject_GenericGetAttr, /* tp_getattro */ 8156 0, /* tp_setattro */ 8157 0, /* tp_as_buffer */ 8158 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 8159 0, /* tp_doc */ 8160 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 8161 0, /* tp_clear */ 8162 0, /* tp_richcompare */ 8163 0, /* tp_weaklistoffset */ 8164 PyObject_SelfIter, /* tp_iter */ 8165 (iternextfunc)unicodeiter_next, /* tp_iternext */ 8166 unicodeiter_methods, /* tp_methods */ 8167 0, 8168}; 8169 8170static PyObject * 8171unicode_iter(PyObject *seq) 8172{ 8173 unicodeiterobject *it; 8174 8175 if (!PyUnicode_Check(seq)) { 8176 PyErr_BadInternalCall(); 8177 return NULL; 8178 } 8179 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 8180 if (it == NULL) 8181 return NULL; 8182 it->it_index = 0; 8183 Py_INCREF(seq); 8184 it->it_seq = (PyUnicodeObject *)seq; 8185 _PyObject_GC_TRACK(it); 8186 return (PyObject *)it; 8187} 8188 8189#ifdef __cplusplus 8190} 8191#endif 8192 8193 8194/* 8195Local variables: 8196c-basic-offset: 4 8197indent-tabs-mode: nil 8198End: 8199*/ 8200