unicodeobject.c revision 0e3f591aeeef9ed715f8770320f4c4c7332a8794
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44 45#include "unicodeobject.h" 46#include "ucnhash.h" 47 48#ifdef MS_WINDOWS 49#include <windows.h> 50#endif 51 52/* Limit for the Unicode object free list */ 53 54#define MAX_UNICODE_FREELIST_SIZE 1024 55 56/* Limit for the Unicode object free list stay alive optimization. 57 58 The implementation will keep allocated Unicode memory intact for 59 all objects on the free list having a size less than this 60 limit. This reduces malloc() overhead for small Unicode objects. 61 62 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 64 malloc()-overhead) bytes of unused garbage. 65 66 Setting the limit to 0 effectively turns the feature off. 67 68 Note: This is an experimental feature ! If you get core dumps when 69 using Unicode objects, turn this feature off. 70 71*/ 72 73#define KEEPALIVE_SIZE_LIMIT 9 74 75/* Endianness switches; defaults to little endian */ 76 77#ifdef WORDS_BIGENDIAN 78# define BYTEORDER_IS_BIG_ENDIAN 79#else 80# define BYTEORDER_IS_LITTLE_ENDIAN 81#endif 82 83/* --- Globals ------------------------------------------------------------ 84 85 The globals are initialized by the _PyUnicode_Init() API and should 86 not be used before calling that API. 87 88*/ 89 90 91#ifdef __cplusplus 92extern "C" { 93#endif 94 95/* Free list for Unicode objects */ 96static PyUnicodeObject *unicode_freelist; 97static int unicode_freelist_size; 98 99/* The empty Unicode object is shared to improve performance. */ 100static PyUnicodeObject *unicode_empty; 101 102/* Single character Unicode strings in the Latin-1 range are being 103 shared as well. */ 104static PyUnicodeObject *unicode_latin1[256]; 105 106/* Default encoding to use and assume when NULL is passed as encoding 107 parameter; it is initialized by _PyUnicode_Init(). 
108 109 Always use the PyUnicode_SetDefaultEncoding() and 110 PyUnicode_GetDefaultEncoding() APIs to access this global. 111 112*/ 113static char unicode_default_encoding[100]; 114 115Py_UNICODE 116PyUnicode_GetMax(void) 117{ 118#ifdef Py_UNICODE_WIDE 119 return 0x10FFFF; 120#else 121 /* This is actually an illegal character, so it should 122 not be passed to unichr. */ 123 return 0xFFFF; 124#endif 125} 126 127/* --- Bloom Filters ----------------------------------------------------- */ 128 129/* stuff to implement simple "bloom filters" for Unicode characters. 130 to keep things simple, we use a single bitmask, using the least 5 131 bits from each unicode characters as the bit index. */ 132 133/* the linebreak mask is set up by Unicode_Init below */ 134 135#define BLOOM_MASK unsigned long 136 137static BLOOM_MASK bloom_linebreak; 138 139#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 140 141#define BLOOM_LINEBREAK(ch)\ 142 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) 143 144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 145{ 146 /* calculate simple bloom-style bitmask for a given unicode string */ 147 148 long mask; 149 Py_ssize_t i; 150 151 mask = 0; 152 for (i = 0; i < len; i++) 153 mask |= (1 << (ptr[i] & 0x1F)); 154 155 return mask; 156} 157 158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 159{ 160 Py_ssize_t i; 161 162 for (i = 0; i < setlen; i++) 163 if (set[i] == chr) 164 return 1; 165 166 return 0; 167} 168 169#define BLOOM_MEMBER(mask, chr, set, setlen)\ 170 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 171 172/* --- Unicode Object ----------------------------------------------------- */ 173 174static 175int unicode_resize(register PyUnicodeObject *unicode, 176 Py_ssize_t length) 177{ 178 void *oldstr; 179 180 /* Shortcut if there's nothing much to do. 
*/ 181 if (unicode->length == length) 182 goto reset; 183 184 /* Resizing shared object (unicode_empty or single character 185 objects) in-place is not allowed. Use PyUnicode_Resize() 186 instead ! */ 187 188 if (unicode == unicode_empty || 189 (unicode->length == 1 && 190 unicode->str[0] < 256U && 191 unicode_latin1[unicode->str[0]] == unicode)) { 192 PyErr_SetString(PyExc_SystemError, 193 "can't resize shared unicode objects"); 194 return -1; 195 } 196 197 /* We allocate one more byte to make sure the string is Ux0000 terminated. 198 The overallocation is also used by fastsearch, which assumes that it's 199 safe to look at str[length] (without making any assumptions about what 200 it contains). */ 201 202 oldstr = unicode->str; 203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 204 if (!unicode->str) { 205 unicode->str = (Py_UNICODE *)oldstr; 206 PyErr_NoMemory(); 207 return -1; 208 } 209 unicode->str[length] = 0; 210 unicode->length = length; 211 212 reset: 213 /* Reset the object caches */ 214 if (unicode->defenc) { 215 Py_DECREF(unicode->defenc); 216 unicode->defenc = NULL; 217 } 218 unicode->hash = -1; 219 220 return 0; 221} 222 223/* We allocate one more byte to make sure the string is 224 Ux0000 terminated -- XXX is this needed ? 225 226 XXX This allocator could further be enhanced by assuring that the 227 free list never reduces its size below 1. 228 229*/ 230 231static 232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 233{ 234 register PyUnicodeObject *unicode; 235 236 /* Optimization for empty strings */ 237 if (length == 0 && unicode_empty != NULL) { 238 Py_INCREF(unicode_empty); 239 return unicode_empty; 240 } 241 242 /* Unicode freelist & memory allocation */ 243 if (unicode_freelist) { 244 unicode = unicode_freelist; 245 unicode_freelist = *(PyUnicodeObject **)unicode; 246 unicode_freelist_size--; 247 if (unicode->str) { 248 /* Keep-Alive optimization: we only upsize the buffer, 249 never downsize it. 
*/ 250 if ((unicode->length < length) && 251 unicode_resize(unicode, length) < 0) { 252 PyMem_DEL(unicode->str); 253 goto onError; 254 } 255 } 256 else { 257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 258 } 259 PyObject_INIT(unicode, &PyUnicode_Type); 260 } 261 else { 262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 263 if (unicode == NULL) 264 return NULL; 265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 266 } 267 268 if (!unicode->str) { 269 PyErr_NoMemory(); 270 goto onError; 271 } 272 /* Initialize the first element to guard against cases where 273 * the caller fails before initializing str -- unicode_resize() 274 * reads str[0], and the Keep-Alive optimization can keep memory 275 * allocated for str alive across a call to unicode_dealloc(unicode). 276 * We don't want unicode_resize to read uninitialized memory in 277 * that case. 278 */ 279 unicode->str[0] = 0; 280 unicode->str[length] = 0; 281 unicode->length = length; 282 unicode->hash = -1; 283 unicode->defenc = NULL; 284 return unicode; 285 286 onError: 287 _Py_ForgetReference((PyObject *)unicode); 288 PyObject_Del(unicode); 289 return NULL; 290} 291 292static 293void unicode_dealloc(register PyUnicodeObject *unicode) 294{ 295 if (PyUnicode_CheckExact(unicode) && 296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 297 /* Keep-Alive optimization */ 298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 299 PyMem_DEL(unicode->str); 300 unicode->str = NULL; 301 unicode->length = 0; 302 } 303 if (unicode->defenc) { 304 Py_DECREF(unicode->defenc); 305 unicode->defenc = NULL; 306 } 307 /* Add to free list */ 308 *(PyUnicodeObject **)unicode = unicode_freelist; 309 unicode_freelist = unicode; 310 unicode_freelist_size++; 311 } 312 else { 313 PyMem_DEL(unicode->str); 314 Py_XDECREF(unicode->defenc); 315 unicode->ob_type->tp_free((PyObject *)unicode); 316 } 317} 318 319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 320{ 321 register PyUnicodeObject *v; 322 323 /* Argument 
checks */ 324 if (unicode == NULL) { 325 PyErr_BadInternalCall(); 326 return -1; 327 } 328 v = (PyUnicodeObject *)*unicode; 329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 330 PyErr_BadInternalCall(); 331 return -1; 332 } 333 334 /* Resizing unicode_empty and single character objects is not 335 possible since these are being shared. We simply return a fresh 336 copy with the same Unicode content. */ 337 if (v->length != length && 338 (v == unicode_empty || v->length == 1)) { 339 PyUnicodeObject *w = _PyUnicode_New(length); 340 if (w == NULL) 341 return -1; 342 Py_UNICODE_COPY(w->str, v->str, 343 length < v->length ? length : v->length); 344 Py_DECREF(*unicode); 345 *unicode = (PyObject *)w; 346 return 0; 347 } 348 349 /* Note that we don't have to modify *unicode for unshared Unicode 350 objects, since we can modify them in-place. */ 351 return unicode_resize(v, length); 352} 353 354/* Internal API for use in unicodeobject.c only ! */ 355#define _PyUnicode_Resize(unicodevar, length) \ 356 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 357 358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 359 Py_ssize_t size) 360{ 361 PyUnicodeObject *unicode; 362 363 /* If the Unicode data is known at construction time, we can apply 364 some optimizations which share commonly used objects. 
*/ 365 if (u != NULL) { 366 367 /* Optimization for empty strings */ 368 if (size == 0 && unicode_empty != NULL) { 369 Py_INCREF(unicode_empty); 370 return (PyObject *)unicode_empty; 371 } 372 373 /* Single character Unicode objects in the Latin-1 range are 374 shared when using this constructor */ 375 if (size == 1 && *u < 256) { 376 unicode = unicode_latin1[*u]; 377 if (!unicode) { 378 unicode = _PyUnicode_New(1); 379 if (!unicode) 380 return NULL; 381 unicode->str[0] = *u; 382 unicode_latin1[*u] = unicode; 383 } 384 Py_INCREF(unicode); 385 return (PyObject *)unicode; 386 } 387 } 388 389 unicode = _PyUnicode_New(size); 390 if (!unicode) 391 return NULL; 392 393 /* Copy the Unicode data into the new object */ 394 if (u != NULL) 395 Py_UNICODE_COPY(unicode->str, u, size); 396 397 return (PyObject *)unicode; 398} 399 400#ifdef HAVE_WCHAR_H 401 402PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 403 Py_ssize_t size) 404{ 405 PyUnicodeObject *unicode; 406 407 if (w == NULL) { 408 PyErr_BadInternalCall(); 409 return NULL; 410 } 411 412 unicode = _PyUnicode_New(size); 413 if (!unicode) 414 return NULL; 415 416 /* Copy the wchar_t data into the new object */ 417#ifdef HAVE_USABLE_WCHAR_T 418 memcpy(unicode->str, w, size * sizeof(wchar_t)); 419#else 420 { 421 register Py_UNICODE *u; 422 register Py_ssize_t i; 423 u = PyUnicode_AS_UNICODE(unicode); 424 for (i = size; i > 0; i--) 425 *u++ = *w++; 426 } 427#endif 428 429 return (PyObject *)unicode; 430} 431 432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 433 wchar_t *w, 434 Py_ssize_t size) 435{ 436 if (unicode == NULL) { 437 PyErr_BadInternalCall(); 438 return -1; 439 } 440 441 /* If possible, try to copy the 0-termination as well */ 442 if (size > PyUnicode_GET_SIZE(unicode)) 443 size = PyUnicode_GET_SIZE(unicode) + 1; 444 445#ifdef HAVE_USABLE_WCHAR_T 446 memcpy(w, unicode->str, size * sizeof(wchar_t)); 447#else 448 { 449 register Py_UNICODE *u; 450 register Py_ssize_t i; 451 u = 
PyUnicode_AS_UNICODE(unicode); 452 for (i = size; i > 0; i--) 453 *w++ = *u++; 454 } 455#endif 456 457 if (size > PyUnicode_GET_SIZE(unicode)) 458 return PyUnicode_GET_SIZE(unicode); 459 else 460 return size; 461} 462 463#endif 464 465PyObject *PyUnicode_FromOrdinal(int ordinal) 466{ 467 Py_UNICODE s[1]; 468 469#ifdef Py_UNICODE_WIDE 470 if (ordinal < 0 || ordinal > 0x10ffff) { 471 PyErr_SetString(PyExc_ValueError, 472 "unichr() arg not in range(0x110000) " 473 "(wide Python build)"); 474 return NULL; 475 } 476#else 477 if (ordinal < 0 || ordinal > 0xffff) { 478 PyErr_SetString(PyExc_ValueError, 479 "unichr() arg not in range(0x10000) " 480 "(narrow Python build)"); 481 return NULL; 482 } 483#endif 484 485 s[0] = (Py_UNICODE)ordinal; 486 return PyUnicode_FromUnicode(s, 1); 487} 488 489PyObject *PyUnicode_FromObject(register PyObject *obj) 490{ 491 /* XXX Perhaps we should make this API an alias of 492 PyObject_Unicode() instead ?! */ 493 if (PyUnicode_CheckExact(obj)) { 494 Py_INCREF(obj); 495 return obj; 496 } 497 if (PyUnicode_Check(obj)) { 498 /* For a Unicode subtype that's not a Unicode object, 499 return a true Unicode object with the same data. */ 500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 501 PyUnicode_GET_SIZE(obj)); 502 } 503 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 504} 505 506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 507 const char *encoding, 508 const char *errors) 509{ 510 const char *s = NULL; 511 Py_ssize_t len; 512 PyObject *v; 513 514 if (obj == NULL) { 515 PyErr_BadInternalCall(); 516 return NULL; 517 } 518 519#if 0 520 /* For b/w compatibility we also accept Unicode objects provided 521 that no encodings is given and then redirect to 522 PyObject_Unicode() which then applies the additional logic for 523 Unicode subclasses. 524 525 NOTE: This API should really only be used for object which 526 represent *encoded* Unicode ! 
527 528 */ 529 if (PyUnicode_Check(obj)) { 530 if (encoding) { 531 PyErr_SetString(PyExc_TypeError, 532 "decoding Unicode is not supported"); 533 return NULL; 534 } 535 return PyObject_Unicode(obj); 536 } 537#else 538 if (PyUnicode_Check(obj)) { 539 PyErr_SetString(PyExc_TypeError, 540 "decoding Unicode is not supported"); 541 return NULL; 542 } 543#endif 544 545 /* Coerce object */ 546 if (PyString_Check(obj)) { 547 s = PyString_AS_STRING(obj); 548 len = PyString_GET_SIZE(obj); 549 } 550 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 551 /* Overwrite the error message with something more useful in 552 case of a TypeError. */ 553 if (PyErr_ExceptionMatches(PyExc_TypeError)) 554 PyErr_Format(PyExc_TypeError, 555 "coercing to Unicode: need string or buffer, " 556 "%.80s found", 557 obj->ob_type->tp_name); 558 goto onError; 559 } 560 561 /* Convert to Unicode */ 562 if (len == 0) { 563 Py_INCREF(unicode_empty); 564 v = (PyObject *)unicode_empty; 565 } 566 else 567 v = PyUnicode_Decode(s, len, encoding, errors); 568 569 return v; 570 571 onError: 572 return NULL; 573} 574 575PyObject *PyUnicode_Decode(const char *s, 576 Py_ssize_t size, 577 const char *encoding, 578 const char *errors) 579{ 580 PyObject *buffer = NULL, *unicode; 581 582 if (encoding == NULL) 583 encoding = PyUnicode_GetDefaultEncoding(); 584 585 /* Shortcuts for common default encodings */ 586 if (strcmp(encoding, "utf-8") == 0) 587 return PyUnicode_DecodeUTF8(s, size, errors); 588 else if (strcmp(encoding, "latin-1") == 0) 589 return PyUnicode_DecodeLatin1(s, size, errors); 590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 591 else if (strcmp(encoding, "mbcs") == 0) 592 return PyUnicode_DecodeMBCS(s, size, errors); 593#endif 594 else if (strcmp(encoding, "ascii") == 0) 595 return PyUnicode_DecodeASCII(s, size, errors); 596 597 /* Decode via the codec registry */ 598 buffer = PyBuffer_FromMemory((void *)s, size); 599 if (buffer == NULL) 600 goto onError; 601 unicode = PyCodec_Decode(buffer, 
encoding, errors); 602 if (unicode == NULL) 603 goto onError; 604 if (!PyUnicode_Check(unicode)) { 605 PyErr_Format(PyExc_TypeError, 606 "decoder did not return an unicode object (type=%.400s)", 607 unicode->ob_type->tp_name); 608 Py_DECREF(unicode); 609 goto onError; 610 } 611 Py_DECREF(buffer); 612 return unicode; 613 614 onError: 615 Py_XDECREF(buffer); 616 return NULL; 617} 618 619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 620 const char *encoding, 621 const char *errors) 622{ 623 PyObject *v; 624 625 if (!PyUnicode_Check(unicode)) { 626 PyErr_BadArgument(); 627 goto onError; 628 } 629 630 if (encoding == NULL) 631 encoding = PyUnicode_GetDefaultEncoding(); 632 633 /* Decode via the codec registry */ 634 v = PyCodec_Decode(unicode, encoding, errors); 635 if (v == NULL) 636 goto onError; 637 return v; 638 639 onError: 640 return NULL; 641} 642 643PyObject *PyUnicode_Encode(const Py_UNICODE *s, 644 Py_ssize_t size, 645 const char *encoding, 646 const char *errors) 647{ 648 PyObject *v, *unicode; 649 650 unicode = PyUnicode_FromUnicode(s, size); 651 if (unicode == NULL) 652 return NULL; 653 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 654 Py_DECREF(unicode); 655 return v; 656} 657 658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 659 const char *encoding, 660 const char *errors) 661{ 662 PyObject *v; 663 664 if (!PyUnicode_Check(unicode)) { 665 PyErr_BadArgument(); 666 goto onError; 667 } 668 669 if (encoding == NULL) 670 encoding = PyUnicode_GetDefaultEncoding(); 671 672 /* Encode via the codec registry */ 673 v = PyCodec_Encode(unicode, encoding, errors); 674 if (v == NULL) 675 goto onError; 676 return v; 677 678 onError: 679 return NULL; 680} 681 682PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 683 const char *encoding, 684 const char *errors) 685{ 686 PyObject *v; 687 688 if (!PyUnicode_Check(unicode)) { 689 PyErr_BadArgument(); 690 goto onError; 691 } 692 693 if (encoding == NULL) 694 encoding = 
PyUnicode_GetDefaultEncoding(); 695 696 /* Shortcuts for common default encodings */ 697 if (errors == NULL) { 698 if (strcmp(encoding, "utf-8") == 0) 699 return PyUnicode_AsUTF8String(unicode); 700 else if (strcmp(encoding, "latin-1") == 0) 701 return PyUnicode_AsLatin1String(unicode); 702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 703 else if (strcmp(encoding, "mbcs") == 0) 704 return PyUnicode_AsMBCSString(unicode); 705#endif 706 else if (strcmp(encoding, "ascii") == 0) 707 return PyUnicode_AsASCIIString(unicode); 708 } 709 710 /* Encode via the codec registry */ 711 v = PyCodec_Encode(unicode, encoding, errors); 712 if (v == NULL) 713 goto onError; 714 if (!PyString_Check(v)) { 715 PyErr_Format(PyExc_TypeError, 716 "encoder did not return a string object (type=%.400s)", 717 v->ob_type->tp_name); 718 Py_DECREF(v); 719 goto onError; 720 } 721 return v; 722 723 onError: 724 return NULL; 725} 726 727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 728 const char *errors) 729{ 730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 731 732 if (v) 733 return v; 734 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 735 if (v && errors == NULL) 736 ((PyUnicodeObject *)unicode)->defenc = v; 737 return v; 738} 739 740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 741{ 742 if (!PyUnicode_Check(unicode)) { 743 PyErr_BadArgument(); 744 goto onError; 745 } 746 return PyUnicode_AS_UNICODE(unicode); 747 748 onError: 749 return NULL; 750} 751 752Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 753{ 754 if (!PyUnicode_Check(unicode)) { 755 PyErr_BadArgument(); 756 goto onError; 757 } 758 return PyUnicode_GET_SIZE(unicode); 759 760 onError: 761 return -1; 762} 763 764const char *PyUnicode_GetDefaultEncoding(void) 765{ 766 return unicode_default_encoding; 767} 768 769int PyUnicode_SetDefaultEncoding(const char *encoding) 770{ 771 PyObject *v; 772 773 /* Make sure the encoding is valid. 
As side effect, this also 774 loads the encoding into the codec registry cache. */ 775 v = _PyCodec_Lookup(encoding); 776 if (v == NULL) 777 goto onError; 778 Py_DECREF(v); 779 strncpy(unicode_default_encoding, 780 encoding, 781 sizeof(unicode_default_encoding)); 782 return 0; 783 784 onError: 785 return -1; 786} 787 788/* error handling callback helper: 789 build arguments, call the callback and check the arguments, 790 if no exception occurred, copy the replacement to the output 791 and adjust various state variables. 792 return 0 on success, -1 on error 793*/ 794 795static 796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 797 const char *encoding, const char *reason, 798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 800{ 801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 802 803 PyObject *restuple = NULL; 804 PyObject *repunicode = NULL; 805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 806 Py_ssize_t requiredsize; 807 Py_ssize_t newpos; 808 Py_UNICODE *repptr; 809 Py_ssize_t repsize; 810 int res = -1; 811 812 if (*errorHandler == NULL) { 813 *errorHandler = PyCodec_LookupError(errors); 814 if (*errorHandler == NULL) 815 goto onError; 816 } 817 818 if (*exceptionObject == NULL) { 819 *exceptionObject = PyUnicodeDecodeError_Create( 820 encoding, input, insize, *startinpos, *endinpos, reason); 821 if (*exceptionObject == NULL) 822 goto onError; 823 } 824 else { 825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 826 goto onError; 827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 828 goto onError; 829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 830 goto onError; 831 } 832 833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 834 if (restuple == NULL) 835 goto 
onError; 836 if (!PyTuple_Check(restuple)) { 837 PyErr_Format(PyExc_TypeError, &argparse[4]); 838 goto onError; 839 } 840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 841 goto onError; 842 if (newpos<0) 843 newpos = insize+newpos; 844 if (newpos<0 || newpos>insize) { 845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 846 goto onError; 847 } 848 849 /* need more space? (at least enough for what we 850 have+the replacement+the rest of the string (starting 851 at the new input position), so we won't have to check space 852 when there are no errors in the rest of the string) */ 853 repptr = PyUnicode_AS_UNICODE(repunicode); 854 repsize = PyUnicode_GET_SIZE(repunicode); 855 requiredsize = *outpos + repsize + insize-newpos; 856 if (requiredsize > outsize) { 857 if (requiredsize<2*outsize) 858 requiredsize = 2*outsize; 859 if (PyUnicode_Resize(output, requiredsize) < 0) 860 goto onError; 861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 862 } 863 *endinpos = newpos; 864 *inptr = input + newpos; 865 Py_UNICODE_COPY(*outptr, repptr, repsize); 866 *outptr += repsize; 867 *outpos += repsize; 868 /* we made it! */ 869 res = 0; 870 871 onError: 872 Py_XDECREF(restuple); 873 return res; 874} 875 876/* --- UTF-7 Codec -------------------------------------------------------- */ 877 878/* see RFC2152 for details */ 879 880static 881char utf7_special[128] = { 882 /* indicate whether a UTF-7 character is special i.e. 
cannot be directly 883 encoded: 884 0 - not special 885 1 - special 886 2 - whitespace (optional) 887 3 - RFC2152 Set O (optional) */ 888 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 890 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 892 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 894 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 895 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 896 897}; 898 899/* Note: The comparison (c) <= 0 is a trick to work-around gcc 900 warnings about the comparison always being false; since 901 utf7_special[0] is 1, we can safely make that one comparison 902 true */ 903 904#define SPECIAL(c, encodeO, encodeWS) \ 905 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 906 (encodeWS && (utf7_special[(c)] == 2)) || \ 907 (encodeO && (utf7_special[(c)] == 3))) 908 909#define B64(n) \ 910 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 911#define B64CHAR(c) \ 912 (isalnum(c) || (c) == '+' || (c) == '/') 913#define UB64(c) \ 914 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 915 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 916 917#define ENCODE(out, ch, bits) \ 918 while (bits >= 6) { \ 919 *out++ = B64(ch >> (bits-6)); \ 920 bits -= 6; \ 921 } 922 923#define DECODE(out, ch, bits, surrogate) \ 924 while (bits >= 16) { \ 925 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 926 bits -= 16; \ 927 if (surrogate) { \ 928 /* We have already generated an error for the high surrogate \ 929 so let's not bother seeing if the low surrogate is correct or not */ \ 930 surrogate = 0; \ 931 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 932 /* This is a surrogate pair. 
Unfortunately we can't represent \ 933 it in a 16-bit character */ \ 934 surrogate = 1; \ 935 errmsg = "code pairs are not supported"; \ 936 goto utf7Error; \ 937 } else { \ 938 *out++ = outCh; \ 939 } \ 940 } 941 942PyObject *PyUnicode_DecodeUTF7(const char *s, 943 Py_ssize_t size, 944 const char *errors) 945{ 946 const char *starts = s; 947 Py_ssize_t startinpos; 948 Py_ssize_t endinpos; 949 Py_ssize_t outpos; 950 const char *e; 951 PyUnicodeObject *unicode; 952 Py_UNICODE *p; 953 const char *errmsg = ""; 954 int inShift = 0; 955 unsigned int bitsleft = 0; 956 unsigned long charsleft = 0; 957 int surrogate = 0; 958 PyObject *errorHandler = NULL; 959 PyObject *exc = NULL; 960 961 unicode = _PyUnicode_New(size); 962 if (!unicode) 963 return NULL; 964 if (size == 0) 965 return (PyObject *)unicode; 966 967 p = unicode->str; 968 e = s + size; 969 970 while (s < e) { 971 Py_UNICODE ch; 972 restart: 973 ch = *s; 974 975 if (inShift) { 976 if ((ch == '-') || !B64CHAR(ch)) { 977 inShift = 0; 978 s++; 979 980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 981 if (bitsleft >= 6) { 982 /* The shift sequence has a partial character in it. If 983 bitsleft < 6 then we could just classify it as padding 984 but that is not the case here */ 985 986 errmsg = "partial character in shift sequence"; 987 goto utf7Error; 988 } 989 /* According to RFC2152 the remaining bits should be zero. We 990 choose to signal an error/insert a replacement character 991 here so indicate the potential of a misencoded character. 
*/ 992 993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 995 errmsg = "non-zero padding bits in shift sequence"; 996 goto utf7Error; 997 } 998 999 if (ch == '-') { 1000 if ((s < e) && (*(s) == '-')) { 1001 *p++ = '-'; 1002 inShift = 1; 1003 } 1004 } else if (SPECIAL(ch,0,0)) { 1005 errmsg = "unexpected special character"; 1006 goto utf7Error; 1007 } else { 1008 *p++ = ch; 1009 } 1010 } else { 1011 charsleft = (charsleft << 6) | UB64(ch); 1012 bitsleft += 6; 1013 s++; 1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1015 } 1016 } 1017 else if ( ch == '+' ) { 1018 startinpos = s-starts; 1019 s++; 1020 if (s < e && *s == '-') { 1021 s++; 1022 *p++ = '+'; 1023 } else 1024 { 1025 inShift = 1; 1026 bitsleft = 0; 1027 } 1028 } 1029 else if (SPECIAL(ch,0,0)) { 1030 errmsg = "unexpected special character"; 1031 s++; 1032 goto utf7Error; 1033 } 1034 else { 1035 *p++ = ch; 1036 s++; 1037 } 1038 continue; 1039 utf7Error: 1040 outpos = p-PyUnicode_AS_UNICODE(unicode); 1041 endinpos = s-starts; 1042 if (unicode_decode_call_errorhandler( 1043 errors, &errorHandler, 1044 "utf7", errmsg, 1045 starts, size, &startinpos, &endinpos, &exc, &s, 1046 (PyObject **)&unicode, &outpos, &p)) 1047 goto onError; 1048 } 1049 1050 if (inShift) { 1051 outpos = p-PyUnicode_AS_UNICODE(unicode); 1052 endinpos = size; 1053 if (unicode_decode_call_errorhandler( 1054 errors, &errorHandler, 1055 "utf7", "unterminated shift sequence", 1056 starts, size, &startinpos, &endinpos, &exc, &s, 1057 (PyObject **)&unicode, &outpos, &p)) 1058 goto onError; 1059 if (s < e) 1060 goto restart; 1061 } 1062 1063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1064 goto onError; 1065 1066 Py_XDECREF(errorHandler); 1067 Py_XDECREF(exc); 1068 return (PyObject *)unicode; 1069 1070onError: 1071 Py_XDECREF(errorHandler); 1072 Py_XDECREF(exc); 1073 Py_DECREF(unicode); 
1074 return NULL; 1075} 1076 1077 1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1079 Py_ssize_t size, 1080 int encodeSetO, 1081 int encodeWhiteSpace, 1082 const char *errors) 1083{ 1084 PyObject *v; 1085 /* It might be possible to tighten this worst case */ 1086 Py_ssize_t cbAllocated = 5 * size; 1087 int inShift = 0; 1088 Py_ssize_t i = 0; 1089 unsigned int bitsleft = 0; 1090 unsigned long charsleft = 0; 1091 char * out; 1092 char * start; 1093 1094 if (size == 0) 1095 return PyString_FromStringAndSize(NULL, 0); 1096 1097 v = PyString_FromStringAndSize(NULL, cbAllocated); 1098 if (v == NULL) 1099 return NULL; 1100 1101 start = out = PyString_AS_STRING(v); 1102 for (;i < size; ++i) { 1103 Py_UNICODE ch = s[i]; 1104 1105 if (!inShift) { 1106 if (ch == '+') { 1107 *out++ = '+'; 1108 *out++ = '-'; 1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1110 charsleft = ch; 1111 bitsleft = 16; 1112 *out++ = '+'; 1113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1114 inShift = bitsleft > 0; 1115 } else { 1116 *out++ = (char) ch; 1117 } 1118 } else { 1119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1120 *out++ = B64(charsleft << (6-bitsleft)); 1121 charsleft = 0; 1122 bitsleft = 0; 1123 /* Characters not in the BASE64 set implicitly unshift the sequence 1124 so no '-' is required, except if the character is itself a '-' */ 1125 if (B64CHAR(ch) || ch == '-') { 1126 *out++ = '-'; 1127 } 1128 inShift = 0; 1129 *out++ = (char) ch; 1130 } else { 1131 bitsleft += 16; 1132 charsleft = (charsleft << 16) | ch; 1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1134 1135 /* If the next character is special then we dont' need to terminate 1136 the shift sequence. If the next character is not a BASE64 character 1137 or '-' then the shift sequence will be terminated implicitly and we 1138 don't have to insert a '-'. 
*/ 1139 1140 if (bitsleft == 0) { 1141 if (i + 1 < size) { 1142 Py_UNICODE ch2 = s[i+1]; 1143 1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1145 1146 } else if (B64CHAR(ch2) || ch2 == '-') { 1147 *out++ = '-'; 1148 inShift = 0; 1149 } else { 1150 inShift = 0; 1151 } 1152 1153 } 1154 else { 1155 *out++ = '-'; 1156 inShift = 0; 1157 } 1158 } 1159 } 1160 } 1161 } 1162 if (bitsleft) { 1163 *out++= B64(charsleft << (6-bitsleft) ); 1164 *out++ = '-'; 1165 } 1166 1167 _PyString_Resize(&v, out - start); 1168 return v; 1169} 1170 1171#undef SPECIAL 1172#undef B64 1173#undef B64CHAR 1174#undef UB64 1175#undef ENCODE 1176#undef DECODE 1177 1178/* --- UTF-8 Codec -------------------------------------------------------- */ 1179 1180static 1181char utf8_code_length[256] = { 1182 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1183 illegal prefix. see RFC 2279 for details */ 1184 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1188 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1189 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1190 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1191 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1196 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1197 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1198 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1199 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1200}; 1201 1202PyObject *PyUnicode_DecodeUTF8(const char *s, 1203 Py_ssize_t size, 1204 const char *errors) 1205{ 1206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1207} 1208 1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1210 
Py_ssize_t size, 1211 const char *errors, 1212 Py_ssize_t *consumed) 1213{ 1214 const char *starts = s; 1215 int n; 1216 Py_ssize_t startinpos; 1217 Py_ssize_t endinpos; 1218 Py_ssize_t outpos; 1219 const char *e; 1220 PyUnicodeObject *unicode; 1221 Py_UNICODE *p; 1222 const char *errmsg = ""; 1223 PyObject *errorHandler = NULL; 1224 PyObject *exc = NULL; 1225 1226 /* Note: size will always be longer than the resulting Unicode 1227 character count */ 1228 unicode = _PyUnicode_New(size); 1229 if (!unicode) 1230 return NULL; 1231 if (size == 0) { 1232 if (consumed) 1233 *consumed = 0; 1234 return (PyObject *)unicode; 1235 } 1236 1237 /* Unpack UTF-8 encoded data */ 1238 p = unicode->str; 1239 e = s + size; 1240 1241 while (s < e) { 1242 Py_UCS4 ch = (unsigned char)*s; 1243 1244 if (ch < 0x80) { 1245 *p++ = (Py_UNICODE)ch; 1246 s++; 1247 continue; 1248 } 1249 1250 n = utf8_code_length[ch]; 1251 1252 if (s + n > e) { 1253 if (consumed) 1254 break; 1255 else { 1256 errmsg = "unexpected end of data"; 1257 startinpos = s-starts; 1258 endinpos = size; 1259 goto utf8Error; 1260 } 1261 } 1262 1263 switch (n) { 1264 1265 case 0: 1266 errmsg = "unexpected code byte"; 1267 startinpos = s-starts; 1268 endinpos = startinpos+1; 1269 goto utf8Error; 1270 1271 case 1: 1272 errmsg = "internal error"; 1273 startinpos = s-starts; 1274 endinpos = startinpos+1; 1275 goto utf8Error; 1276 1277 case 2: 1278 if ((s[1] & 0xc0) != 0x80) { 1279 errmsg = "invalid data"; 1280 startinpos = s-starts; 1281 endinpos = startinpos+2; 1282 goto utf8Error; 1283 } 1284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1285 if (ch < 0x80) { 1286 startinpos = s-starts; 1287 endinpos = startinpos+2; 1288 errmsg = "illegal encoding"; 1289 goto utf8Error; 1290 } 1291 else 1292 *p++ = (Py_UNICODE)ch; 1293 break; 1294 1295 case 3: 1296 if ((s[1] & 0xc0) != 0x80 || 1297 (s[2] & 0xc0) != 0x80) { 1298 errmsg = "invalid data"; 1299 startinpos = s-starts; 1300 endinpos = startinpos+3; 1301 goto utf8Error; 1302 } 1303 ch = 
((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1304 if (ch < 0x0800) { 1305 /* Note: UTF-8 encodings of surrogates are considered 1306 legal UTF-8 sequences; 1307 1308 XXX For wide builds (UCS-4) we should probably try 1309 to recombine the surrogates into a single code 1310 unit. 1311 */ 1312 errmsg = "illegal encoding"; 1313 startinpos = s-starts; 1314 endinpos = startinpos+3; 1315 goto utf8Error; 1316 } 1317 else 1318 *p++ = (Py_UNICODE)ch; 1319 break; 1320 1321 case 4: 1322 if ((s[1] & 0xc0) != 0x80 || 1323 (s[2] & 0xc0) != 0x80 || 1324 (s[3] & 0xc0) != 0x80) { 1325 errmsg = "invalid data"; 1326 startinpos = s-starts; 1327 endinpos = startinpos+4; 1328 goto utf8Error; 1329 } 1330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1332 /* validate and convert to UTF-16 */ 1333 if ((ch < 0x10000) /* minimum value allowed for 4 1334 byte encoding */ 1335 || (ch > 0x10ffff)) /* maximum value allowed for 1336 UTF-16 */ 1337 { 1338 errmsg = "illegal encoding"; 1339 startinpos = s-starts; 1340 endinpos = startinpos+4; 1341 goto utf8Error; 1342 } 1343#ifdef Py_UNICODE_WIDE 1344 *p++ = (Py_UNICODE)ch; 1345#else 1346 /* compute and append the two surrogates: */ 1347 1348 /* translate from 10000..10FFFF to 0..FFFF */ 1349 ch -= 0x10000; 1350 1351 /* high surrogate = top 10 bits added to D800 */ 1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1353 1354 /* low surrogate = bottom 10 bits added to DC00 */ 1355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1356#endif 1357 break; 1358 1359 default: 1360 /* Other sizes are only needed for UCS-4 */ 1361 errmsg = "unsupported Unicode code range"; 1362 startinpos = s-starts; 1363 endinpos = startinpos+n; 1364 goto utf8Error; 1365 } 1366 s += n; 1367 continue; 1368 1369 utf8Error: 1370 outpos = p-PyUnicode_AS_UNICODE(unicode); 1371 if (unicode_decode_call_errorhandler( 1372 errors, &errorHandler, 1373 "utf8", errmsg, 1374 starts, size, &startinpos, &endinpos, &exc, &s, 1375 
(PyObject **)&unicode, &outpos, &p)) 1376 goto onError; 1377 } 1378 if (consumed) 1379 *consumed = s-starts; 1380 1381 /* Adjust length */ 1382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1383 goto onError; 1384 1385 Py_XDECREF(errorHandler); 1386 Py_XDECREF(exc); 1387 return (PyObject *)unicode; 1388 1389onError: 1390 Py_XDECREF(errorHandler); 1391 Py_XDECREF(exc); 1392 Py_DECREF(unicode); 1393 return NULL; 1394} 1395 1396/* Allocation strategy: if the string is short, convert into a stack buffer 1397 and allocate exactly as much space needed at the end. Else allocate the 1398 maximum possible needed (4 result bytes per Unicode character), and return 1399 the excess memory at the end. 1400*/ 1401PyObject * 1402PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1403 Py_ssize_t size, 1404 const char *errors) 1405{ 1406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1407 1408 Py_ssize_t i; /* index into s of next input byte */ 1409 PyObject *v; /* result string object */ 1410 char *p; /* next free byte in output buffer */ 1411 Py_ssize_t nallocated; /* number of result bytes allocated */ 1412 Py_ssize_t nneeded; /* number of result bytes needed */ 1413 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1414 1415 assert(s != NULL); 1416 assert(size >= 0); 1417 1418 if (size <= MAX_SHORT_UNICHARS) { 1419 /* Write into the stack buffer; nallocated can't overflow. 1420 * At the end, we'll allocate exactly as much heap space as it 1421 * turns out we need. 1422 */ 1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1424 v = NULL; /* will allocate after we're done */ 1425 p = stackbuf; 1426 } 1427 else { 1428 /* Overallocate on the heap, and give the excess back at the end. */ 1429 nallocated = size * 4; 1430 if (nallocated / 4 != size) /* overflow! 
*/ 1431 return PyErr_NoMemory(); 1432 v = PyString_FromStringAndSize(NULL, nallocated); 1433 if (v == NULL) 1434 return NULL; 1435 p = PyString_AS_STRING(v); 1436 } 1437 1438 for (i = 0; i < size;) { 1439 Py_UCS4 ch = s[i++]; 1440 1441 if (ch < 0x80) 1442 /* Encode ASCII */ 1443 *p++ = (char) ch; 1444 1445 else if (ch < 0x0800) { 1446 /* Encode Latin-1 */ 1447 *p++ = (char)(0xc0 | (ch >> 6)); 1448 *p++ = (char)(0x80 | (ch & 0x3f)); 1449 } 1450 else { 1451 /* Encode UCS2 Unicode ordinals */ 1452 if (ch < 0x10000) { 1453 /* Special case: check for high surrogate */ 1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1455 Py_UCS4 ch2 = s[i]; 1456 /* Check for low surrogate and combine the two to 1457 form a UCS4 value */ 1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1460 i++; 1461 goto encodeUCS4; 1462 } 1463 /* Fall through: handles isolated high surrogates */ 1464 } 1465 *p++ = (char)(0xe0 | (ch >> 12)); 1466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1467 *p++ = (char)(0x80 | (ch & 0x3f)); 1468 continue; 1469 } 1470encodeUCS4: 1471 /* Encode UCS4 Unicode ordinals */ 1472 *p++ = (char)(0xf0 | (ch >> 18)); 1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1475 *p++ = (char)(0x80 | (ch & 0x3f)); 1476 } 1477 } 1478 1479 if (v == NULL) { 1480 /* This was stack allocated. */ 1481 nneeded = p - stackbuf; 1482 assert(nneeded <= nallocated); 1483 v = PyString_FromStringAndSize(stackbuf, nneeded); 1484 } 1485 else { 1486 /* Cut back to size actually needed. 
*/ 1487 nneeded = p - PyString_AS_STRING(v); 1488 assert(nneeded <= nallocated); 1489 _PyString_Resize(&v, nneeded); 1490 } 1491 return v; 1492 1493#undef MAX_SHORT_UNICHARS 1494} 1495 1496PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1497{ 1498 if (!PyUnicode_Check(unicode)) { 1499 PyErr_BadArgument(); 1500 return NULL; 1501 } 1502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1503 PyUnicode_GET_SIZE(unicode), 1504 NULL); 1505} 1506 1507/* --- UTF-16 Codec ------------------------------------------------------- */ 1508 1509PyObject * 1510PyUnicode_DecodeUTF16(const char *s, 1511 Py_ssize_t size, 1512 const char *errors, 1513 int *byteorder) 1514{ 1515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 1516} 1517 1518PyObject * 1519PyUnicode_DecodeUTF16Stateful(const char *s, 1520 Py_ssize_t size, 1521 const char *errors, 1522 int *byteorder, 1523 Py_ssize_t *consumed) 1524{ 1525 const char *starts = s; 1526 Py_ssize_t startinpos; 1527 Py_ssize_t endinpos; 1528 Py_ssize_t outpos; 1529 PyUnicodeObject *unicode; 1530 Py_UNICODE *p; 1531 const unsigned char *q, *e; 1532 int bo = 0; /* assume native ordering by default */ 1533 const char *errmsg = ""; 1534 /* Offsets from q for retrieving byte pairs in the right order. */ 1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1536 int ihi = 1, ilo = 0; 1537#else 1538 int ihi = 0, ilo = 1; 1539#endif 1540 PyObject *errorHandler = NULL; 1541 PyObject *exc = NULL; 1542 1543 /* Note: size will always be longer than the resulting Unicode 1544 character count */ 1545 unicode = _PyUnicode_New(size); 1546 if (!unicode) 1547 return NULL; 1548 if (size == 0) 1549 return (PyObject *)unicode; 1550 1551 /* Unpack UTF-16 encoded data */ 1552 p = unicode->str; 1553 q = (unsigned char *)s; 1554 e = q + size; 1555 1556 if (byteorder) 1557 bo = *byteorder; 1558 1559 /* Check for BOM marks (U+FEFF) in the input and adjust current 1560 byte order setting accordingly. 
In native mode, the leading BOM 1561 mark is skipped, in all other modes, it is copied to the output 1562 stream as-is (giving a ZWNBSP character). */ 1563 if (bo == 0) { 1564 if (size >= 2) { 1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1566#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1567 if (bom == 0xFEFF) { 1568 q += 2; 1569 bo = -1; 1570 } 1571 else if (bom == 0xFFFE) { 1572 q += 2; 1573 bo = 1; 1574 } 1575#else 1576 if (bom == 0xFEFF) { 1577 q += 2; 1578 bo = 1; 1579 } 1580 else if (bom == 0xFFFE) { 1581 q += 2; 1582 bo = -1; 1583 } 1584#endif 1585 } 1586 } 1587 1588 if (bo == -1) { 1589 /* force LE */ 1590 ihi = 1; 1591 ilo = 0; 1592 } 1593 else if (bo == 1) { 1594 /* force BE */ 1595 ihi = 0; 1596 ilo = 1; 1597 } 1598 1599 while (q < e) { 1600 Py_UNICODE ch; 1601 /* remaining bytes at the end? (size should be even) */ 1602 if (e-q<2) { 1603 if (consumed) 1604 break; 1605 errmsg = "truncated data"; 1606 startinpos = ((const char *)q)-starts; 1607 endinpos = ((const char *)e)-starts; 1608 goto utf16Error; 1609 /* The remaining input chars are ignored if the callback 1610 chooses to skip the input */ 1611 } 1612 ch = (q[ihi] << 8) | q[ilo]; 1613 1614 q += 2; 1615 1616 if (ch < 0xD800 || ch > 0xDFFF) { 1617 *p++ = ch; 1618 continue; 1619 } 1620 1621 /* UTF-16 code pair: */ 1622 if (q >= e) { 1623 errmsg = "unexpected end of data"; 1624 startinpos = (((const char *)q)-2)-starts; 1625 endinpos = ((const char *)e)-starts; 1626 goto utf16Error; 1627 } 1628 if (0xD800 <= ch && ch <= 0xDBFF) { 1629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1630 q += 2; 1631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1632#ifndef Py_UNICODE_WIDE 1633 *p++ = ch; 1634 *p++ = ch2; 1635#else 1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1637#endif 1638 continue; 1639 } 1640 else { 1641 errmsg = "illegal UTF-16 surrogate"; 1642 startinpos = (((const char *)q)-4)-starts; 1643 endinpos = startinpos+2; 1644 goto utf16Error; 1645 } 1646 1647 } 1648 errmsg = "illegal encoding"; 1649 
startinpos = (((const char *)q)-2)-starts; 1650 endinpos = startinpos+2; 1651 /* Fall through to report the error */ 1652 1653 utf16Error: 1654 outpos = p-PyUnicode_AS_UNICODE(unicode); 1655 if (unicode_decode_call_errorhandler( 1656 errors, &errorHandler, 1657 "utf16", errmsg, 1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 1659 (PyObject **)&unicode, &outpos, &p)) 1660 goto onError; 1661 } 1662 1663 if (byteorder) 1664 *byteorder = bo; 1665 1666 if (consumed) 1667 *consumed = (const char *)q-starts; 1668 1669 /* Adjust length */ 1670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1671 goto onError; 1672 1673 Py_XDECREF(errorHandler); 1674 Py_XDECREF(exc); 1675 return (PyObject *)unicode; 1676 1677onError: 1678 Py_DECREF(unicode); 1679 Py_XDECREF(errorHandler); 1680 Py_XDECREF(exc); 1681 return NULL; 1682} 1683 1684PyObject * 1685PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1686 Py_ssize_t size, 1687 const char *errors, 1688 int byteorder) 1689{ 1690 PyObject *v; 1691 unsigned char *p; 1692#ifdef Py_UNICODE_WIDE 1693 int i, pairs; 1694#else 1695 const int pairs = 0; 1696#endif 1697 /* Offsets from p for storing byte pairs in the right order. 
*/ 1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1699 int ihi = 1, ilo = 0; 1700#else 1701 int ihi = 0, ilo = 1; 1702#endif 1703 1704#define STORECHAR(CH) \ 1705 do { \ 1706 p[ihi] = ((CH) >> 8) & 0xff; \ 1707 p[ilo] = (CH) & 0xff; \ 1708 p += 2; \ 1709 } while(0) 1710 1711#ifdef Py_UNICODE_WIDE 1712 for (i = pairs = 0; i < size; i++) 1713 if (s[i] >= 0x10000) 1714 pairs++; 1715#endif 1716 v = PyString_FromStringAndSize(NULL, 1717 2 * (size + pairs + (byteorder == 0))); 1718 if (v == NULL) 1719 return NULL; 1720 1721 p = (unsigned char *)PyString_AS_STRING(v); 1722 if (byteorder == 0) 1723 STORECHAR(0xFEFF); 1724 if (size == 0) 1725 return v; 1726 1727 if (byteorder == -1) { 1728 /* force LE */ 1729 ihi = 1; 1730 ilo = 0; 1731 } 1732 else if (byteorder == 1) { 1733 /* force BE */ 1734 ihi = 0; 1735 ilo = 1; 1736 } 1737 1738 while (size-- > 0) { 1739 Py_UNICODE ch = *s++; 1740 Py_UNICODE ch2 = 0; 1741#ifdef Py_UNICODE_WIDE 1742 if (ch >= 0x10000) { 1743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1744 ch = 0xD800 | ((ch-0x10000) >> 10); 1745 } 1746#endif 1747 STORECHAR(ch); 1748 if (ch2) 1749 STORECHAR(ch2); 1750 } 1751 return v; 1752#undef STORECHAR 1753} 1754 1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1756{ 1757 if (!PyUnicode_Check(unicode)) { 1758 PyErr_BadArgument(); 1759 return NULL; 1760 } 1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1762 PyUnicode_GET_SIZE(unicode), 1763 NULL, 1764 0); 1765} 1766 1767/* --- Unicode Escape Codec ----------------------------------------------- */ 1768 1769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1770 1771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1772 Py_ssize_t size, 1773 const char *errors) 1774{ 1775 const char *starts = s; 1776 Py_ssize_t startinpos; 1777 Py_ssize_t endinpos; 1778 Py_ssize_t outpos; 1779 int i; 1780 PyUnicodeObject *v; 1781 Py_UNICODE *p; 1782 const char *end; 1783 char* message; 1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1785 PyObject 
*errorHandler = NULL; 1786 PyObject *exc = NULL; 1787 1788 /* Escaped strings will always be longer than the resulting 1789 Unicode string, so we start with size here and then reduce the 1790 length after conversion to the true value. 1791 (but if the error callback returns a long replacement string 1792 we'll have to allocate more space) */ 1793 v = _PyUnicode_New(size); 1794 if (v == NULL) 1795 goto onError; 1796 if (size == 0) 1797 return (PyObject *)v; 1798 1799 p = PyUnicode_AS_UNICODE(v); 1800 end = s + size; 1801 1802 while (s < end) { 1803 unsigned char c; 1804 Py_UNICODE x; 1805 int digits; 1806 1807 /* Non-escape characters are interpreted as Unicode ordinals */ 1808 if (*s != '\\') { 1809 *p++ = (unsigned char) *s++; 1810 continue; 1811 } 1812 1813 startinpos = s-starts; 1814 /* \ - Escapes */ 1815 s++; 1816 switch (*s++) { 1817 1818 /* \x escapes */ 1819 case '\n': break; 1820 case '\\': *p++ = '\\'; break; 1821 case '\'': *p++ = '\''; break; 1822 case '\"': *p++ = '\"'; break; 1823 case 'b': *p++ = '\b'; break; 1824 case 'f': *p++ = '\014'; break; /* FF */ 1825 case 't': *p++ = '\t'; break; 1826 case 'n': *p++ = '\n'; break; 1827 case 'r': *p++ = '\r'; break; 1828 case 'v': *p++ = '\013'; break; /* VT */ 1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1830 1831 /* \OOO (octal) escapes */ 1832 case '0': case '1': case '2': case '3': 1833 case '4': case '5': case '6': case '7': 1834 x = s[-1] - '0'; 1835 if ('0' <= *s && *s <= '7') { 1836 x = (x<<3) + *s++ - '0'; 1837 if ('0' <= *s && *s <= '7') 1838 x = (x<<3) + *s++ - '0'; 1839 } 1840 *p++ = x; 1841 break; 1842 1843 /* hex escapes */ 1844 /* \xXX */ 1845 case 'x': 1846 digits = 2; 1847 message = "truncated \\xXX escape"; 1848 goto hexescape; 1849 1850 /* \uXXXX */ 1851 case 'u': 1852 digits = 4; 1853 message = "truncated \\uXXXX escape"; 1854 goto hexescape; 1855 1856 /* \UXXXXXXXX */ 1857 case 'U': 1858 digits = 8; 1859 message = "truncated \\UXXXXXXXX escape"; 1860 hexescape: 1861 chr = 
0; 1862 outpos = p-PyUnicode_AS_UNICODE(v); 1863 if (s+digits>end) { 1864 endinpos = size; 1865 if (unicode_decode_call_errorhandler( 1866 errors, &errorHandler, 1867 "unicodeescape", "end of string in escape sequence", 1868 starts, size, &startinpos, &endinpos, &exc, &s, 1869 (PyObject **)&v, &outpos, &p)) 1870 goto onError; 1871 goto nextByte; 1872 } 1873 for (i = 0; i < digits; ++i) { 1874 c = (unsigned char) s[i]; 1875 if (!isxdigit(c)) { 1876 endinpos = (s+i+1)-starts; 1877 if (unicode_decode_call_errorhandler( 1878 errors, &errorHandler, 1879 "unicodeescape", message, 1880 starts, size, &startinpos, &endinpos, &exc, &s, 1881 (PyObject **)&v, &outpos, &p)) 1882 goto onError; 1883 goto nextByte; 1884 } 1885 chr = (chr<<4) & ~0xF; 1886 if (c >= '0' && c <= '9') 1887 chr += c - '0'; 1888 else if (c >= 'a' && c <= 'f') 1889 chr += 10 + c - 'a'; 1890 else 1891 chr += 10 + c - 'A'; 1892 } 1893 s += i; 1894 if (chr == 0xffffffff && PyErr_Occurred()) 1895 /* _decoding_error will have already written into the 1896 target buffer. */ 1897 break; 1898 store: 1899 /* when we get here, chr is a 32-bit unicode character */ 1900 if (chr <= 0xffff) 1901 /* UCS-2 character */ 1902 *p++ = (Py_UNICODE) chr; 1903 else if (chr <= 0x10ffff) { 1904 /* UCS-4 character. Either store directly, or as 1905 surrogate pair. 
*/ 1906#ifdef Py_UNICODE_WIDE 1907 *p++ = chr; 1908#else 1909 chr -= 0x10000L; 1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1912#endif 1913 } else { 1914 endinpos = s-starts; 1915 outpos = p-PyUnicode_AS_UNICODE(v); 1916 if (unicode_decode_call_errorhandler( 1917 errors, &errorHandler, 1918 "unicodeescape", "illegal Unicode character", 1919 starts, size, &startinpos, &endinpos, &exc, &s, 1920 (PyObject **)&v, &outpos, &p)) 1921 goto onError; 1922 } 1923 break; 1924 1925 /* \N{name} */ 1926 case 'N': 1927 message = "malformed \\N character escape"; 1928 if (ucnhash_CAPI == NULL) { 1929 /* load the unicode data module */ 1930 PyObject *m, *api; 1931 m = PyImport_ImportModule("unicodedata"); 1932 if (m == NULL) 1933 goto ucnhashError; 1934 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1935 Py_DECREF(m); 1936 if (api == NULL) 1937 goto ucnhashError; 1938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 1939 Py_DECREF(api); 1940 if (ucnhash_CAPI == NULL) 1941 goto ucnhashError; 1942 } 1943 if (*s == '{') { 1944 const char *start = s+1; 1945 /* look for the closing brace */ 1946 while (*s != '}' && s < end) 1947 s++; 1948 if (s > start && s < end && *s == '}') { 1949 /* found a name. 
look it up in the unicode database */ 1950 message = "unknown Unicode character name"; 1951 s++; 1952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 1953 goto store; 1954 } 1955 } 1956 endinpos = s-starts; 1957 outpos = p-PyUnicode_AS_UNICODE(v); 1958 if (unicode_decode_call_errorhandler( 1959 errors, &errorHandler, 1960 "unicodeescape", message, 1961 starts, size, &startinpos, &endinpos, &exc, &s, 1962 (PyObject **)&v, &outpos, &p)) 1963 goto onError; 1964 break; 1965 1966 default: 1967 if (s > end) { 1968 message = "\\ at end of string"; 1969 s--; 1970 endinpos = s-starts; 1971 outpos = p-PyUnicode_AS_UNICODE(v); 1972 if (unicode_decode_call_errorhandler( 1973 errors, &errorHandler, 1974 "unicodeescape", message, 1975 starts, size, &startinpos, &endinpos, &exc, &s, 1976 (PyObject **)&v, &outpos, &p)) 1977 goto onError; 1978 } 1979 else { 1980 *p++ = '\\'; 1981 *p++ = (unsigned char)s[-1]; 1982 } 1983 break; 1984 } 1985 nextByte: 1986 ; 1987 } 1988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 1989 goto onError; 1990 Py_XDECREF(errorHandler); 1991 Py_XDECREF(exc); 1992 return (PyObject *)v; 1993 1994ucnhashError: 1995 PyErr_SetString( 1996 PyExc_UnicodeError, 1997 "\\N escapes not supported (can't load unicodedata module)" 1998 ); 1999 Py_XDECREF(v); 2000 Py_XDECREF(errorHandler); 2001 Py_XDECREF(exc); 2002 return NULL; 2003 2004onError: 2005 Py_XDECREF(v); 2006 Py_XDECREF(errorHandler); 2007 Py_XDECREF(exc); 2008 return NULL; 2009} 2010 2011/* Return a Unicode-Escape string version of the Unicode object. 2012 2013 If quotes is true, the string is enclosed in u"" or u'' quotes as 2014 appropriate. 
2015 2016*/ 2017 2018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2019 Py_ssize_t size, 2020 Py_UNICODE ch) 2021{ 2022 /* like wcschr, but doesn't stop at NULL characters */ 2023 2024 while (size-- > 0) { 2025 if (*s == ch) 2026 return s; 2027 s++; 2028 } 2029 2030 return NULL; 2031} 2032 2033static 2034PyObject *unicodeescape_string(const Py_UNICODE *s, 2035 Py_ssize_t size, 2036 int quotes) 2037{ 2038 PyObject *repr; 2039 char *p; 2040 2041 static const char *hexdigit = "0123456789abcdef"; 2042 2043 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 2044 if (repr == NULL) 2045 return NULL; 2046 2047 p = PyString_AS_STRING(repr); 2048 2049 if (quotes) { 2050 *p++ = 'u'; 2051 *p++ = (findchar(s, size, '\'') && 2052 !findchar(s, size, '"')) ? '"' : '\''; 2053 } 2054 while (size-- > 0) { 2055 Py_UNICODE ch = *s++; 2056 2057 /* Escape quotes and backslashes */ 2058 if ((quotes && 2059 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { 2060 *p++ = '\\'; 2061 *p++ = (char) ch; 2062 continue; 2063 } 2064 2065#ifdef Py_UNICODE_WIDE 2066 /* Map 21-bit characters to '\U00xxxxxx' */ 2067 else if (ch >= 0x10000) { 2068 Py_ssize_t offset = p - PyString_AS_STRING(repr); 2069 2070 /* Resize the string if necessary */ 2071 if (offset + 12 > PyString_GET_SIZE(repr)) { 2072 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 2073 return NULL; 2074 p = PyString_AS_STRING(repr) + offset; 2075 } 2076 2077 *p++ = '\\'; 2078 *p++ = 'U'; 2079 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 2080 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 2081 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 2082 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 2083 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 2084 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 2085 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 2086 *p++ = hexdigit[ch & 0x0000000F]; 2087 continue; 2088 } 2089#endif 2090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 2091 else if (ch >= 0xD800 && ch < 0xDC00) { 
2092 Py_UNICODE ch2; 2093 Py_UCS4 ucs; 2094 2095 ch2 = *s++; 2096 size--; 2097 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 2098 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 2099 *p++ = '\\'; 2100 *p++ = 'U'; 2101 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 2102 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 2103 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 2104 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 2105 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 2106 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 2107 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 2108 *p++ = hexdigit[ucs & 0x0000000F]; 2109 continue; 2110 } 2111 /* Fall through: isolated surrogates are copied as-is */ 2112 s--; 2113 size++; 2114 } 2115 2116 /* Map 16-bit characters to '\uxxxx' */ 2117 if (ch >= 256) { 2118 *p++ = '\\'; 2119 *p++ = 'u'; 2120 *p++ = hexdigit[(ch >> 12) & 0x000F]; 2121 *p++ = hexdigit[(ch >> 8) & 0x000F]; 2122 *p++ = hexdigit[(ch >> 4) & 0x000F]; 2123 *p++ = hexdigit[ch & 0x000F]; 2124 } 2125 2126 /* Map special whitespace to '\t', \n', '\r' */ 2127 else if (ch == '\t') { 2128 *p++ = '\\'; 2129 *p++ = 't'; 2130 } 2131 else if (ch == '\n') { 2132 *p++ = '\\'; 2133 *p++ = 'n'; 2134 } 2135 else if (ch == '\r') { 2136 *p++ = '\\'; 2137 *p++ = 'r'; 2138 } 2139 2140 /* Map non-printable US ASCII to '\xhh' */ 2141 else if (ch < ' ' || ch >= 0x7F) { 2142 *p++ = '\\'; 2143 *p++ = 'x'; 2144 *p++ = hexdigit[(ch >> 4) & 0x000F]; 2145 *p++ = hexdigit[ch & 0x000F]; 2146 } 2147 2148 /* Copy everything else as-is */ 2149 else 2150 *p++ = (char) ch; 2151 } 2152 if (quotes) 2153 *p++ = PyString_AS_STRING(repr)[1]; 2154 2155 *p = '\0'; 2156 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 2157 return repr; 2158} 2159 2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2161 Py_ssize_t size) 2162{ 2163 return unicodeescape_string(s, size, 0); 2164} 2165 2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 2167{ 2168 if (!PyUnicode_Check(unicode)) { 2169 PyErr_BadArgument(); 
2170 return NULL; 2171 } 2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2173 PyUnicode_GET_SIZE(unicode)); 2174} 2175 2176/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2177 2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2179 Py_ssize_t size, 2180 const char *errors) 2181{ 2182 const char *starts = s; 2183 Py_ssize_t startinpos; 2184 Py_ssize_t endinpos; 2185 Py_ssize_t outpos; 2186 PyUnicodeObject *v; 2187 Py_UNICODE *p; 2188 const char *end; 2189 const char *bs; 2190 PyObject *errorHandler = NULL; 2191 PyObject *exc = NULL; 2192 2193 /* Escaped strings will always be longer than the resulting 2194 Unicode string, so we start with size here and then reduce the 2195 length after conversion to the true value. (But decoding error 2196 handler might have to resize the string) */ 2197 v = _PyUnicode_New(size); 2198 if (v == NULL) 2199 goto onError; 2200 if (size == 0) 2201 return (PyObject *)v; 2202 p = PyUnicode_AS_UNICODE(v); 2203 end = s + size; 2204 while (s < end) { 2205 unsigned char c; 2206 Py_UCS4 x; 2207 int i; 2208 int count; 2209 2210 /* Non-escape characters are interpreted as Unicode ordinals */ 2211 if (*s != '\\') { 2212 *p++ = (unsigned char)*s++; 2213 continue; 2214 } 2215 startinpos = s-starts; 2216 2217 /* \u-escapes are only interpreted iff the number of leading 2218 backslashes if odd */ 2219 bs = s; 2220 for (;s < end;) { 2221 if (*s != '\\') 2222 break; 2223 *p++ = (unsigned char)*s++; 2224 } 2225 if (((s - bs) & 1) == 0 || 2226 s >= end || 2227 (*s != 'u' && *s != 'U')) { 2228 continue; 2229 } 2230 p--; 2231 count = *s=='u' ? 
4 : 8; 2232 s++; 2233 2234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 2235 outpos = p-PyUnicode_AS_UNICODE(v); 2236 for (x = 0, i = 0; i < count; ++i, ++s) { 2237 c = (unsigned char)*s; 2238 if (!isxdigit(c)) { 2239 endinpos = s-starts; 2240 if (unicode_decode_call_errorhandler( 2241 errors, &errorHandler, 2242 "rawunicodeescape", "truncated \\uXXXX", 2243 starts, size, &startinpos, &endinpos, &exc, &s, 2244 (PyObject **)&v, &outpos, &p)) 2245 goto onError; 2246 goto nextByte; 2247 } 2248 x = (x<<4) & ~0xF; 2249 if (c >= '0' && c <= '9') 2250 x += c - '0'; 2251 else if (c >= 'a' && c <= 'f') 2252 x += 10 + c - 'a'; 2253 else 2254 x += 10 + c - 'A'; 2255 } 2256#ifndef Py_UNICODE_WIDE 2257 if (x > 0x10000) { 2258 if (unicode_decode_call_errorhandler( 2259 errors, &errorHandler, 2260 "rawunicodeescape", "\\Uxxxxxxxx out of range", 2261 starts, size, &startinpos, &endinpos, &exc, &s, 2262 (PyObject **)&v, &outpos, &p)) 2263 goto onError; 2264 } 2265#endif 2266 *p++ = x; 2267 nextByte: 2268 ; 2269 } 2270 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2271 goto onError; 2272 Py_XDECREF(errorHandler); 2273 Py_XDECREF(exc); 2274 return (PyObject *)v; 2275 2276 onError: 2277 Py_XDECREF(v); 2278 Py_XDECREF(errorHandler); 2279 Py_XDECREF(exc); 2280 return NULL; 2281} 2282 2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2284 Py_ssize_t size) 2285{ 2286 PyObject *repr; 2287 char *p; 2288 char *q; 2289 2290 static const char *hexdigit = "0123456789abcdef"; 2291 2292#ifdef Py_UNICODE_WIDE 2293 repr = PyString_FromStringAndSize(NULL, 10 * size); 2294#else 2295 repr = PyString_FromStringAndSize(NULL, 6 * size); 2296#endif 2297 if (repr == NULL) 2298 return NULL; 2299 if (size == 0) 2300 return repr; 2301 2302 p = q = PyString_AS_STRING(repr); 2303 while (size-- > 0) { 2304 Py_UNICODE ch = *s++; 2305#ifdef Py_UNICODE_WIDE 2306 /* Map 32-bit characters to '\Uxxxxxxxx' */ 2307 if (ch >= 0x10000) { 2308 *p++ = '\\'; 2309 *p++ = 'U'; 2310 *p++ = 
hexdigit[(ch >> 28) & 0xf]; 2311 *p++ = hexdigit[(ch >> 24) & 0xf]; 2312 *p++ = hexdigit[(ch >> 20) & 0xf]; 2313 *p++ = hexdigit[(ch >> 16) & 0xf]; 2314 *p++ = hexdigit[(ch >> 12) & 0xf]; 2315 *p++ = hexdigit[(ch >> 8) & 0xf]; 2316 *p++ = hexdigit[(ch >> 4) & 0xf]; 2317 *p++ = hexdigit[ch & 15]; 2318 } 2319 else 2320#endif 2321 /* Map 16-bit characters to '\uxxxx' */ 2322 if (ch >= 256) { 2323 *p++ = '\\'; 2324 *p++ = 'u'; 2325 *p++ = hexdigit[(ch >> 12) & 0xf]; 2326 *p++ = hexdigit[(ch >> 8) & 0xf]; 2327 *p++ = hexdigit[(ch >> 4) & 0xf]; 2328 *p++ = hexdigit[ch & 15]; 2329 } 2330 /* Copy everything else as-is */ 2331 else 2332 *p++ = (char) ch; 2333 } 2334 *p = '\0'; 2335 _PyString_Resize(&repr, p - q); 2336 return repr; 2337} 2338 2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2340{ 2341 if (!PyUnicode_Check(unicode)) { 2342 PyErr_BadArgument(); 2343 return NULL; 2344 } 2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2346 PyUnicode_GET_SIZE(unicode)); 2347} 2348 2349/* --- Unicode Internal Codec ------------------------------------------- */ 2350 2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 2352 Py_ssize_t size, 2353 const char *errors) 2354{ 2355 const char *starts = s; 2356 Py_ssize_t startinpos; 2357 Py_ssize_t endinpos; 2358 Py_ssize_t outpos; 2359 PyUnicodeObject *v; 2360 Py_UNICODE *p; 2361 const char *end; 2362 const char *reason; 2363 PyObject *errorHandler = NULL; 2364 PyObject *exc = NULL; 2365 2366#ifdef Py_UNICODE_WIDE 2367 Py_UNICODE unimax = PyUnicode_GetMax(); 2368#endif 2369 2370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 2371 if (v == NULL) 2372 goto onError; 2373 if (PyUnicode_GetSize((PyObject *)v) == 0) 2374 return (PyObject *)v; 2375 p = PyUnicode_AS_UNICODE(v); 2376 end = s + size; 2377 2378 while (s < end) { 2379 memcpy(p, s, sizeof(Py_UNICODE)); 2380 /* We have to sanity check the raw data, otherwise doom looms for 2381 some malformed UCS-4 data. 
*/ 2382 if ( 2383 #ifdef Py_UNICODE_WIDE 2384 *p > unimax || *p < 0 || 2385 #endif 2386 end-s < Py_UNICODE_SIZE 2387 ) 2388 { 2389 startinpos = s - starts; 2390 if (end-s < Py_UNICODE_SIZE) { 2391 endinpos = end-starts; 2392 reason = "truncated input"; 2393 } 2394 else { 2395 endinpos = s - starts + Py_UNICODE_SIZE; 2396 reason = "illegal code point (> 0x10FFFF)"; 2397 } 2398 outpos = p - PyUnicode_AS_UNICODE(v); 2399 if (unicode_decode_call_errorhandler( 2400 errors, &errorHandler, 2401 "unicode_internal", reason, 2402 starts, size, &startinpos, &endinpos, &exc, &s, 2403 (PyObject **)&v, &outpos, &p)) { 2404 goto onError; 2405 } 2406 } 2407 else { 2408 p++; 2409 s += Py_UNICODE_SIZE; 2410 } 2411 } 2412 2413 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2414 goto onError; 2415 Py_XDECREF(errorHandler); 2416 Py_XDECREF(exc); 2417 return (PyObject *)v; 2418 2419 onError: 2420 Py_XDECREF(v); 2421 Py_XDECREF(errorHandler); 2422 Py_XDECREF(exc); 2423 return NULL; 2424} 2425 2426/* --- Latin-1 Codec ------------------------------------------------------ */ 2427 2428PyObject *PyUnicode_DecodeLatin1(const char *s, 2429 Py_ssize_t size, 2430 const char *errors) 2431{ 2432 PyUnicodeObject *v; 2433 Py_UNICODE *p; 2434 2435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 2436 if (size == 1) { 2437 Py_UNICODE r = *(unsigned char*)s; 2438 return PyUnicode_FromUnicode(&r, 1); 2439 } 2440 2441 v = _PyUnicode_New(size); 2442 if (v == NULL) 2443 goto onError; 2444 if (size == 0) 2445 return (PyObject *)v; 2446 p = PyUnicode_AS_UNICODE(v); 2447 while (size-- > 0) 2448 *p++ = (unsigned char)*s++; 2449 return (PyObject *)v; 2450 2451 onError: 2452 Py_XDECREF(v); 2453 return NULL; 2454} 2455 2456/* create or adjust a UnicodeEncodeError */ 2457static void make_encode_exception(PyObject **exceptionObject, 2458 const char *encoding, 2459 const Py_UNICODE *unicode, Py_ssize_t size, 2460 Py_ssize_t startpos, Py_ssize_t endpos, 2461 const char *reason) 2462{ 2463 if (*exceptionObject == NULL) { 2464 *exceptionObject = PyUnicodeEncodeError_Create( 2465 encoding, unicode, size, startpos, endpos, reason); 2466 } 2467 else { 2468 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2469 goto onError; 2470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2471 goto onError; 2472 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2473 goto onError; 2474 return; 2475 onError: 2476 Py_DECREF(*exceptionObject); 2477 *exceptionObject = NULL; 2478 } 2479} 2480 2481/* raises a UnicodeEncodeError */ 2482static void raise_encode_exception(PyObject **exceptionObject, 2483 const char *encoding, 2484 const Py_UNICODE *unicode, Py_ssize_t size, 2485 Py_ssize_t startpos, Py_ssize_t endpos, 2486 const char *reason) 2487{ 2488 make_encode_exception(exceptionObject, 2489 encoding, unicode, size, startpos, endpos, reason); 2490 if (*exceptionObject != NULL) 2491 PyCodec_StrictErrors(*exceptionObject); 2492} 2493 2494/* error handling callback helper: 2495 build arguments, call the callback and check the arguments, 2496 put the result into newpos and return the replacement string, which 2497 has to be freed by the caller */ 2498static PyObject *unicode_encode_call_errorhandler(const char *errors, 2499 PyObject **errorHandler, 2500 const 
char *encoding, const char *reason, 2501 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 2502 Py_ssize_t startpos, Py_ssize_t endpos, 2503 Py_ssize_t *newpos) 2504{ 2505 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 2506 2507 PyObject *restuple; 2508 PyObject *resunicode; 2509 2510 if (*errorHandler == NULL) { 2511 *errorHandler = PyCodec_LookupError(errors); 2512 if (*errorHandler == NULL) 2513 return NULL; 2514 } 2515 2516 make_encode_exception(exceptionObject, 2517 encoding, unicode, size, startpos, endpos, reason); 2518 if (*exceptionObject == NULL) 2519 return NULL; 2520 2521 restuple = PyObject_CallFunctionObjArgs( 2522 *errorHandler, *exceptionObject, NULL); 2523 if (restuple == NULL) 2524 return NULL; 2525 if (!PyTuple_Check(restuple)) { 2526 PyErr_Format(PyExc_TypeError, &argparse[4]); 2527 Py_DECREF(restuple); 2528 return NULL; 2529 } 2530 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 2531 &resunicode, newpos)) { 2532 Py_DECREF(restuple); 2533 return NULL; 2534 } 2535 if (*newpos<0) 2536 *newpos = size+*newpos; 2537 if (*newpos<0 || *newpos>size) { 2538 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 2539 Py_DECREF(restuple); 2540 return NULL; 2541 } 2542 Py_INCREF(resunicode); 2543 Py_DECREF(restuple); 2544 return resunicode; 2545} 2546 2547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 2548 Py_ssize_t size, 2549 const char *errors, 2550 int limit) 2551{ 2552 /* output object */ 2553 PyObject *res; 2554 /* pointers to the beginning and end+1 of input */ 2555 const Py_UNICODE *startp = p; 2556 const Py_UNICODE *endp = p + size; 2557 /* pointer to the beginning of the unencodable characters */ 2558 /* const Py_UNICODE *badp = NULL; */ 2559 /* pointer into the output */ 2560 char *str; 2561 /* current output position */ 2562 Py_ssize_t respos = 0; 2563 Py_ssize_t ressize; 2564 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 2565 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 2566 PyObject *errorHandler = NULL; 2567 PyObject *exc = NULL; 2568 /* the following variable is used for caching string comparisons 2569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 2570 int known_errorHandler = -1; 2571 2572 /* allocate enough for a simple encoding without 2573 replacements, if we need more, we'll resize */ 2574 res = PyString_FromStringAndSize(NULL, size); 2575 if (res == NULL) 2576 goto onError; 2577 if (size == 0) 2578 return res; 2579 str = PyString_AS_STRING(res); 2580 ressize = size; 2581 2582 while (p<endp) { 2583 Py_UNICODE c = *p; 2584 2585 /* can we encode this? */ 2586 if (c<limit) { 2587 /* no overflow check, because we know that the space is enough */ 2588 *str++ = (char)c; 2589 ++p; 2590 } 2591 else { 2592 Py_ssize_t unicodepos = p-startp; 2593 Py_ssize_t requiredsize; 2594 PyObject *repunicode; 2595 Py_ssize_t repsize; 2596 Py_ssize_t newpos; 2597 Py_ssize_t respos; 2598 Py_UNICODE *uni2; 2599 /* startpos for collecting unencodable chars */ 2600 const Py_UNICODE *collstart = p; 2601 const Py_UNICODE *collend = p; 2602 /* find all unecodable characters */ 2603 while ((collend < endp) && ((*collend)>=limit)) 2604 ++collend; 2605 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 2606 if (known_errorHandler==-1) { 2607 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2608 known_errorHandler = 1; 2609 else if (!strcmp(errors, "replace")) 2610 known_errorHandler = 2; 2611 else if (!strcmp(errors, "ignore")) 2612 known_errorHandler = 3; 2613 else if (!strcmp(errors, "xmlcharrefreplace")) 2614 known_errorHandler = 4; 2615 else 2616 known_errorHandler = 0; 2617 } 2618 switch (known_errorHandler) { 2619 case 1: /* strict */ 2620 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 2621 goto onError; 2622 case 2: /* replace */ 2623 while (collstart++<collend) 2624 *str++ = '?'; /* fall through */ 2625 case 3: /* ignore */ 2626 p = collend; 2627 break; 2628 case 4: /* xmlcharrefreplace */ 2629 respos = str-PyString_AS_STRING(res); 2630 /* determine replacement size (temporarily (mis)uses p) */ 2631 for (p = collstart, repsize = 0; p < collend; ++p) { 2632 if (*p<10) 2633 repsize += 2+1+1; 2634 else if (*p<100) 2635 repsize += 2+2+1; 2636 else if (*p<1000) 2637 repsize += 2+3+1; 2638 else if (*p<10000) 2639 repsize += 2+4+1; 2640#ifndef Py_UNICODE_WIDE 2641 else 2642 repsize += 2+5+1; 2643#else 2644 else if (*p<100000) 2645 repsize += 2+5+1; 2646 else if (*p<1000000) 2647 repsize += 2+6+1; 2648 else 2649 repsize += 2+7+1; 2650#endif 2651 } 2652 requiredsize = respos+repsize+(endp-collend); 2653 if (requiredsize > ressize) { 2654 if (requiredsize<2*ressize) 2655 requiredsize = 2*ressize; 2656 if (_PyString_Resize(&res, requiredsize)) 2657 goto onError; 2658 str = PyString_AS_STRING(res) + respos; 2659 ressize = requiredsize; 2660 } 2661 /* generate replacement (temporarily (mis)uses p) */ 2662 for (p = collstart; p < collend; ++p) { 2663 str += sprintf(str, "&#%d;", (int)*p); 2664 } 2665 p = collend; 2666 break; 2667 default: 2668 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2669 encoding, reason, startp, size, &exc, 2670 collstart-startp, collend-startp, 
&newpos); 2671 if (repunicode == NULL) 2672 goto onError; 2673 /* need more space? (at least enough for what we 2674 have+the replacement+the rest of the string, so 2675 we won't have to check space for encodable characters) */ 2676 respos = str-PyString_AS_STRING(res); 2677 repsize = PyUnicode_GET_SIZE(repunicode); 2678 requiredsize = respos+repsize+(endp-collend); 2679 if (requiredsize > ressize) { 2680 if (requiredsize<2*ressize) 2681 requiredsize = 2*ressize; 2682 if (_PyString_Resize(&res, requiredsize)) { 2683 Py_DECREF(repunicode); 2684 goto onError; 2685 } 2686 str = PyString_AS_STRING(res) + respos; 2687 ressize = requiredsize; 2688 } 2689 /* check if there is anything unencodable in the replacement 2690 and copy it to the output */ 2691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 2692 c = *uni2; 2693 if (c >= limit) { 2694 raise_encode_exception(&exc, encoding, startp, size, 2695 unicodepos, unicodepos+1, reason); 2696 Py_DECREF(repunicode); 2697 goto onError; 2698 } 2699 *str = (char)c; 2700 } 2701 p = startp + newpos; 2702 Py_DECREF(repunicode); 2703 } 2704 } 2705 } 2706 /* Resize if we allocated to much */ 2707 respos = str-PyString_AS_STRING(res); 2708 if (respos<ressize) 2709 /* If this falls res will be NULL */ 2710 _PyString_Resize(&res, respos); 2711 Py_XDECREF(errorHandler); 2712 Py_XDECREF(exc); 2713 return res; 2714 2715 onError: 2716 Py_XDECREF(res); 2717 Py_XDECREF(errorHandler); 2718 Py_XDECREF(exc); 2719 return NULL; 2720} 2721 2722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2723 Py_ssize_t size, 2724 const char *errors) 2725{ 2726 return unicode_encode_ucs1(p, size, errors, 256); 2727} 2728 2729PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2730{ 2731 if (!PyUnicode_Check(unicode)) { 2732 PyErr_BadArgument(); 2733 return NULL; 2734 } 2735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2736 PyUnicode_GET_SIZE(unicode), 2737 NULL); 2738} 2739 2740/* --- 7-bit ASCII Codec 
-------------------------------------------------- */ 2741 2742PyObject *PyUnicode_DecodeASCII(const char *s, 2743 Py_ssize_t size, 2744 const char *errors) 2745{ 2746 const char *starts = s; 2747 PyUnicodeObject *v; 2748 Py_UNICODE *p; 2749 Py_ssize_t startinpos; 2750 Py_ssize_t endinpos; 2751 Py_ssize_t outpos; 2752 const char *e; 2753 PyObject *errorHandler = NULL; 2754 PyObject *exc = NULL; 2755 2756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2757 if (size == 1 && *(unsigned char*)s < 128) { 2758 Py_UNICODE r = *(unsigned char*)s; 2759 return PyUnicode_FromUnicode(&r, 1); 2760 } 2761 2762 v = _PyUnicode_New(size); 2763 if (v == NULL) 2764 goto onError; 2765 if (size == 0) 2766 return (PyObject *)v; 2767 p = PyUnicode_AS_UNICODE(v); 2768 e = s + size; 2769 while (s < e) { 2770 register unsigned char c = (unsigned char)*s; 2771 if (c < 128) { 2772 *p++ = c; 2773 ++s; 2774 } 2775 else { 2776 startinpos = s-starts; 2777 endinpos = startinpos + 1; 2778 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 2779 if (unicode_decode_call_errorhandler( 2780 errors, &errorHandler, 2781 "ascii", "ordinal not in range(128)", 2782 starts, size, &startinpos, &endinpos, &exc, &s, 2783 (PyObject **)&v, &outpos, &p)) 2784 goto onError; 2785 } 2786 } 2787 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2788 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2789 goto onError; 2790 Py_XDECREF(errorHandler); 2791 Py_XDECREF(exc); 2792 return (PyObject *)v; 2793 2794 onError: 2795 Py_XDECREF(v); 2796 Py_XDECREF(errorHandler); 2797 Py_XDECREF(exc); 2798 return NULL; 2799} 2800 2801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2802 Py_ssize_t size, 2803 const char *errors) 2804{ 2805 return unicode_encode_ucs1(p, size, errors, 128); 2806} 2807 2808PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2809{ 2810 if (!PyUnicode_Check(unicode)) { 2811 PyErr_BadArgument(); 2812 return NULL; 2813 } 2814 return 
PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2815 PyUnicode_GET_SIZE(unicode), 2816 NULL); 2817} 2818 2819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 2820 2821/* --- MBCS codecs for Windows -------------------------------------------- */ 2822 2823#if SIZEOF_INT < SIZEOF_SSIZE_T 2824#define NEED_RETRY 2825#endif 2826 2827/* XXX This code is limited to "true" double-byte encodings, as 2828 a) it assumes an incomplete character consists of a single byte, and 2829 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 2830 encodings, see IsDBCSLeadByteEx documentation. */ 2831 2832static int is_dbcs_lead_byte(const char *s, int offset) 2833{ 2834 const char *curr = s + offset; 2835 2836 if (IsDBCSLeadByte(*curr)) { 2837 const char *prev = CharPrev(s, curr); 2838 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 2839 } 2840 return 0; 2841} 2842 2843/* 2844 * Decode MBCS string into unicode object. If 'final' is set, converts 2845 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
*/
static int decode_mbcs(PyUnicodeObject **v,
                        const char *s, /* MBCS string */
                        int size, /* sizeof MBCS string */
                        int final)
{
    Py_UNICODE *p;
    Py_ssize_t n = 0;      /* length of *v before this call (0 if new) */
    int usize = 0;         /* number of characters this call will add */

    assert(size >= 0);

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
        if (usize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* append after the n characters that were already present */
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    return size;
}

/* Decode an MBCS byte string into a new unicode object.

   If `consumed` is non-NULL it is reset to 0 and then incremented by the
   number of input bytes each decode_mbcs() pass reports as consumed; in
   that case decode_mbcs() is called with final == 0.  If `consumed` is
   NULL the last pass is called with final != 0.

   `errors` is not used by this function.

   Returns NULL (after releasing any partial result) if decode_mbcs()
   fails. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
  retry:
    /* size does not fit in an int: decode an INT_MAX-sized, non-final
       piece and come back for the rest */
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        /* advance by what was actually consumed, which may be less than
           INT_MAX when a trailing lead byte was skipped */
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}

/* Non-stateful MBCS decode: same as PyUnicode_DecodeMBCSStateful() with
   consumed == NULL. */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 *
 * If *repr is NULL a new string object is created; otherwise the existing
 * string is resized and the converted bytes are appended to it.
 */
static int encode_mbcs(PyObject **repr,
                        const Py_UNICODE *p, /* unicode */
                        int size) /* size of unicode */
{
    int mbcssize = 0;      /* number of bytes this call will add */
    Py_ssize_t n = 0;      /* length of *repr before this call (0 if new) */

    assert(size >= 0);

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyString_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
    }
    else {
        /* Extend string object */
        n = PyString_Size(*repr);
        if (_PyString_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* append after the n bytes that were already present */
        char *s = PyString_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    return 0;
}

/* Encode `size` Py_UNICODE characters at `p` into a new MBCS string
   object.  When NEED_RETRY is defined, inputs longer than INT_MAX are
   encoded in INT_MAX-sized pieces appended to the same result.
   `errors` is not used by this function.
   Returns NULL (after releasing any partial result) on failure. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
 retry:
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}

/* Encode a unicode object as MBCS with errors == NULL.  Calls
   PyErr_BadArgument() and returns NULL if `unicode` is not a unicode
   object. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

#undef NEED_RETRY

#endif /* MS_WINDOWS */

/* --- Character Mapping Codec -------------------------------------------- */

/* Decode `size` bytes at `s` through `mapping`.

   mapping == NULL          -- falls back to PyUnicode_DecodeLatin1().
   mapping is exactly a
   unicode object           -- used as a lookup string indexed by byte
                               value; a byte beyond its length, or an
                               entry equal to 0xfffe, is undefined.
   any other object         -- looked up with PyObject_GetItem() using an
                               int key; the value must be an int in
                               range(65536), None (undefined), or a
                               unicode string of any length.  A
                               LookupError from the lookup is treated
                               like None.

   Undefined bytes are reported to unicode_decode_call_errorhandler().
   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output room left over from the last 1-n mapping resize */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: the mapping is a plain unicode lookup string */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                     errors, &errorHandler,
                     "charmap", "character maps to <undefined>",
                     starts, size, &startinpos, &endinpos, &exc, &s,
                     (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                /* s is not advanced here; the handler was given &s */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                     errors, &errorHandler,
                     "charmap", "character maps to <undefined>",
                     starts, size, &startinpos, &endinpos, &exc, &s,
                     (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus 4*targetsize of slack */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                                     (targetsize << 2);
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                             PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* recompute p from the saved offset after the resize */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                      "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* Trim the result to the number of characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}

/* Charmap encoding: the lookup table */

/* Three-level lookup table mapping a 16-bit character to a byte.
   As built by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup():
     level1  -- indexed by (c >> 11); holds the number of a level-2
                table, or 0xFF if there is none.
     level23 -- count2 level-2 tables of 16 bytes each, followed by
                count3 level-3 tables of 128 bytes each.  The object is
                allocated larger than the struct to hold them.
                Level-2 entries hold the number of a level-3 table (0xFF =
                none); level-3 entries hold the encoded byte (0 =
                unmapped). */
struct encoding_map{
  PyObject_HEAD
  unsigned char level1[32];
  int count2, count3;
  unsigned char level23[1];
};

/* "size" method: return the size in bytes of the map object, i.e. the
   struct plus its level-2 and level-3 tables. */
static PyObject*
encoding_map_size(PyObject *obj, PyObject* args)
{
    struct encoding_map *map = (struct encoding_map*)obj;
    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
                          128*map->count3);
}

/* Method table for EncodingMap objects */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};

/* Deallocator: releases the object's memory with PyObject_FREE() */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}

/* Type object for the encoding map; no tp_new slot is set and the only
   method is size().  Instances are allocated directly in
   PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyObject_HEAD_INIT(NULL)
    0,                      /*ob_size*/
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};

/* Build an encoding table from a 256-character decoding string.

   `string` must be a unicode object of length exactly 256, where
   string[i] is the character that byte i decodes to and 0xFFFE marks an
   unmapped byte; otherwise PyErr_BadArgument() is called and NULL is
   returned.

   Returns either an EncodingMap object (three-level trie) or, when the
   trie cannot be used, a dict mapping character ordinal -> byte value.
   Returns NULL with an exception set on failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* scratch tables used only to count how many level-2 and level-3
       tables are needed; 0xFF means "not seen yet" */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
            #ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
            #endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is the "no table" marker, so table numbers must stay below it */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: plain dict {ordinal: byte}; this local `result`
           shadows the outer one */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyInt_FromLong(decode[i]);
            value = PyInt_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Second pass: number the level-3 tables again and fill in the bytes */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}

/* Look up character `c` in an EncodingMap object.
   Returns the encoded byte value, or -1 if the character is not mapped.
   Character 0 always maps to 0; on wide builds anything above 0xFFFF is
   unmapped.  `mapping` is cast to struct encoding_map without a type
   check. */
static int
encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
{
    struct encoding_map *map = (struct encoding_map*)mapping;
    int l1 = c>>11;
    int l2 = (c>>7) & 0xF;
    int l3 = c & 0x7F;
    int i;

#ifdef Py_UNICODE_WIDE
    if (c > 0xFFFF) {
        return -1;
    }
#endif
    if (c == 0)
        return 0;
    /* level 1*/
    i = map->level1[l1];
    if (i == 0xFF) {
        return -1;
    }
    /* level 2*/
    i = map->level23[16*i+l2];
    if (i == 0xFF) {
        return -1;
    }
    /* level 3 */
    i = map->level23[16*map->count2 + 128*i + l3];
    if (i == 0) {
        return -1;
    }
    return i;
}

/* Lookup the character ch in the mapping. If the character
   can't be found, Py_None is returned (or NULL, if another
   error occurred).
*/ 3414static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 3415{ 3416 PyObject *w = PyInt_FromLong((long)c); 3417 PyObject *x; 3418 3419 if (w == NULL) 3420 return NULL; 3421 x = PyObject_GetItem(mapping, w); 3422 Py_DECREF(w); 3423 if (x == NULL) { 3424 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3425 /* No mapping found means: mapping is undefined. */ 3426 PyErr_Clear(); 3427 x = Py_None; 3428 Py_INCREF(x); 3429 return x; 3430 } else 3431 return NULL; 3432 } 3433 else if (x == Py_None) 3434 return x; 3435 else if (PyInt_Check(x)) { 3436 long value = PyInt_AS_LONG(x); 3437 if (value < 0 || value > 255) { 3438 PyErr_SetString(PyExc_TypeError, 3439 "character mapping must be in range(256)"); 3440 Py_DECREF(x); 3441 return NULL; 3442 } 3443 return x; 3444 } 3445 else if (PyString_Check(x)) 3446 return x; 3447 else { 3448 /* wrong return value */ 3449 PyErr_SetString(PyExc_TypeError, 3450 "character mapping must return integer, None or str"); 3451 Py_DECREF(x); 3452 return NULL; 3453 } 3454} 3455 3456static int 3457charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 3458{ 3459 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 3460 /* exponentially overallocate to minimize reallocations */ 3461 if (requiredsize < 2*outsize) 3462 requiredsize = 2*outsize; 3463 if (_PyString_Resize(outobj, requiredsize)) { 3464 return 0; 3465 } 3466 return 1; 3467} 3468 3469typedef enum charmapencode_result { 3470 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 3471}charmapencode_result; 3472/* lookup the character, put the result in the output string and adjust 3473 various state variables. Reallocate the output string if not enough 3474 space is available. Return a new reference to the object that 3475 was put in the output buffer, or Py_None, if the mapping was undefined 3476 (in which case no character was written) or NULL, if a 3477 reallocation error occurred. 
The caller must decref the result */ 3478static 3479charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 3480 PyObject **outobj, Py_ssize_t *outpos) 3481{ 3482 PyObject *rep; 3483 char *outstart; 3484 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); 3485 3486 if (mapping->ob_type == &EncodingMapType) { 3487 int res = encoding_map_lookup(c, mapping); 3488 Py_ssize_t requiredsize = *outpos+1; 3489 if (res == -1) 3490 return enc_FAILED; 3491 if (outsize<requiredsize) 3492 if (!charmapencode_resize(outobj, outpos, requiredsize)) 3493 return enc_EXCEPTION; 3494 outstart = PyString_AS_STRING(*outobj); 3495 outstart[(*outpos)++] = (char)res; 3496 return enc_SUCCESS; 3497 } 3498 3499 rep = charmapencode_lookup(c, mapping); 3500 if (rep==NULL) 3501 return enc_EXCEPTION; 3502 else if (rep==Py_None) { 3503 Py_DECREF(rep); 3504 return enc_FAILED; 3505 } else { 3506 if (PyInt_Check(rep)) { 3507 Py_ssize_t requiredsize = *outpos+1; 3508 if (outsize<requiredsize) 3509 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 3510 Py_DECREF(rep); 3511 return enc_EXCEPTION; 3512 } 3513 outstart = PyString_AS_STRING(*outobj); 3514 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 3515 } 3516 else { 3517 const char *repchars = PyString_AS_STRING(rep); 3518 Py_ssize_t repsize = PyString_GET_SIZE(rep); 3519 Py_ssize_t requiredsize = *outpos+repsize; 3520 if (outsize<requiredsize) 3521 if (!charmapencode_resize(outobj, outpos, requiredsize)) { 3522 Py_DECREF(rep); 3523 return enc_EXCEPTION; 3524 } 3525 outstart = PyString_AS_STRING(*outobj); 3526 memcpy(outstart + *outpos, repchars, repsize); 3527 *outpos += repsize; 3528 } 3529 } 3530 Py_DECREF(rep); 3531 return enc_SUCCESS; 3532} 3533 3534/* handle an error in PyUnicode_EncodeCharmap 3535 Return 0 on success, -1 on error */ 3536static 3537int charmap_encoding_error( 3538 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 3539 PyObject **exceptionObject, 3540 int *known_errorHandler, 
PyObject **errorHandler, const char *errors, 3541 PyObject **res, Py_ssize_t *respos) 3542{ 3543 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 3544 Py_ssize_t repsize; 3545 Py_ssize_t newpos; 3546 Py_UNICODE *uni2; 3547 /* startpos for collecting unencodable chars */ 3548 Py_ssize_t collstartpos = *inpos; 3549 Py_ssize_t collendpos = *inpos+1; 3550 Py_ssize_t collpos; 3551 char *encoding = "charmap"; 3552 char *reason = "character maps to <undefined>"; 3553 charmapencode_result x; 3554 3555 /* find all unencodable characters */ 3556 while (collendpos < size) { 3557 PyObject *rep; 3558 if (mapping->ob_type == &EncodingMapType) { 3559 int res = encoding_map_lookup(p[collendpos], mapping); 3560 if (res != -1) 3561 break; 3562 ++collendpos; 3563 continue; 3564 } 3565 3566 rep = charmapencode_lookup(p[collendpos], mapping); 3567 if (rep==NULL) 3568 return -1; 3569 else if (rep!=Py_None) { 3570 Py_DECREF(rep); 3571 break; 3572 } 3573 Py_DECREF(rep); 3574 ++collendpos; 3575 } 3576 /* cache callback name lookup 3577 * (if not done yet, i.e. 
it's the first error) */ 3578 if (*known_errorHandler==-1) { 3579 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3580 *known_errorHandler = 1; 3581 else if (!strcmp(errors, "replace")) 3582 *known_errorHandler = 2; 3583 else if (!strcmp(errors, "ignore")) 3584 *known_errorHandler = 3; 3585 else if (!strcmp(errors, "xmlcharrefreplace")) 3586 *known_errorHandler = 4; 3587 else 3588 *known_errorHandler = 0; 3589 } 3590 switch (*known_errorHandler) { 3591 case 1: /* strict */ 3592 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3593 return -1; 3594 case 2: /* replace */ 3595 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 3596 x = charmapencode_output('?', mapping, res, respos); 3597 if (x==enc_EXCEPTION) { 3598 return -1; 3599 } 3600 else if (x==enc_FAILED) { 3601 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3602 return -1; 3603 } 3604 } 3605 /* fall through */ 3606 case 3: /* ignore */ 3607 *inpos = collendpos; 3608 break; 3609 case 4: /* xmlcharrefreplace */ 3610 /* generate replacement (temporarily (mis)uses p) */ 3611 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 3612 char buffer[2+29+1+1]; 3613 char *cp; 3614 sprintf(buffer, "&#%d;", (int)p[collpos]); 3615 for (cp = buffer; *cp; ++cp) { 3616 x = charmapencode_output(*cp, mapping, res, respos); 3617 if (x==enc_EXCEPTION) 3618 return -1; 3619 else if (x==enc_FAILED) { 3620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3621 return -1; 3622 } 3623 } 3624 } 3625 *inpos = collendpos; 3626 break; 3627 default: 3628 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 3629 encoding, reason, p, size, exceptionObject, 3630 collstartpos, collendpos, &newpos); 3631 if (repunicode == NULL) 3632 return -1; 3633 /* generate replacement */ 3634 repsize = PyUnicode_GET_SIZE(repunicode); 3635 for (uni2 = PyUnicode_AS_UNICODE(repunicode); 
repsize-->0; ++uni2) { 3636 x = charmapencode_output(*uni2, mapping, res, respos); 3637 if (x==enc_EXCEPTION) { 3638 return -1; 3639 } 3640 else if (x==enc_FAILED) { 3641 Py_DECREF(repunicode); 3642 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3643 return -1; 3644 } 3645 } 3646 *inpos = newpos; 3647 Py_DECREF(repunicode); 3648 } 3649 return 0; 3650} 3651 3652PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 3653 Py_ssize_t size, 3654 PyObject *mapping, 3655 const char *errors) 3656{ 3657 /* output object */ 3658 PyObject *res = NULL; 3659 /* current input position */ 3660 Py_ssize_t inpos = 0; 3661 /* current output position */ 3662 Py_ssize_t respos = 0; 3663 PyObject *errorHandler = NULL; 3664 PyObject *exc = NULL; 3665 /* the following variable is used for caching string comparisons 3666 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3667 * 3=ignore, 4=xmlcharrefreplace */ 3668 int known_errorHandler = -1; 3669 3670 /* Default to Latin-1 */ 3671 if (mapping == NULL) 3672 return PyUnicode_EncodeLatin1(p, size, errors); 3673 3674 /* allocate enough for a simple encoding without 3675 replacements, if we need more, we'll resize */ 3676 res = PyString_FromStringAndSize(NULL, size); 3677 if (res == NULL) 3678 goto onError; 3679 if (size == 0) 3680 return res; 3681 3682 while (inpos<size) { 3683 /* try to encode it */ 3684 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 3685 if (x==enc_EXCEPTION) /* error */ 3686 goto onError; 3687 if (x==enc_FAILED) { /* unencodable character */ 3688 if (charmap_encoding_error(p, size, &inpos, mapping, 3689 &exc, 3690 &known_errorHandler, &errorHandler, errors, 3691 &res, &respos)) { 3692 goto onError; 3693 } 3694 } 3695 else 3696 /* done with this character => adjust input position */ 3697 ++inpos; 3698 } 3699 3700 /* Resize if we allocated to much */ 3701 if (respos<PyString_GET_SIZE(res)) { 3702 if (_PyString_Resize(&res, respos)) 3703 
goto onError; 3704 } 3705 Py_XDECREF(exc); 3706 Py_XDECREF(errorHandler); 3707 return res; 3708 3709 onError: 3710 Py_XDECREF(res); 3711 Py_XDECREF(exc); 3712 Py_XDECREF(errorHandler); 3713 return NULL; 3714} 3715 3716PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 3717 PyObject *mapping) 3718{ 3719 if (!PyUnicode_Check(unicode) || mapping == NULL) { 3720 PyErr_BadArgument(); 3721 return NULL; 3722 } 3723 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 3724 PyUnicode_GET_SIZE(unicode), 3725 mapping, 3726 NULL); 3727} 3728 3729/* create or adjust a UnicodeTranslateError */ 3730static void make_translate_exception(PyObject **exceptionObject, 3731 const Py_UNICODE *unicode, Py_ssize_t size, 3732 Py_ssize_t startpos, Py_ssize_t endpos, 3733 const char *reason) 3734{ 3735 if (*exceptionObject == NULL) { 3736 *exceptionObject = PyUnicodeTranslateError_Create( 3737 unicode, size, startpos, endpos, reason); 3738 } 3739 else { 3740 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 3741 goto onError; 3742 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 3743 goto onError; 3744 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 3745 goto onError; 3746 return; 3747 onError: 3748 Py_DECREF(*exceptionObject); 3749 *exceptionObject = NULL; 3750 } 3751} 3752 3753/* raises a UnicodeTranslateError */ 3754static void raise_translate_exception(PyObject **exceptionObject, 3755 const Py_UNICODE *unicode, Py_ssize_t size, 3756 Py_ssize_t startpos, Py_ssize_t endpos, 3757 const char *reason) 3758{ 3759 make_translate_exception(exceptionObject, 3760 unicode, size, startpos, endpos, reason); 3761 if (*exceptionObject != NULL) 3762 PyCodec_StrictErrors(*exceptionObject); 3763} 3764 3765/* error handling callback helper: 3766 build arguments, call the callback and check the arguments, 3767 put the result into newpos and return the replacement string, which 3768 has to be freed by the caller */ 3769static PyObject 
*unicode_translate_call_errorhandler(const char *errors, 3770 PyObject **errorHandler, 3771 const char *reason, 3772 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3773 Py_ssize_t startpos, Py_ssize_t endpos, 3774 Py_ssize_t *newpos) 3775{ 3776 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 3777 3778 Py_ssize_t i_newpos; 3779 PyObject *restuple; 3780 PyObject *resunicode; 3781 3782 if (*errorHandler == NULL) { 3783 *errorHandler = PyCodec_LookupError(errors); 3784 if (*errorHandler == NULL) 3785 return NULL; 3786 } 3787 3788 make_translate_exception(exceptionObject, 3789 unicode, size, startpos, endpos, reason); 3790 if (*exceptionObject == NULL) 3791 return NULL; 3792 3793 restuple = PyObject_CallFunctionObjArgs( 3794 *errorHandler, *exceptionObject, NULL); 3795 if (restuple == NULL) 3796 return NULL; 3797 if (!PyTuple_Check(restuple)) { 3798 PyErr_Format(PyExc_TypeError, &argparse[4]); 3799 Py_DECREF(restuple); 3800 return NULL; 3801 } 3802 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3803 &resunicode, &i_newpos)) { 3804 Py_DECREF(restuple); 3805 return NULL; 3806 } 3807 if (i_newpos<0) 3808 *newpos = size+i_newpos; 3809 else 3810 *newpos = i_newpos; 3811 if (*newpos<0 || *newpos>size) { 3812 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3813 Py_DECREF(restuple); 3814 return NULL; 3815 } 3816 Py_INCREF(resunicode); 3817 Py_DECREF(restuple); 3818 return resunicode; 3819} 3820 3821/* Lookup the character ch in the mapping and put the result in result, 3822 which must be decrefed by the caller. 
3823 Return 0 on success, -1 on error */ 3824static 3825int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 3826{ 3827 PyObject *w = PyInt_FromLong((long)c); 3828 PyObject *x; 3829 3830 if (w == NULL) 3831 return -1; 3832 x = PyObject_GetItem(mapping, w); 3833 Py_DECREF(w); 3834 if (x == NULL) { 3835 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3836 /* No mapping found means: use 1:1 mapping. */ 3837 PyErr_Clear(); 3838 *result = NULL; 3839 return 0; 3840 } else 3841 return -1; 3842 } 3843 else if (x == Py_None) { 3844 *result = x; 3845 return 0; 3846 } 3847 else if (PyInt_Check(x)) { 3848 long value = PyInt_AS_LONG(x); 3849 long max = PyUnicode_GetMax(); 3850 if (value < 0 || value > max) { 3851 PyErr_Format(PyExc_TypeError, 3852 "character mapping must be in range(0x%lx)", max+1); 3853 Py_DECREF(x); 3854 return -1; 3855 } 3856 *result = x; 3857 return 0; 3858 } 3859 else if (PyUnicode_Check(x)) { 3860 *result = x; 3861 return 0; 3862 } 3863 else { 3864 /* wrong return value */ 3865 PyErr_SetString(PyExc_TypeError, 3866 "character mapping must return integer, None or unicode"); 3867 Py_DECREF(x); 3868 return -1; 3869 } 3870} 3871/* ensure that *outobj is at least requiredsize characters long, 3872if not reallocate and adjust various state variables. 
Return 0 on success, -1 on error */
static
int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
                               Py_ssize_t requiredsize)
{
    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
    if (requiredsize > oldsize) {
        /* remember old output position: the resize below may change the
           buffer, so *outp is re-derived from this offset afterwards */
        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
        /* exponentially overallocate to minimize reallocations:
           grow to at least twice the old size */
        if (requiredsize < 2 * oldsize)
            requiredsize = 2 * oldsize;
        if (_PyUnicode_Resize(outobj, requiredsize) < 0)
            return -1;
        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
    }
    return 0;
}
/* lookup the character, put the result in the output string and adjust
   various state variables. Return a new reference to the object that
   was put in the output buffer in *res, or Py_None, if the mapping was
   undefined (in which case no character was written).
   *res is set to NULL when the mapping has no entry for the character;
   the character is then copied through unchanged.
   The caller must decref *res.
   Return 0 on success, -1 on error. */
static
int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
                            PyObject **res)
{
    if (charmaptranslate_lookup(*curinp, mapping, res))
        return -1;
    if (*res==NULL) {
        /* not found => default to 1:1 mapping */
        *(*outp)++ = *curinp;
    }
    else if (*res==Py_None)
        /* mapped to None: write nothing, the caller inspects *res */
        ;
    else if (PyInt_Check(*res)) {
        /* no overflow check, because we know that the space is enough */
        *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
    }
    else if (PyUnicode_Check(*res)) {
        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
        if (repsize==1) {
            /* no overflow check, because we know that the space is enough */
            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
        }
        else if (repsize!=0) {
            /* more than one character: need what has been written so far,
               plus the input not yet consumed (including this character),
               plus the repsize-1 extra characters of the replacement */
            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
                    (insize - (curinp-startinp)) +
                    repsize - 1;
            if (charmaptranslate_makespace(outobj, outp, requiredsize))
                return -1;
            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
            *outp += repsize;
        }
        /* an empty replacement string writes nothing */
    }
    else
        return -1;
    return 0;
}

/* Translate the `size` code units at `p` through `mapping` and return the
   result as a new Unicode object (NULL on error).  `mapping` must not be
   NULL.  Characters that map to None are treated as untranslatable and
   handled according to `errors`. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to translate it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared against Py_None below, never dereferenced,
           so the reference can be dropped right away */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend over
               every following character that also maps to None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* make room for this reference plus the input
                           that follows the collected run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    /* user-supplied handler: returns the replacement text
                       and the input position to resume at */
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    if (charmaptranslate_makespace(&res, &str,
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 4051 Py_DECREF(repunicode); 4052 goto onError; 4053 } 4054 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 4055 *str++ = *uni2; 4056 p = startp + newpos; 4057 Py_DECREF(repunicode); 4058 } 4059 } 4060 } 4061 /* Resize if we allocated to much */ 4062 respos = str-PyUnicode_AS_UNICODE(res); 4063 if (respos<PyUnicode_GET_SIZE(res)) { 4064 if (_PyUnicode_Resize(&res, respos) < 0) 4065 goto onError; 4066 } 4067 Py_XDECREF(exc); 4068 Py_XDECREF(errorHandler); 4069 return res; 4070 4071 onError: 4072 Py_XDECREF(res); 4073 Py_XDECREF(exc); 4074 Py_XDECREF(errorHandler); 4075 return NULL; 4076} 4077 4078PyObject *PyUnicode_Translate(PyObject *str, 4079 PyObject *mapping, 4080 const char *errors) 4081{ 4082 PyObject *result; 4083 4084 str = PyUnicode_FromObject(str); 4085 if (str == NULL) 4086 goto onError; 4087 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 4088 PyUnicode_GET_SIZE(str), 4089 mapping, 4090 errors); 4091 Py_DECREF(str); 4092 return result; 4093 4094 onError: 4095 Py_XDECREF(str); 4096 return NULL; 4097} 4098 4099/* --- Decimal Encoder ---------------------------------------------------- */ 4100 4101int PyUnicode_EncodeDecimal(Py_UNICODE *s, 4102 Py_ssize_t length, 4103 char *output, 4104 const char *errors) 4105{ 4106 Py_UNICODE *p, *end; 4107 PyObject *errorHandler = NULL; 4108 PyObject *exc = NULL; 4109 const char *encoding = "decimal"; 4110 const char *reason = "invalid decimal Unicode string"; 4111 /* the following variable is used for caching string comparisons 4112 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4113 int known_errorHandler = -1; 4114 4115 if (output == NULL) { 4116 PyErr_BadArgument(); 4117 return -1; 4118 } 4119 4120 p = s; 4121 end = s + length; 4122 while (p < end) { 4123 register Py_UNICODE ch = *p; 4124 int decimal; 4125 PyObject *repunicode; 4126 Py_ssize_t repsize; 4127 Py_ssize_t newpos; 4128 
Py_UNICODE *uni2; 4129 Py_UNICODE *collstart; 4130 Py_UNICODE *collend; 4131 4132 if (Py_UNICODE_ISSPACE(ch)) { 4133 *output++ = ' '; 4134 ++p; 4135 continue; 4136 } 4137 decimal = Py_UNICODE_TODECIMAL(ch); 4138 if (decimal >= 0) { 4139 *output++ = '0' + decimal; 4140 ++p; 4141 continue; 4142 } 4143 if (0 < ch && ch < 256) { 4144 *output++ = (char)ch; 4145 ++p; 4146 continue; 4147 } 4148 /* All other characters are considered unencodable */ 4149 collstart = p; 4150 collend = p+1; 4151 while (collend < end) { 4152 if ((0 < *collend && *collend < 256) || 4153 !Py_UNICODE_ISSPACE(*collend) || 4154 Py_UNICODE_TODECIMAL(*collend)) 4155 break; 4156 } 4157 /* cache callback name lookup 4158 * (if not done yet, i.e. it's the first error) */ 4159 if (known_errorHandler==-1) { 4160 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4161 known_errorHandler = 1; 4162 else if (!strcmp(errors, "replace")) 4163 known_errorHandler = 2; 4164 else if (!strcmp(errors, "ignore")) 4165 known_errorHandler = 3; 4166 else if (!strcmp(errors, "xmlcharrefreplace")) 4167 known_errorHandler = 4; 4168 else 4169 known_errorHandler = 0; 4170 } 4171 switch (known_errorHandler) { 4172 case 1: /* strict */ 4173 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 4174 goto onError; 4175 case 2: /* replace */ 4176 for (p = collstart; p < collend; ++p) 4177 *output++ = '?'; 4178 /* fall through */ 4179 case 3: /* ignore */ 4180 p = collend; 4181 break; 4182 case 4: /* xmlcharrefreplace */ 4183 /* generate replacement (temporarily (mis)uses p) */ 4184 for (p = collstart; p < collend; ++p) 4185 output += sprintf(output, "&#%d;", (int)*p); 4186 p = collend; 4187 break; 4188 default: 4189 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4190 encoding, reason, s, length, &exc, 4191 collstart-s, collend-s, &newpos); 4192 if (repunicode == NULL) 4193 goto onError; 4194 /* generate replacement */ 4195 repsize = PyUnicode_GET_SIZE(repunicode); 4196 for (uni2 = 
PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4197 Py_UNICODE ch = *uni2; 4198 if (Py_UNICODE_ISSPACE(ch)) 4199 *output++ = ' '; 4200 else { 4201 decimal = Py_UNICODE_TODECIMAL(ch); 4202 if (decimal >= 0) 4203 *output++ = '0' + decimal; 4204 else if (0 < ch && ch < 256) 4205 *output++ = (char)ch; 4206 else { 4207 Py_DECREF(repunicode); 4208 raise_encode_exception(&exc, encoding, 4209 s, length, collstart-s, collend-s, reason); 4210 goto onError; 4211 } 4212 } 4213 } 4214 p = s + newpos; 4215 Py_DECREF(repunicode); 4216 } 4217 } 4218 /* 0-terminate the output string */ 4219 *output++ = '\0'; 4220 Py_XDECREF(exc); 4221 Py_XDECREF(errorHandler); 4222 return 0; 4223 4224 onError: 4225 Py_XDECREF(exc); 4226 Py_XDECREF(errorHandler); 4227 return -1; 4228} 4229 4230/* --- Helpers ------------------------------------------------------------ */ 4231 4232#define STRINGLIB_CHAR Py_UNICODE 4233 4234#define STRINGLIB_LEN PyUnicode_GET_SIZE 4235#define STRINGLIB_NEW PyUnicode_FromUnicode 4236#define STRINGLIB_STR PyUnicode_AS_UNICODE 4237 4238Py_LOCAL_INLINE(int) 4239STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) 4240{ 4241 if (str[0] != other[0]) 4242 return 1; 4243 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); 4244} 4245 4246#define STRINGLIB_EMPTY unicode_empty 4247 4248#include "stringlib/fastsearch.h" 4249 4250#include "stringlib/count.h" 4251#include "stringlib/find.h" 4252#include "stringlib/partition.h" 4253 4254/* helper macro to fixup start/end slice values */ 4255#define FIX_START_END(obj) \ 4256 if (start < 0) \ 4257 start += (obj)->length; \ 4258 if (start < 0) \ 4259 start = 0; \ 4260 if (end > (obj)->length) \ 4261 end = (obj)->length; \ 4262 if (end < 0) \ 4263 end += (obj)->length; \ 4264 if (end < 0) \ 4265 end = 0; 4266 4267Py_ssize_t PyUnicode_Count(PyObject *str, 4268 PyObject *substr, 4269 Py_ssize_t start, 4270 Py_ssize_t end) 4271{ 4272 Py_ssize_t result; 4273 PyUnicodeObject* str_obj; 4274 
PyUnicodeObject* sub_obj; 4275 4276 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 4277 if (!str_obj) 4278 return -1; 4279 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 4280 if (!sub_obj) { 4281 Py_DECREF(str_obj); 4282 return -1; 4283 } 4284 4285 FIX_START_END(str_obj); 4286 4287 result = stringlib_count( 4288 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 4289 ); 4290 4291 Py_DECREF(sub_obj); 4292 Py_DECREF(str_obj); 4293 4294 return result; 4295} 4296 4297Py_ssize_t PyUnicode_Find(PyObject *str, 4298 PyObject *sub, 4299 Py_ssize_t start, 4300 Py_ssize_t end, 4301 int direction) 4302{ 4303 Py_ssize_t result; 4304 4305 str = PyUnicode_FromObject(str); 4306 if (!str) 4307 return -2; 4308 sub = PyUnicode_FromObject(sub); 4309 if (!sub) { 4310 Py_DECREF(str); 4311 return -2; 4312 } 4313 4314 if (direction > 0) 4315 result = stringlib_find_slice( 4316 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 4317 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 4318 start, end 4319 ); 4320 else 4321 result = stringlib_rfind_slice( 4322 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 4323 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 4324 start, end 4325 ); 4326 4327 Py_DECREF(str); 4328 Py_DECREF(sub); 4329 4330 return result; 4331} 4332 4333static 4334int tailmatch(PyUnicodeObject *self, 4335 PyUnicodeObject *substring, 4336 Py_ssize_t start, 4337 Py_ssize_t end, 4338 int direction) 4339{ 4340 if (substring->length == 0) 4341 return 1; 4342 4343 FIX_START_END(self); 4344 4345 end -= substring->length; 4346 if (end < start) 4347 return 0; 4348 4349 if (direction > 0) { 4350 if (Py_UNICODE_MATCH(self, end, substring)) 4351 return 1; 4352 } else { 4353 if (Py_UNICODE_MATCH(self, start, substring)) 4354 return 1; 4355 } 4356 4357 return 0; 4358} 4359 4360Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 4361 PyObject *substr, 4362 Py_ssize_t start, 4363 Py_ssize_t end, 4364 int direction) 4365{ 4366 Py_ssize_t result; 4367 4368 
str = PyUnicode_FromObject(str); 4369 if (str == NULL) 4370 return -1; 4371 substr = PyUnicode_FromObject(substr); 4372 if (substr == NULL) { 4373 Py_DECREF(str); 4374 return -1; 4375 } 4376 4377 result = tailmatch((PyUnicodeObject *)str, 4378 (PyUnicodeObject *)substr, 4379 start, end, direction); 4380 Py_DECREF(str); 4381 Py_DECREF(substr); 4382 return result; 4383} 4384 4385/* Apply fixfct filter to the Unicode object self and return a 4386 reference to the modified object */ 4387 4388static 4389PyObject *fixup(PyUnicodeObject *self, 4390 int (*fixfct)(PyUnicodeObject *s)) 4391{ 4392 4393 PyUnicodeObject *u; 4394 4395 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 4396 if (u == NULL) 4397 return NULL; 4398 4399 Py_UNICODE_COPY(u->str, self->str, self->length); 4400 4401 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 4402 /* fixfct should return TRUE if it modified the buffer. If 4403 FALSE, return a reference to the original buffer instead 4404 (to save space, not time) */ 4405 Py_INCREF(self); 4406 Py_DECREF(u); 4407 return (PyObject*) self; 4408 } 4409 return (PyObject*) u; 4410} 4411 4412static 4413int fixupper(PyUnicodeObject *self) 4414{ 4415 Py_ssize_t len = self->length; 4416 Py_UNICODE *s = self->str; 4417 int status = 0; 4418 4419 while (len-- > 0) { 4420 register Py_UNICODE ch; 4421 4422 ch = Py_UNICODE_TOUPPER(*s); 4423 if (ch != *s) { 4424 status = 1; 4425 *s = ch; 4426 } 4427 s++; 4428 } 4429 4430 return status; 4431} 4432 4433static 4434int fixlower(PyUnicodeObject *self) 4435{ 4436 Py_ssize_t len = self->length; 4437 Py_UNICODE *s = self->str; 4438 int status = 0; 4439 4440 while (len-- > 0) { 4441 register Py_UNICODE ch; 4442 4443 ch = Py_UNICODE_TOLOWER(*s); 4444 if (ch != *s) { 4445 status = 1; 4446 *s = ch; 4447 } 4448 s++; 4449 } 4450 4451 return status; 4452} 4453 4454static 4455int fixswapcase(PyUnicodeObject *self) 4456{ 4457 Py_ssize_t len = self->length; 4458 Py_UNICODE *s = self->str; 4459 int status = 0; 4460 4461 
while (len-- > 0) { 4462 if (Py_UNICODE_ISUPPER(*s)) { 4463 *s = Py_UNICODE_TOLOWER(*s); 4464 status = 1; 4465 } else if (Py_UNICODE_ISLOWER(*s)) { 4466 *s = Py_UNICODE_TOUPPER(*s); 4467 status = 1; 4468 } 4469 s++; 4470 } 4471 4472 return status; 4473} 4474 4475static 4476int fixcapitalize(PyUnicodeObject *self) 4477{ 4478 Py_ssize_t len = self->length; 4479 Py_UNICODE *s = self->str; 4480 int status = 0; 4481 4482 if (len == 0) 4483 return 0; 4484 if (Py_UNICODE_ISLOWER(*s)) { 4485 *s = Py_UNICODE_TOUPPER(*s); 4486 status = 1; 4487 } 4488 s++; 4489 while (--len > 0) { 4490 if (Py_UNICODE_ISUPPER(*s)) { 4491 *s = Py_UNICODE_TOLOWER(*s); 4492 status = 1; 4493 } 4494 s++; 4495 } 4496 return status; 4497} 4498 4499static 4500int fixtitle(PyUnicodeObject *self) 4501{ 4502 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4503 register Py_UNICODE *e; 4504 int previous_is_cased; 4505 4506 /* Shortcut for single character strings */ 4507 if (PyUnicode_GET_SIZE(self) == 1) { 4508 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 4509 if (*p != ch) { 4510 *p = ch; 4511 return 1; 4512 } 4513 else 4514 return 0; 4515 } 4516 4517 e = p + PyUnicode_GET_SIZE(self); 4518 previous_is_cased = 0; 4519 for (; p < e; p++) { 4520 register const Py_UNICODE ch = *p; 4521 4522 if (previous_is_cased) 4523 *p = Py_UNICODE_TOLOWER(ch); 4524 else 4525 *p = Py_UNICODE_TOTITLE(ch); 4526 4527 if (Py_UNICODE_ISLOWER(ch) || 4528 Py_UNICODE_ISUPPER(ch) || 4529 Py_UNICODE_ISTITLE(ch)) 4530 previous_is_cased = 1; 4531 else 4532 previous_is_cased = 0; 4533 } 4534 return 1; 4535} 4536 4537PyObject * 4538PyUnicode_Join(PyObject *separator, PyObject *seq) 4539{ 4540 PyObject *internal_separator = NULL; 4541 const Py_UNICODE blank = ' '; 4542 const Py_UNICODE *sep = ␣ 4543 Py_ssize_t seplen = 1; 4544 PyUnicodeObject *res = NULL; /* the result */ 4545 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 4546 Py_ssize_t res_used; /* # used bytes */ 4547 Py_UNICODE *res_p; /* pointer to free byte 
in res's string area */ 4548 PyObject *fseq; /* PySequence_Fast(seq) */ 4549 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 4550 PyObject *item; 4551 Py_ssize_t i; 4552 4553 fseq = PySequence_Fast(seq, ""); 4554 if (fseq == NULL) { 4555 return NULL; 4556 } 4557 4558 /* Grrrr. A codec may be invoked to convert str objects to 4559 * Unicode, and so it's possible to call back into Python code 4560 * during PyUnicode_FromObject(), and so it's possible for a sick 4561 * codec to change the size of fseq (if seq is a list). Therefore 4562 * we have to keep refetching the size -- can't assume seqlen 4563 * is invariant. 4564 */ 4565 seqlen = PySequence_Fast_GET_SIZE(fseq); 4566 /* If empty sequence, return u"". */ 4567 if (seqlen == 0) { 4568 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 4569 goto Done; 4570 } 4571 /* If singleton sequence with an exact Unicode, return that. */ 4572 if (seqlen == 1) { 4573 item = PySequence_Fast_GET_ITEM(fseq, 0); 4574 if (PyUnicode_CheckExact(item)) { 4575 Py_INCREF(item); 4576 res = (PyUnicodeObject *)item; 4577 goto Done; 4578 } 4579 } 4580 4581 /* At least two items to join, or one that isn't exact Unicode. */ 4582 if (seqlen > 1) { 4583 /* Set up sep and seplen -- they're needed. */ 4584 if (separator == NULL) { 4585 sep = ␣ 4586 seplen = 1; 4587 } 4588 else { 4589 internal_separator = PyUnicode_FromObject(separator); 4590 if (internal_separator == NULL) 4591 goto onError; 4592 sep = PyUnicode_AS_UNICODE(internal_separator); 4593 seplen = PyUnicode_GET_SIZE(internal_separator); 4594 /* In case PyUnicode_FromObject() mutated seq. */ 4595 seqlen = PySequence_Fast_GET_SIZE(fseq); 4596 } 4597 } 4598 4599 /* Get space. 
*/ 4600 res = _PyUnicode_New(res_alloc); 4601 if (res == NULL) 4602 goto onError; 4603 res_p = PyUnicode_AS_UNICODE(res); 4604 res_used = 0; 4605 4606 for (i = 0; i < seqlen; ++i) { 4607 Py_ssize_t itemlen; 4608 Py_ssize_t new_res_used; 4609 4610 item = PySequence_Fast_GET_ITEM(fseq, i); 4611 /* Convert item to Unicode. */ 4612 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { 4613 PyErr_Format(PyExc_TypeError, 4614 "sequence item %zd: expected string or Unicode," 4615 " %.80s found", 4616 i, item->ob_type->tp_name); 4617 goto onError; 4618 } 4619 item = PyUnicode_FromObject(item); 4620 if (item == NULL) 4621 goto onError; 4622 /* We own a reference to item from here on. */ 4623 4624 /* In case PyUnicode_FromObject() mutated seq. */ 4625 seqlen = PySequence_Fast_GET_SIZE(fseq); 4626 4627 /* Make sure we have enough space for the separator and the item. */ 4628 itemlen = PyUnicode_GET_SIZE(item); 4629 new_res_used = res_used + itemlen; 4630 if (new_res_used < 0) 4631 goto Overflow; 4632 if (i < seqlen - 1) { 4633 new_res_used += seplen; 4634 if (new_res_used < 0) 4635 goto Overflow; 4636 } 4637 if (new_res_used > res_alloc) { 4638 /* double allocated size until it's big enough */ 4639 do { 4640 res_alloc += res_alloc; 4641 if (res_alloc <= 0) 4642 goto Overflow; 4643 } while (new_res_used > res_alloc); 4644 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 4645 Py_DECREF(item); 4646 goto onError; 4647 } 4648 res_p = PyUnicode_AS_UNICODE(res) + res_used; 4649 } 4650 4651 /* Copy item, and maybe the separator. */ 4652 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 4653 res_p += itemlen; 4654 if (i < seqlen - 1) { 4655 Py_UNICODE_COPY(res_p, sep, seplen); 4656 res_p += seplen; 4657 } 4658 Py_DECREF(item); 4659 res_used = new_res_used; 4660 } 4661 4662 /* Shrink res to match the used area; this probably can't fail, 4663 * but it's cheap to check. 
4664 */ 4665 if (_PyUnicode_Resize(&res, res_used) < 0) 4666 goto onError; 4667 4668 Done: 4669 Py_XDECREF(internal_separator); 4670 Py_DECREF(fseq); 4671 return (PyObject *)res; 4672 4673 Overflow: 4674 PyErr_SetString(PyExc_OverflowError, 4675 "join() result is too long for a Python string"); 4676 Py_DECREF(item); 4677 /* fall through */ 4678 4679 onError: 4680 Py_XDECREF(internal_separator); 4681 Py_DECREF(fseq); 4682 Py_XDECREF(res); 4683 return NULL; 4684} 4685 4686static 4687PyUnicodeObject *pad(PyUnicodeObject *self, 4688 Py_ssize_t left, 4689 Py_ssize_t right, 4690 Py_UNICODE fill) 4691{ 4692 PyUnicodeObject *u; 4693 4694 if (left < 0) 4695 left = 0; 4696 if (right < 0) 4697 right = 0; 4698 4699 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 4700 Py_INCREF(self); 4701 return self; 4702 } 4703 4704 u = _PyUnicode_New(left + self->length + right); 4705 if (u) { 4706 if (left) 4707 Py_UNICODE_FILL(u->str, fill, left); 4708 Py_UNICODE_COPY(u->str + left, self->str, self->length); 4709 if (right) 4710 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 4711 } 4712 4713 return u; 4714} 4715 4716#define SPLIT_APPEND(data, left, right) \ 4717 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 4718 if (!str) \ 4719 goto onError; \ 4720 if (PyList_Append(list, str)) { \ 4721 Py_DECREF(str); \ 4722 goto onError; \ 4723 } \ 4724 else \ 4725 Py_DECREF(str); 4726 4727static 4728PyObject *split_whitespace(PyUnicodeObject *self, 4729 PyObject *list, 4730 Py_ssize_t maxcount) 4731{ 4732 register Py_ssize_t i; 4733 register Py_ssize_t j; 4734 Py_ssize_t len = self->length; 4735 PyObject *str; 4736 4737 for (i = j = 0; i < len; ) { 4738 /* find a token */ 4739 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4740 i++; 4741 j = i; 4742 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 4743 i++; 4744 if (j < i) { 4745 if (maxcount-- <= 0) 4746 break; 4747 SPLIT_APPEND(self->str, j, i); 4748 while (i < len && 
Py_UNICODE_ISSPACE(self->str[i]))
                i++;
            j = i;
        }
    }
    /* Whatever is left after the loop (the tail that remains once
       maxcount is used up) becomes the last element, unsplit. */
    if (j < len) {
        SPLIT_APPEND(self->str, j, len);
    }
    return list;

 onError:
    Py_DECREF(list);
    return NULL;
}

/* Split `string` at line breaks and return the lines as a new list
   (NULL on error).  `string` is converted with PyUnicode_FromObject()
   first.  A CR LF pair counts as one line break.  The line-break
   characters are kept at the end of each line only if `keepends` is
   non-zero. */
PyObject *PyUnicode_Splitlines(PyObject *string,
                               int keepends)
{
    register Py_ssize_t i;
    register Py_ssize_t j;
    Py_ssize_t len;
    PyObject *list;
    PyObject *str;
    Py_UNICODE *data;

    string = PyUnicode_FromObject(string);
    if (string == NULL)
        return NULL;
    data = PyUnicode_AS_UNICODE(string);
    len = PyUnicode_GET_SIZE(string);

    list = PyList_New(0);
    if (!list)
        goto onError;

    /* j is the start of the current line, i the scan position */
    for (i = j = 0; i < len; ) {
        Py_ssize_t eol;

        /* Find a line and append it */
        while (i < len && !BLOOM_LINEBREAK(data[i]))
            i++;

        /* Skip the line break reading CRLF as one line break */
        eol = i;
        if (i < len) {
            if (data[i] == '\r' && i + 1 < len &&
                data[i+1] == '\n')
                i += 2;
            else
                i++;
            /* with keepends the slice extends over the line break */
            if (keepends)
                eol = i;
        }
        SPLIT_APPEND(data, j, eol);
        j = i;
    }
    if (j < len) {
        SPLIT_APPEND(data, j, len);
    }

    Py_DECREF(string);
    return list;

 onError:
    /* list is NULL here if PyList_New() failed */
    Py_XDECREF(list);
    Py_DECREF(string);
    return NULL;
}

/* Split `self` at each occurrence of the single character `ch`, doing at
   most `maxcount` splits, and append the pieces to `list`.  The remainder
   (possibly empty) is always appended as the last piece.  Returns `list`,
   or NULL after releasing `list` on error. */
static
PyObject *split_char(PyUnicodeObject *self,
                     PyObject *list,
                     Py_UNICODE ch,
                     Py_ssize_t maxcount)
{
    register Py_ssize_t i;
    register Py_ssize_t j;
    Py_ssize_t len = self->length;
    PyObject *str;

    /* j is the start of the current piece, i the scan position */
    for (i = j = 0; i < len; ) {
        if (self->str[i] == ch) {
            if (maxcount-- <= 0)
                break;
            SPLIT_APPEND(self->str, j, i);
            i = j = i + 1;
        } else
            i++;
    }
    /* j == len yields an empty trailing piece */
    if (j <= len) {
        SPLIT_APPEND(self->str, j, len);
    }
    return list;

 onError:
    Py_DECREF(list);
    return NULL;
}

static
PyObject *split_substring(PyUnicodeObject *self,
PyObject *list,
                          PyUnicodeObject *substring,
                          Py_ssize_t maxcount)
{
    /* Splits `self` at each occurrence of `substring`, doing at most
       `maxcount` splits, and appends the pieces to `list`.  Returns
       `list`, or NULL after releasing `list` on error. */
    register Py_ssize_t i;
    register Py_ssize_t j;
    Py_ssize_t len = self->length;
    Py_ssize_t sublen = substring->length;
    PyObject *str;

    /* j is the start of the current piece; i only runs up to the last
       offset at which the separator still fits */
    for (i = j = 0; i <= len - sublen; ) {
        if (Py_UNICODE_MATCH(self, i, substring)) {
            if (maxcount-- <= 0)
                break;
            SPLIT_APPEND(self->str, j, i);
            i = j = i + sublen;
        } else
            i++;
    }
    /* the remainder (possibly empty) is the last piece */
    if (j <= len) {
        SPLIT_APPEND(self->str, j, len);
    }
    return list;

 onError:
    Py_DECREF(list);
    return NULL;
}

/* Like split_whitespace(), but scanning from the end of the string: at
   most `maxcount` whitespace-delimited tokens are taken from the right
   and whatever remains on the left becomes one more element.  Pieces are
   collected right-to-left and the list is reversed before returning.
   Returns `list`, or NULL after releasing `list` on error. */
static
PyObject *rsplit_whitespace(PyUnicodeObject *self,
                            PyObject *list,
                            Py_ssize_t maxcount)
{
    register Py_ssize_t i;
    register Py_ssize_t j;
    Py_ssize_t len = self->length;
    PyObject *str;

    for (i = j = len - 1; i >= 0; ) {
        /* find a token: after the two scans it occupies [i+1, j] */
        while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
            i--;
        j = i;
        while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
            i--;
        if (j > i) {
            if (maxcount-- <= 0)
                break;
            SPLIT_APPEND(self->str, i + 1, j + 1);
            while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
                i--;
            j = i;
        }
    }
    /* unsplit remainder on the left, if any */
    if (j >= 0) {
        SPLIT_APPEND(self->str, 0, j + 1);
    }
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

 onError:
    Py_DECREF(list);
    return NULL;
}

/* Like split_char(), but scanning from the end of the string; pieces are
   collected right-to-left and the list is reversed before returning. */
static
PyObject *rsplit_char(PyUnicodeObject *self,
                      PyObject *list,
                      Py_UNICODE ch,
                      Py_ssize_t maxcount)
{
    register Py_ssize_t i;
    register Py_ssize_t j;
    Py_ssize_t len = self->length;
    PyObject *str;

    /* j is the last index of the current piece, i the scan position */
    for (i = j = len - 1; i >= 0; ) {
        if (self->str[i] == ch) {
            if (maxcount-- <= 0)
                break;
            SPLIT_APPEND(self->str, i + 1, j + 1);
            j = i = i - 1;
        } else
            i--;
    }
    /* j == -1 yields an empty leading piece */
    if (j >= -1) {
        SPLIT_APPEND(self->str, 0, j + 1);
    }
    if
(PyList_Reverse(list) < 0) 4940 goto onError; 4941 return list; 4942 4943 onError: 4944 Py_DECREF(list); 4945 return NULL; 4946} 4947 4948static 4949PyObject *rsplit_substring(PyUnicodeObject *self, 4950 PyObject *list, 4951 PyUnicodeObject *substring, 4952 Py_ssize_t maxcount) 4953{ 4954 register Py_ssize_t i; 4955 register Py_ssize_t j; 4956 Py_ssize_t len = self->length; 4957 Py_ssize_t sublen = substring->length; 4958 PyObject *str; 4959 4960 for (i = len - sublen, j = len; i >= 0; ) { 4961 if (Py_UNICODE_MATCH(self, i, substring)) { 4962 if (maxcount-- <= 0) 4963 break; 4964 SPLIT_APPEND(self->str, i + sublen, j); 4965 j = i; 4966 i -= sublen; 4967 } else 4968 i--; 4969 } 4970 if (j >= 0) { 4971 SPLIT_APPEND(self->str, 0, j); 4972 } 4973 if (PyList_Reverse(list) < 0) 4974 goto onError; 4975 return list; 4976 4977 onError: 4978 Py_DECREF(list); 4979 return NULL; 4980} 4981 4982#undef SPLIT_APPEND 4983 4984static 4985PyObject *split(PyUnicodeObject *self, 4986 PyUnicodeObject *substring, 4987 Py_ssize_t maxcount) 4988{ 4989 PyObject *list; 4990 4991 if (maxcount < 0) 4992 maxcount = PY_SSIZE_T_MAX; 4993 4994 list = PyList_New(0); 4995 if (!list) 4996 return NULL; 4997 4998 if (substring == NULL) 4999 return split_whitespace(self,list,maxcount); 5000 5001 else if (substring->length == 1) 5002 return split_char(self,list,substring->str[0],maxcount); 5003 5004 else if (substring->length == 0) { 5005 Py_DECREF(list); 5006 PyErr_SetString(PyExc_ValueError, "empty separator"); 5007 return NULL; 5008 } 5009 else 5010 return split_substring(self,list,substring,maxcount); 5011} 5012 5013static 5014PyObject *rsplit(PyUnicodeObject *self, 5015 PyUnicodeObject *substring, 5016 Py_ssize_t maxcount) 5017{ 5018 PyObject *list; 5019 5020 if (maxcount < 0) 5021 maxcount = PY_SSIZE_T_MAX; 5022 5023 list = PyList_New(0); 5024 if (!list) 5025 return NULL; 5026 5027 if (substring == NULL) 5028 return rsplit_whitespace(self,list,maxcount); 5029 5030 else if (substring->length == 1) 
5031 return rsplit_char(self,list,substring->str[0],maxcount); 5032 5033 else if (substring->length == 0) { 5034 Py_DECREF(list); 5035 PyErr_SetString(PyExc_ValueError, "empty separator"); 5036 return NULL; 5037 } 5038 else 5039 return rsplit_substring(self,list,substring,maxcount); 5040} 5041 5042static 5043PyObject *replace(PyUnicodeObject *self, 5044 PyUnicodeObject *str1, 5045 PyUnicodeObject *str2, 5046 Py_ssize_t maxcount) 5047{ 5048 PyUnicodeObject *u; 5049 5050 if (maxcount < 0) 5051 maxcount = PY_SSIZE_T_MAX; 5052 5053 if (str1->length == str2->length) { 5054 /* same length */ 5055 Py_ssize_t i; 5056 if (str1->length == 1) { 5057 /* replace characters */ 5058 Py_UNICODE u1, u2; 5059 if (!findchar(self->str, self->length, str1->str[0])) 5060 goto nothing; 5061 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5062 if (!u) 5063 return NULL; 5064 Py_UNICODE_COPY(u->str, self->str, self->length); 5065 u1 = str1->str[0]; 5066 u2 = str2->str[0]; 5067 for (i = 0; i < u->length; i++) 5068 if (u->str[i] == u1) { 5069 if (--maxcount < 0) 5070 break; 5071 u->str[i] = u2; 5072 } 5073 } else { 5074 i = fastsearch( 5075 self->str, self->length, str1->str, str1->length, FAST_SEARCH 5076 ); 5077 if (i < 0) 5078 goto nothing; 5079 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5080 if (!u) 5081 return NULL; 5082 Py_UNICODE_COPY(u->str, self->str, self->length); 5083 while (i <= self->length - str1->length) 5084 if (Py_UNICODE_MATCH(self, i, str1)) { 5085 if (--maxcount < 0) 5086 break; 5087 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5088 i += str1->length; 5089 } else 5090 i++; 5091 } 5092 } else { 5093 5094 Py_ssize_t n, i, j, e; 5095 Py_ssize_t product, new_size, delta; 5096 Py_UNICODE *p; 5097 5098 /* replace strings */ 5099 n = stringlib_count(self->str, self->length, str1->str, str1->length); 5100 if (n > maxcount) 5101 n = maxcount; 5102 if (n == 0) 5103 goto nothing; 5104 /* new_size = self->length + n * (str2->length - 
str1->length)); */ 5105 delta = (str2->length - str1->length); 5106 if (delta == 0) { 5107 new_size = self->length; 5108 } else { 5109 product = n * (str2->length - str1->length); 5110 if ((product / (str2->length - str1->length)) != n) { 5111 PyErr_SetString(PyExc_OverflowError, 5112 "replace string is too long"); 5113 return NULL; 5114 } 5115 new_size = self->length + product; 5116 if (new_size < 0) { 5117 PyErr_SetString(PyExc_OverflowError, 5118 "replace string is too long"); 5119 return NULL; 5120 } 5121 } 5122 u = _PyUnicode_New(new_size); 5123 if (!u) 5124 return NULL; 5125 i = 0; 5126 p = u->str; 5127 e = self->length - str1->length; 5128 if (str1->length > 0) { 5129 while (n-- > 0) { 5130 /* look for next match */ 5131 j = i; 5132 while (j <= e) { 5133 if (Py_UNICODE_MATCH(self, j, str1)) 5134 break; 5135 j++; 5136 } 5137 if (j > i) { 5138 if (j > e) 5139 break; 5140 /* copy unchanged part [i:j] */ 5141 Py_UNICODE_COPY(p, self->str+i, j-i); 5142 p += j - i; 5143 } 5144 /* copy substitution string */ 5145 if (str2->length > 0) { 5146 Py_UNICODE_COPY(p, str2->str, str2->length); 5147 p += str2->length; 5148 } 5149 i = j + str1->length; 5150 } 5151 if (i < self->length) 5152 /* copy tail [i:] */ 5153 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5154 } else { 5155 /* interleave */ 5156 while (n > 0) { 5157 Py_UNICODE_COPY(p, str2->str, str2->length); 5158 p += str2->length; 5159 if (--n <= 0) 5160 break; 5161 *p++ = self->str[i++]; 5162 } 5163 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5164 } 5165 } 5166 return (PyObject *) u; 5167 5168nothing: 5169 /* nothing to replace; return original string (when possible) */ 5170 if (PyUnicode_CheckExact(self)) { 5171 Py_INCREF(self); 5172 return (PyObject *) self; 5173 } 5174 return PyUnicode_FromUnicode(self->str, self->length); 5175} 5176 5177/* --- Unicode Object Methods --------------------------------------------- */ 5178 5179PyDoc_STRVAR(title__doc__, 5180"S.title() -> unicode\n\ 5181\n\ 5182Return a 
titlecased version of S, i.e. words start with title case\n\ 5183characters, all remaining cased characters have lower case."); 5184 5185static PyObject* 5186unicode_title(PyUnicodeObject *self) 5187{ 5188 return fixup(self, fixtitle); 5189} 5190 5191PyDoc_STRVAR(capitalize__doc__, 5192"S.capitalize() -> unicode\n\ 5193\n\ 5194Return a capitalized version of S, i.e. make the first character\n\ 5195have upper case."); 5196 5197static PyObject* 5198unicode_capitalize(PyUnicodeObject *self) 5199{ 5200 return fixup(self, fixcapitalize); 5201} 5202 5203#if 0 5204PyDoc_STRVAR(capwords__doc__, 5205"S.capwords() -> unicode\n\ 5206\n\ 5207Apply .capitalize() to all words in S and return the result with\n\ 5208normalized whitespace (all whitespace strings are replaced by ' ')."); 5209 5210static PyObject* 5211unicode_capwords(PyUnicodeObject *self) 5212{ 5213 PyObject *list; 5214 PyObject *item; 5215 Py_ssize_t i; 5216 5217 /* Split into words */ 5218 list = split(self, NULL, -1); 5219 if (!list) 5220 return NULL; 5221 5222 /* Capitalize each word */ 5223 for (i = 0; i < PyList_GET_SIZE(list); i++) { 5224 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 5225 fixcapitalize); 5226 if (item == NULL) 5227 goto onError; 5228 Py_DECREF(PyList_GET_ITEM(list, i)); 5229 PyList_SET_ITEM(list, i, item); 5230 } 5231 5232 /* Join the words to form a new string */ 5233 item = PyUnicode_Join(NULL, list); 5234 5235onError: 5236 Py_DECREF(list); 5237 return (PyObject *)item; 5238} 5239#endif 5240 5241/* Argument converter. 
Coerces to a single unicode character */ 5242 5243static int 5244convert_uc(PyObject *obj, void *addr) 5245{ 5246 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 5247 PyObject *uniobj; 5248 Py_UNICODE *unistr; 5249 5250 uniobj = PyUnicode_FromObject(obj); 5251 if (uniobj == NULL) { 5252 PyErr_SetString(PyExc_TypeError, 5253 "The fill character cannot be converted to Unicode"); 5254 return 0; 5255 } 5256 if (PyUnicode_GET_SIZE(uniobj) != 1) { 5257 PyErr_SetString(PyExc_TypeError, 5258 "The fill character must be exactly one character long"); 5259 Py_DECREF(uniobj); 5260 return 0; 5261 } 5262 unistr = PyUnicode_AS_UNICODE(uniobj); 5263 *fillcharloc = unistr[0]; 5264 Py_DECREF(uniobj); 5265 return 1; 5266} 5267 5268PyDoc_STRVAR(center__doc__, 5269"S.center(width[, fillchar]) -> unicode\n\ 5270\n\ 5271Return S centered in a Unicode string of length width. Padding is\n\ 5272done using the specified fill character (default is a space)"); 5273 5274static PyObject * 5275unicode_center(PyUnicodeObject *self, PyObject *args) 5276{ 5277 Py_ssize_t marg, left; 5278 Py_ssize_t width; 5279 Py_UNICODE fillchar = ' '; 5280 5281 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 5282 return NULL; 5283 5284 if (self->length >= width && PyUnicode_CheckExact(self)) { 5285 Py_INCREF(self); 5286 return (PyObject*) self; 5287 } 5288 5289 marg = width - self->length; 5290 left = marg / 2 + (marg & width & 1); 5291 5292 return (PyObject*) pad(self, left, marg - left, fillchar); 5293} 5294 5295#if 0 5296 5297/* This code should go into some future Unicode collation support 5298 module. The basic comparison should compare ordinals on a naive 5299 basis (this is what Java does and thus JPython too). 
*/ 5300 5301/* speedy UTF-16 code point order comparison */ 5302/* gleaned from: */ 5303/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 5304 5305static short utf16Fixup[32] = 5306{ 5307 0, 0, 0, 0, 0, 0, 0, 0, 5308 0, 0, 0, 0, 0, 0, 0, 0, 5309 0, 0, 0, 0, 0, 0, 0, 0, 5310 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 5311}; 5312 5313static int 5314unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 5315{ 5316 Py_ssize_t len1, len2; 5317 5318 Py_UNICODE *s1 = str1->str; 5319 Py_UNICODE *s2 = str2->str; 5320 5321 len1 = str1->length; 5322 len2 = str2->length; 5323 5324 while (len1 > 0 && len2 > 0) { 5325 Py_UNICODE c1, c2; 5326 5327 c1 = *s1++; 5328 c2 = *s2++; 5329 5330 if (c1 > (1<<11) * 26) 5331 c1 += utf16Fixup[c1>>11]; 5332 if (c2 > (1<<11) * 26) 5333 c2 += utf16Fixup[c2>>11]; 5334 /* now c1 and c2 are in UTF-32-compatible order */ 5335 5336 if (c1 != c2) 5337 return (c1 < c2) ? -1 : 1; 5338 5339 len1--; len2--; 5340 } 5341 5342 return (len1 < len2) ? -1 : (len1 != len2); 5343} 5344 5345#else 5346 5347static int 5348unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 5349{ 5350 register Py_ssize_t len1, len2; 5351 5352 Py_UNICODE *s1 = str1->str; 5353 Py_UNICODE *s2 = str2->str; 5354 5355 len1 = str1->length; 5356 len2 = str2->length; 5357 5358 while (len1 > 0 && len2 > 0) { 5359 Py_UNICODE c1, c2; 5360 5361 c1 = *s1++; 5362 c2 = *s2++; 5363 5364 if (c1 != c2) 5365 return (c1 < c2) ? -1 : 1; 5366 5367 len1--; len2--; 5368 } 5369 5370 return (len1 < len2) ? 
-1 : (len1 != len2); 5371} 5372 5373#endif 5374 5375int PyUnicode_Compare(PyObject *left, 5376 PyObject *right) 5377{ 5378 PyUnicodeObject *u = NULL, *v = NULL; 5379 int result; 5380 5381 /* Coerce the two arguments */ 5382 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 5383 if (u == NULL) 5384 goto onError; 5385 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 5386 if (v == NULL) 5387 goto onError; 5388 5389 /* Shortcut for empty or interned objects */ 5390 if (v == u) { 5391 Py_DECREF(u); 5392 Py_DECREF(v); 5393 return 0; 5394 } 5395 5396 result = unicode_compare(u, v); 5397 5398 Py_DECREF(u); 5399 Py_DECREF(v); 5400 return result; 5401 5402onError: 5403 Py_XDECREF(u); 5404 Py_XDECREF(v); 5405 return -1; 5406} 5407 5408int PyUnicode_Contains(PyObject *container, 5409 PyObject *element) 5410{ 5411 PyObject *str, *sub; 5412 int result; 5413 5414 /* Coerce the two arguments */ 5415 sub = PyUnicode_FromObject(element); 5416 if (!sub) { 5417 PyErr_SetString(PyExc_TypeError, 5418 "'in <string>' requires string as left operand"); 5419 return -1; 5420 } 5421 5422 str = PyUnicode_FromObject(container); 5423 if (!str) { 5424 Py_DECREF(sub); 5425 return -1; 5426 } 5427 5428 result = stringlib_contains_obj(str, sub); 5429 5430 Py_DECREF(str); 5431 Py_DECREF(sub); 5432 5433 return result; 5434} 5435 5436/* Concat to string or Unicode object giving a new Unicode object. 
*/ 5437 5438PyObject *PyUnicode_Concat(PyObject *left, 5439 PyObject *right) 5440{ 5441 PyUnicodeObject *u = NULL, *v = NULL, *w; 5442 5443 /* Coerce the two arguments */ 5444 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 5445 if (u == NULL) 5446 goto onError; 5447 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 5448 if (v == NULL) 5449 goto onError; 5450 5451 /* Shortcuts */ 5452 if (v == unicode_empty) { 5453 Py_DECREF(v); 5454 return (PyObject *)u; 5455 } 5456 if (u == unicode_empty) { 5457 Py_DECREF(u); 5458 return (PyObject *)v; 5459 } 5460 5461 /* Concat the two Unicode strings */ 5462 w = _PyUnicode_New(u->length + v->length); 5463 if (w == NULL) 5464 goto onError; 5465 Py_UNICODE_COPY(w->str, u->str, u->length); 5466 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 5467 5468 Py_DECREF(u); 5469 Py_DECREF(v); 5470 return (PyObject *)w; 5471 5472onError: 5473 Py_XDECREF(u); 5474 Py_XDECREF(v); 5475 return NULL; 5476} 5477 5478PyDoc_STRVAR(count__doc__, 5479"S.count(sub[, start[, end]]) -> int\n\ 5480\n\ 5481Return the number of non-overlapping occurrences of substring sub in\n\ 5482Unicode string S[start:end]. 
Optional arguments start and end are\n\ 5483interpreted as in slice notation."); 5484 5485static PyObject * 5486unicode_count(PyUnicodeObject *self, PyObject *args) 5487{ 5488 PyUnicodeObject *substring; 5489 Py_ssize_t start = 0; 5490 Py_ssize_t end = PY_SSIZE_T_MAX; 5491 PyObject *result; 5492 5493 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 5494 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5495 return NULL; 5496 5497 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5498 (PyObject *)substring); 5499 if (substring == NULL) 5500 return NULL; 5501 5502 FIX_START_END(self); 5503 5504 result = PyInt_FromSsize_t( 5505 stringlib_count(self->str + start, end - start, 5506 substring->str, substring->length) 5507 ); 5508 5509 Py_DECREF(substring); 5510 5511 return result; 5512} 5513 5514PyDoc_STRVAR(encode__doc__, 5515"S.encode([encoding[,errors]]) -> string or unicode\n\ 5516\n\ 5517Encodes S using the codec registered for encoding. encoding defaults\n\ 5518to the default encoding. errors may be given to set a different error\n\ 5519handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 5520a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 5521'xmlcharrefreplace' as well as any other name registered with\n\ 5522codecs.register_error that can handle UnicodeEncodeErrors."); 5523 5524static PyObject * 5525unicode_encode(PyUnicodeObject *self, PyObject *args) 5526{ 5527 char *encoding = NULL; 5528 char *errors = NULL; 5529 PyObject *v; 5530 5531 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 5532 return NULL; 5533 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 5534 if (v == NULL) 5535 goto onError; 5536 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 5537 PyErr_Format(PyExc_TypeError, 5538 "encoder did not return a string/unicode object " 5539 "(type=%.400s)", 5540 v->ob_type->tp_name); 5541 Py_DECREF(v); 5542 return NULL; 5543 } 5544 return v; 5545 5546 onError: 5547 return NULL; 5548} 5549 5550PyDoc_STRVAR(decode__doc__, 5551"S.decode([encoding[,errors]]) -> string or unicode\n\ 5552\n\ 5553Decodes S using the codec registered for encoding. encoding defaults\n\ 5554to the default encoding. errors may be given to set a different error\n\ 5555handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 5556a UnicodeDecodeError. 
Other possible values are 'ignore' and 'replace'\n\ 5557as well as any other name registerd with codecs.register_error that is\n\ 5558able to handle UnicodeDecodeErrors."); 5559 5560static PyObject * 5561unicode_decode(PyUnicodeObject *self, PyObject *args) 5562{ 5563 char *encoding = NULL; 5564 char *errors = NULL; 5565 PyObject *v; 5566 5567 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) 5568 return NULL; 5569 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); 5570 if (v == NULL) 5571 goto onError; 5572 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 5573 PyErr_Format(PyExc_TypeError, 5574 "decoder did not return a string/unicode object " 5575 "(type=%.400s)", 5576 v->ob_type->tp_name); 5577 Py_DECREF(v); 5578 return NULL; 5579 } 5580 return v; 5581 5582 onError: 5583 return NULL; 5584} 5585 5586PyDoc_STRVAR(expandtabs__doc__, 5587"S.expandtabs([tabsize]) -> unicode\n\ 5588\n\ 5589Return a copy of S where all tab characters are expanded using spaces.\n\ 5590If tabsize is not given, a tab size of 8 characters is assumed."); 5591 5592static PyObject* 5593unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 5594{ 5595 Py_UNICODE *e; 5596 Py_UNICODE *p; 5597 Py_UNICODE *q; 5598 Py_ssize_t i, j; 5599 PyUnicodeObject *u; 5600 int tabsize = 8; 5601 5602 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 5603 return NULL; 5604 5605 /* First pass: determine size of output string */ 5606 i = j = 0; 5607 e = self->str + self->length; 5608 for (p = self->str; p < e; p++) 5609 if (*p == '\t') { 5610 if (tabsize > 0) 5611 j += tabsize - (j % tabsize); 5612 } 5613 else { 5614 j++; 5615 if (*p == '\n' || *p == '\r') { 5616 i += j; 5617 j = 0; 5618 } 5619 } 5620 5621 /* Second pass: create output string and fill it */ 5622 u = _PyUnicode_New(i + j); 5623 if (!u) 5624 return NULL; 5625 5626 j = 0; 5627 q = u->str; 5628 5629 for (p = self->str; p < e; p++) 5630 if (*p == '\t') { 5631 if (tabsize > 0) { 5632 i = tabsize - (j % tabsize); 
5633 j += i; 5634 while (i--) 5635 *q++ = ' '; 5636 } 5637 } 5638 else { 5639 j++; 5640 *q++ = *p; 5641 if (*p == '\n' || *p == '\r') 5642 j = 0; 5643 } 5644 5645 return (PyObject*) u; 5646} 5647 5648PyDoc_STRVAR(find__doc__, 5649"S.find(sub [,start [,end]]) -> int\n\ 5650\n\ 5651Return the lowest index in S where substring sub is found,\n\ 5652such that sub is contained within s[start,end]. Optional\n\ 5653arguments start and end are interpreted as in slice notation.\n\ 5654\n\ 5655Return -1 on failure."); 5656 5657static PyObject * 5658unicode_find(PyUnicodeObject *self, PyObject *args) 5659{ 5660 PyObject *substring; 5661 Py_ssize_t start = 0; 5662 Py_ssize_t end = PY_SSIZE_T_MAX; 5663 Py_ssize_t result; 5664 5665 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 5666 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5667 return NULL; 5668 substring = PyUnicode_FromObject(substring); 5669 if (!substring) 5670 return NULL; 5671 5672 result = stringlib_find_slice( 5673 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 5674 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 5675 start, end 5676 ); 5677 5678 Py_DECREF(substring); 5679 5680 return PyInt_FromSsize_t(result); 5681} 5682 5683static PyObject * 5684unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 5685{ 5686 if (index < 0 || index >= self->length) { 5687 PyErr_SetString(PyExc_IndexError, "string index out of range"); 5688 return NULL; 5689 } 5690 5691 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 5692} 5693 5694static long 5695unicode_hash(PyUnicodeObject *self) 5696{ 5697 /* Since Unicode objects compare equal to their ASCII string 5698 counterparts, they should use the individual character values 5699 as basis for their hash value. This is needed to assure that 5700 strings and Unicode objects behave in the same way as 5701 dictionary keys. 
*/ 5702 5703 register Py_ssize_t len; 5704 register Py_UNICODE *p; 5705 register long x; 5706 5707 if (self->hash != -1) 5708 return self->hash; 5709 len = PyUnicode_GET_SIZE(self); 5710 p = PyUnicode_AS_UNICODE(self); 5711 x = *p << 7; 5712 while (--len >= 0) 5713 x = (1000003*x) ^ *p++; 5714 x ^= PyUnicode_GET_SIZE(self); 5715 if (x == -1) 5716 x = -2; 5717 self->hash = x; 5718 return x; 5719} 5720 5721PyDoc_STRVAR(index__doc__, 5722"S.index(sub [,start [,end]]) -> int\n\ 5723\n\ 5724Like S.find() but raise ValueError when the substring is not found."); 5725 5726static PyObject * 5727unicode_index(PyUnicodeObject *self, PyObject *args) 5728{ 5729 Py_ssize_t result; 5730 PyObject *substring; 5731 Py_ssize_t start = 0; 5732 Py_ssize_t end = PY_SSIZE_T_MAX; 5733 5734 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 5735 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5736 return NULL; 5737 substring = PyUnicode_FromObject(substring); 5738 if (!substring) 5739 return NULL; 5740 5741 result = stringlib_find_slice( 5742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 5743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 5744 start, end 5745 ); 5746 5747 Py_DECREF(substring); 5748 5749 if (result < 0) { 5750 PyErr_SetString(PyExc_ValueError, "substring not found"); 5751 return NULL; 5752 } 5753 5754 return PyInt_FromSsize_t(result); 5755} 5756 5757PyDoc_STRVAR(islower__doc__, 5758"S.islower() -> bool\n\ 5759\n\ 5760Return True if all cased characters in S are lowercase and there is\n\ 5761at least one cased character in S, False otherwise."); 5762 5763static PyObject* 5764unicode_islower(PyUnicodeObject *self) 5765{ 5766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5767 register const Py_UNICODE *e; 5768 int cased; 5769 5770 /* Shortcut for single character strings */ 5771 if (PyUnicode_GET_SIZE(self) == 1) 5772 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 5773 5774 /* Special case for empty strings */ 5775 if 
(PyUnicode_GET_SIZE(self) == 0) 5776 return PyBool_FromLong(0); 5777 5778 e = p + PyUnicode_GET_SIZE(self); 5779 cased = 0; 5780 for (; p < e; p++) { 5781 register const Py_UNICODE ch = *p; 5782 5783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 5784 return PyBool_FromLong(0); 5785 else if (!cased && Py_UNICODE_ISLOWER(ch)) 5786 cased = 1; 5787 } 5788 return PyBool_FromLong(cased); 5789} 5790 5791PyDoc_STRVAR(isupper__doc__, 5792"S.isupper() -> bool\n\ 5793\n\ 5794Return True if all cased characters in S are uppercase and there is\n\ 5795at least one cased character in S, False otherwise."); 5796 5797static PyObject* 5798unicode_isupper(PyUnicodeObject *self) 5799{ 5800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5801 register const Py_UNICODE *e; 5802 int cased; 5803 5804 /* Shortcut for single character strings */ 5805 if (PyUnicode_GET_SIZE(self) == 1) 5806 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 5807 5808 /* Special case for empty strings */ 5809 if (PyUnicode_GET_SIZE(self) == 0) 5810 return PyBool_FromLong(0); 5811 5812 e = p + PyUnicode_GET_SIZE(self); 5813 cased = 0; 5814 for (; p < e; p++) { 5815 register const Py_UNICODE ch = *p; 5816 5817 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 5818 return PyBool_FromLong(0); 5819 else if (!cased && Py_UNICODE_ISUPPER(ch)) 5820 cased = 1; 5821 } 5822 return PyBool_FromLong(cased); 5823} 5824 5825PyDoc_STRVAR(istitle__doc__, 5826"S.istitle() -> bool\n\ 5827\n\ 5828Return True if S is a titlecased string and there is at least one\n\ 5829character in S, i.e. 
upper- and titlecase characters may only\n\ 5830follow uncased characters and lowercase characters only cased ones.\n\ 5831Return False otherwise."); 5832 5833static PyObject* 5834unicode_istitle(PyUnicodeObject *self) 5835{ 5836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5837 register const Py_UNICODE *e; 5838 int cased, previous_is_cased; 5839 5840 /* Shortcut for single character strings */ 5841 if (PyUnicode_GET_SIZE(self) == 1) 5842 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 5843 (Py_UNICODE_ISUPPER(*p) != 0)); 5844 5845 /* Special case for empty strings */ 5846 if (PyUnicode_GET_SIZE(self) == 0) 5847 return PyBool_FromLong(0); 5848 5849 e = p + PyUnicode_GET_SIZE(self); 5850 cased = 0; 5851 previous_is_cased = 0; 5852 for (; p < e; p++) { 5853 register const Py_UNICODE ch = *p; 5854 5855 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 5856 if (previous_is_cased) 5857 return PyBool_FromLong(0); 5858 previous_is_cased = 1; 5859 cased = 1; 5860 } 5861 else if (Py_UNICODE_ISLOWER(ch)) { 5862 if (!previous_is_cased) 5863 return PyBool_FromLong(0); 5864 previous_is_cased = 1; 5865 cased = 1; 5866 } 5867 else 5868 previous_is_cased = 0; 5869 } 5870 return PyBool_FromLong(cased); 5871} 5872 5873PyDoc_STRVAR(isspace__doc__, 5874"S.isspace() -> bool\n\ 5875\n\ 5876Return True if all characters in S are whitespace\n\ 5877and there is at least one character in S, False otherwise."); 5878 5879static PyObject* 5880unicode_isspace(PyUnicodeObject *self) 5881{ 5882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5883 register const Py_UNICODE *e; 5884 5885 /* Shortcut for single character strings */ 5886 if (PyUnicode_GET_SIZE(self) == 1 && 5887 Py_UNICODE_ISSPACE(*p)) 5888 return PyBool_FromLong(1); 5889 5890 /* Special case for empty strings */ 5891 if (PyUnicode_GET_SIZE(self) == 0) 5892 return PyBool_FromLong(0); 5893 5894 e = p + PyUnicode_GET_SIZE(self); 5895 for (; p < e; p++) { 5896 if (!Py_UNICODE_ISSPACE(*p)) 5897 
return PyBool_FromLong(0); 5898 } 5899 return PyBool_FromLong(1); 5900} 5901 5902PyDoc_STRVAR(isalpha__doc__, 5903"S.isalpha() -> bool\n\ 5904\n\ 5905Return True if all characters in S are alphabetic\n\ 5906and there is at least one character in S, False otherwise."); 5907 5908static PyObject* 5909unicode_isalpha(PyUnicodeObject *self) 5910{ 5911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5912 register const Py_UNICODE *e; 5913 5914 /* Shortcut for single character strings */ 5915 if (PyUnicode_GET_SIZE(self) == 1 && 5916 Py_UNICODE_ISALPHA(*p)) 5917 return PyBool_FromLong(1); 5918 5919 /* Special case for empty strings */ 5920 if (PyUnicode_GET_SIZE(self) == 0) 5921 return PyBool_FromLong(0); 5922 5923 e = p + PyUnicode_GET_SIZE(self); 5924 for (; p < e; p++) { 5925 if (!Py_UNICODE_ISALPHA(*p)) 5926 return PyBool_FromLong(0); 5927 } 5928 return PyBool_FromLong(1); 5929} 5930 5931PyDoc_STRVAR(isalnum__doc__, 5932"S.isalnum() -> bool\n\ 5933\n\ 5934Return True if all characters in S are alphanumeric\n\ 5935and there is at least one character in S, False otherwise."); 5936 5937static PyObject* 5938unicode_isalnum(PyUnicodeObject *self) 5939{ 5940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5941 register const Py_UNICODE *e; 5942 5943 /* Shortcut for single character strings */ 5944 if (PyUnicode_GET_SIZE(self) == 1 && 5945 Py_UNICODE_ISALNUM(*p)) 5946 return PyBool_FromLong(1); 5947 5948 /* Special case for empty strings */ 5949 if (PyUnicode_GET_SIZE(self) == 0) 5950 return PyBool_FromLong(0); 5951 5952 e = p + PyUnicode_GET_SIZE(self); 5953 for (; p < e; p++) { 5954 if (!Py_UNICODE_ISALNUM(*p)) 5955 return PyBool_FromLong(0); 5956 } 5957 return PyBool_FromLong(1); 5958} 5959 5960PyDoc_STRVAR(isdecimal__doc__, 5961"S.isdecimal() -> bool\n\ 5962\n\ 5963Return True if there are only decimal characters in S,\n\ 5964False otherwise."); 5965 5966static PyObject* 5967unicode_isdecimal(PyUnicodeObject *self) 5968{ 5969 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5970 register const Py_UNICODE *e; 5971 5972 /* Shortcut for single character strings */ 5973 if (PyUnicode_GET_SIZE(self) == 1 && 5974 Py_UNICODE_ISDECIMAL(*p)) 5975 return PyBool_FromLong(1); 5976 5977 /* Special case for empty strings */ 5978 if (PyUnicode_GET_SIZE(self) == 0) 5979 return PyBool_FromLong(0); 5980 5981 e = p + PyUnicode_GET_SIZE(self); 5982 for (; p < e; p++) { 5983 if (!Py_UNICODE_ISDECIMAL(*p)) 5984 return PyBool_FromLong(0); 5985 } 5986 return PyBool_FromLong(1); 5987} 5988 5989PyDoc_STRVAR(isdigit__doc__, 5990"S.isdigit() -> bool\n\ 5991\n\ 5992Return True if all characters in S are digits\n\ 5993and there is at least one character in S, False otherwise."); 5994 5995static PyObject* 5996unicode_isdigit(PyUnicodeObject *self) 5997{ 5998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5999 register const Py_UNICODE *e; 6000 6001 /* Shortcut for single character strings */ 6002 if (PyUnicode_GET_SIZE(self) == 1 && 6003 Py_UNICODE_ISDIGIT(*p)) 6004 return PyBool_FromLong(1); 6005 6006 /* Special case for empty strings */ 6007 if (PyUnicode_GET_SIZE(self) == 0) 6008 return PyBool_FromLong(0); 6009 6010 e = p + PyUnicode_GET_SIZE(self); 6011 for (; p < e; p++) { 6012 if (!Py_UNICODE_ISDIGIT(*p)) 6013 return PyBool_FromLong(0); 6014 } 6015 return PyBool_FromLong(1); 6016} 6017 6018PyDoc_STRVAR(isnumeric__doc__, 6019"S.isnumeric() -> bool\n\ 6020\n\ 6021Return True if there are only numeric characters in S,\n\ 6022False otherwise."); 6023 6024static PyObject* 6025unicode_isnumeric(PyUnicodeObject *self) 6026{ 6027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6028 register const Py_UNICODE *e; 6029 6030 /* Shortcut for single character strings */ 6031 if (PyUnicode_GET_SIZE(self) == 1 && 6032 Py_UNICODE_ISNUMERIC(*p)) 6033 return PyBool_FromLong(1); 6034 6035 /* Special case for empty strings */ 6036 if (PyUnicode_GET_SIZE(self) == 0) 6037 return PyBool_FromLong(0); 6038 6039 e = p + 
PyUnicode_GET_SIZE(self); 6040 for (; p < e; p++) { 6041 if (!Py_UNICODE_ISNUMERIC(*p)) 6042 return PyBool_FromLong(0); 6043 } 6044 return PyBool_FromLong(1); 6045} 6046 6047PyDoc_STRVAR(join__doc__, 6048"S.join(sequence) -> unicode\n\ 6049\n\ 6050Return a string which is the concatenation of the strings in the\n\ 6051sequence. The separator between elements is S."); 6052 6053static PyObject* 6054unicode_join(PyObject *self, PyObject *data) 6055{ 6056 return PyUnicode_Join(self, data); 6057} 6058 6059static Py_ssize_t 6060unicode_length(PyUnicodeObject *self) 6061{ 6062 return self->length; 6063} 6064 6065PyDoc_STRVAR(ljust__doc__, 6066"S.ljust(width[, fillchar]) -> int\n\ 6067\n\ 6068Return S left justified in a Unicode string of length width. Padding is\n\ 6069done using the specified fill character (default is a space)."); 6070 6071static PyObject * 6072unicode_ljust(PyUnicodeObject *self, PyObject *args) 6073{ 6074 Py_ssize_t width; 6075 Py_UNICODE fillchar = ' '; 6076 6077 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 6078 return NULL; 6079 6080 if (self->length >= width && PyUnicode_CheckExact(self)) { 6081 Py_INCREF(self); 6082 return (PyObject*) self; 6083 } 6084 6085 return (PyObject*) pad(self, 0, width - self->length, fillchar); 6086} 6087 6088PyDoc_STRVAR(lower__doc__, 6089"S.lower() -> unicode\n\ 6090\n\ 6091Return a copy of the string S converted to lowercase."); 6092 6093static PyObject* 6094unicode_lower(PyUnicodeObject *self) 6095{ 6096 return fixup(self, fixlower); 6097} 6098 6099#define LEFTSTRIP 0 6100#define RIGHTSTRIP 1 6101#define BOTHSTRIP 2 6102 6103/* Arrays indexed by above */ 6104static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 6105 6106#define STRIPNAME(i) (stripformat[i]+3) 6107 6108/* externally visible for str.strip(unicode) */ 6109PyObject * 6110_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 6111{ 6112 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 6113 
Py_ssize_t len = PyUnicode_GET_SIZE(self); 6114 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 6115 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 6116 Py_ssize_t i, j; 6117 6118 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 6119 6120 i = 0; 6121 if (striptype != RIGHTSTRIP) { 6122 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 6123 i++; 6124 } 6125 } 6126 6127 j = len; 6128 if (striptype != LEFTSTRIP) { 6129 do { 6130 j--; 6131 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 6132 j++; 6133 } 6134 6135 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 6136 Py_INCREF(self); 6137 return (PyObject*)self; 6138 } 6139 else 6140 return PyUnicode_FromUnicode(s+i, j-i); 6141} 6142 6143 6144static PyObject * 6145do_strip(PyUnicodeObject *self, int striptype) 6146{ 6147 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 6148 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 6149 6150 i = 0; 6151 if (striptype != RIGHTSTRIP) { 6152 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 6153 i++; 6154 } 6155 } 6156 6157 j = len; 6158 if (striptype != LEFTSTRIP) { 6159 do { 6160 j--; 6161 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 6162 j++; 6163 } 6164 6165 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 6166 Py_INCREF(self); 6167 return (PyObject*)self; 6168 } 6169 else 6170 return PyUnicode_FromUnicode(s+i, j-i); 6171} 6172 6173 6174static PyObject * 6175do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 6176{ 6177 PyObject *sep = NULL; 6178 6179 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 6180 return NULL; 6181 6182 if (sep != NULL && sep != Py_None) { 6183 if (PyUnicode_Check(sep)) 6184 return _PyUnicode_XStrip(self, striptype, sep); 6185 else if (PyString_Check(sep)) { 6186 PyObject *res; 6187 sep = PyUnicode_FromObject(sep); 6188 if (sep==NULL) 6189 return NULL; 6190 res = _PyUnicode_XStrip(self, striptype, sep); 6191 Py_DECREF(sep); 6192 return res; 6193 } 6194 else { 6195 
PyErr_Format(PyExc_TypeError, 6196 "%s arg must be None, unicode or str", 6197 STRIPNAME(striptype)); 6198 return NULL; 6199 } 6200 } 6201 6202 return do_strip(self, striptype); 6203} 6204 6205 6206PyDoc_STRVAR(strip__doc__, 6207"S.strip([chars]) -> unicode\n\ 6208\n\ 6209Return a copy of the string S with leading and trailing\n\ 6210whitespace removed.\n\ 6211If chars is given and not None, remove characters in chars instead.\n\ 6212If chars is a str, it will be converted to unicode before stripping"); 6213 6214static PyObject * 6215unicode_strip(PyUnicodeObject *self, PyObject *args) 6216{ 6217 if (PyTuple_GET_SIZE(args) == 0) 6218 return do_strip(self, BOTHSTRIP); /* Common case */ 6219 else 6220 return do_argstrip(self, BOTHSTRIP, args); 6221} 6222 6223 6224PyDoc_STRVAR(lstrip__doc__, 6225"S.lstrip([chars]) -> unicode\n\ 6226\n\ 6227Return a copy of the string S with leading whitespace removed.\n\ 6228If chars is given and not None, remove characters in chars instead.\n\ 6229If chars is a str, it will be converted to unicode before stripping"); 6230 6231static PyObject * 6232unicode_lstrip(PyUnicodeObject *self, PyObject *args) 6233{ 6234 if (PyTuple_GET_SIZE(args) == 0) 6235 return do_strip(self, LEFTSTRIP); /* Common case */ 6236 else 6237 return do_argstrip(self, LEFTSTRIP, args); 6238} 6239 6240 6241PyDoc_STRVAR(rstrip__doc__, 6242"S.rstrip([chars]) -> unicode\n\ 6243\n\ 6244Return a copy of the string S with trailing whitespace removed.\n\ 6245If chars is given and not None, remove characters in chars instead.\n\ 6246If chars is a str, it will be converted to unicode before stripping"); 6247 6248static PyObject * 6249unicode_rstrip(PyUnicodeObject *self, PyObject *args) 6250{ 6251 if (PyTuple_GET_SIZE(args) == 0) 6252 return do_strip(self, RIGHTSTRIP); /* Common case */ 6253 else 6254 return do_argstrip(self, RIGHTSTRIP, args); 6255} 6256 6257 6258static PyObject* 6259unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 6260{ 6261 PyUnicodeObject *u; 6262 
Py_UNICODE *p; 6263 Py_ssize_t nchars; 6264 size_t nbytes; 6265 6266 if (len < 0) 6267 len = 0; 6268 6269 if (len == 1 && PyUnicode_CheckExact(str)) { 6270 /* no repeat, return original string */ 6271 Py_INCREF(str); 6272 return (PyObject*) str; 6273 } 6274 6275 /* ensure # of chars needed doesn't overflow int and # of bytes 6276 * needed doesn't overflow size_t 6277 */ 6278 nchars = len * str->length; 6279 if (len && nchars / len != str->length) { 6280 PyErr_SetString(PyExc_OverflowError, 6281 "repeated string is too long"); 6282 return NULL; 6283 } 6284 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 6285 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 6286 PyErr_SetString(PyExc_OverflowError, 6287 "repeated string is too long"); 6288 return NULL; 6289 } 6290 u = _PyUnicode_New(nchars); 6291 if (!u) 6292 return NULL; 6293 6294 p = u->str; 6295 6296 if (str->length == 1 && len > 0) { 6297 Py_UNICODE_FILL(p, str->str[0], len); 6298 } else { 6299 Py_ssize_t done = 0; /* number of characters copied this far */ 6300 if (done < nchars) { 6301 Py_UNICODE_COPY(p, str->str, str->length); 6302 done = str->length; 6303 } 6304 while (done < nchars) { 6305 int n = (done <= nchars-done) ? 
done : nchars-done; 6306 Py_UNICODE_COPY(p+done, p, n); 6307 done += n; 6308 } 6309 } 6310 6311 return (PyObject*) u; 6312} 6313 6314PyObject *PyUnicode_Replace(PyObject *obj, 6315 PyObject *subobj, 6316 PyObject *replobj, 6317 Py_ssize_t maxcount) 6318{ 6319 PyObject *self; 6320 PyObject *str1; 6321 PyObject *str2; 6322 PyObject *result; 6323 6324 self = PyUnicode_FromObject(obj); 6325 if (self == NULL) 6326 return NULL; 6327 str1 = PyUnicode_FromObject(subobj); 6328 if (str1 == NULL) { 6329 Py_DECREF(self); 6330 return NULL; 6331 } 6332 str2 = PyUnicode_FromObject(replobj); 6333 if (str2 == NULL) { 6334 Py_DECREF(self); 6335 Py_DECREF(str1); 6336 return NULL; 6337 } 6338 result = replace((PyUnicodeObject *)self, 6339 (PyUnicodeObject *)str1, 6340 (PyUnicodeObject *)str2, 6341 maxcount); 6342 Py_DECREF(self); 6343 Py_DECREF(str1); 6344 Py_DECREF(str2); 6345 return result; 6346} 6347 6348PyDoc_STRVAR(replace__doc__, 6349"S.replace (old, new[, maxsplit]) -> unicode\n\ 6350\n\ 6351Return a copy of S with all occurrences of substring\n\ 6352old replaced by new. 
If the optional argument maxsplit is\n\ 6353given, only the first maxsplit occurrences are replaced."); 6354 6355static PyObject* 6356unicode_replace(PyUnicodeObject *self, PyObject *args) 6357{ 6358 PyUnicodeObject *str1; 6359 PyUnicodeObject *str2; 6360 Py_ssize_t maxcount = -1; 6361 PyObject *result; 6362 6363 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 6364 return NULL; 6365 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 6366 if (str1 == NULL) 6367 return NULL; 6368 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 6369 if (str2 == NULL) { 6370 Py_DECREF(str1); 6371 return NULL; 6372 } 6373 6374 result = replace(self, str1, str2, maxcount); 6375 6376 Py_DECREF(str1); 6377 Py_DECREF(str2); 6378 return result; 6379} 6380 6381static 6382PyObject *unicode_repr(PyObject *unicode) 6383{ 6384 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 6385 PyUnicode_GET_SIZE(unicode), 6386 1); 6387} 6388 6389PyDoc_STRVAR(rfind__doc__, 6390"S.rfind(sub [,start [,end]]) -> int\n\ 6391\n\ 6392Return the highest index in S where substring sub is found,\n\ 6393such that sub is contained within s[start,end]. 
Optional\n\ 6394arguments start and end are interpreted as in slice notation.\n\ 6395\n\ 6396Return -1 on failure."); 6397 6398static PyObject * 6399unicode_rfind(PyUnicodeObject *self, PyObject *args) 6400{ 6401 PyObject *substring; 6402 Py_ssize_t start = 0; 6403 Py_ssize_t end = PY_SSIZE_T_MAX; 6404 Py_ssize_t result; 6405 6406 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 6407 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6408 return NULL; 6409 substring = PyUnicode_FromObject(substring); 6410 if (!substring) 6411 return NULL; 6412 6413 result = stringlib_rfind_slice( 6414 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6415 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6416 start, end 6417 ); 6418 6419 Py_DECREF(substring); 6420 6421 return PyInt_FromSsize_t(result); 6422} 6423 6424PyDoc_STRVAR(rindex__doc__, 6425"S.rindex(sub [,start [,end]]) -> int\n\ 6426\n\ 6427Like S.rfind() but raise ValueError when the substring is not found."); 6428 6429static PyObject * 6430unicode_rindex(PyUnicodeObject *self, PyObject *args) 6431{ 6432 PyObject *substring; 6433 Py_ssize_t start = 0; 6434 Py_ssize_t end = PY_SSIZE_T_MAX; 6435 Py_ssize_t result; 6436 6437 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 6438 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6439 return NULL; 6440 substring = PyUnicode_FromObject(substring); 6441 if (!substring) 6442 return NULL; 6443 6444 result = stringlib_rfind_slice( 6445 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6446 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6447 start, end 6448 ); 6449 6450 Py_DECREF(substring); 6451 6452 if (result < 0) { 6453 PyErr_SetString(PyExc_ValueError, "substring not found"); 6454 return NULL; 6455 } 6456 return PyInt_FromSsize_t(result); 6457} 6458 6459PyDoc_STRVAR(rjust__doc__, 6460"S.rjust(width[, fillchar]) -> unicode\n\ 6461\n\ 6462Return S right justified in a Unicode string of length width. 
Padding is\n\ 6463done using the specified fill character (default is a space)."); 6464 6465static PyObject * 6466unicode_rjust(PyUnicodeObject *self, PyObject *args) 6467{ 6468 Py_ssize_t width; 6469 Py_UNICODE fillchar = ' '; 6470 6471 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 6472 return NULL; 6473 6474 if (self->length >= width && PyUnicode_CheckExact(self)) { 6475 Py_INCREF(self); 6476 return (PyObject*) self; 6477 } 6478 6479 return (PyObject*) pad(self, width - self->length, 0, fillchar); 6480} 6481 6482static PyObject* 6483unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) 6484{ 6485 /* standard clamping */ 6486 if (start < 0) 6487 start = 0; 6488 if (end < 0) 6489 end = 0; 6490 if (end > self->length) 6491 end = self->length; 6492 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 6493 /* full slice, return original string */ 6494 Py_INCREF(self); 6495 return (PyObject*) self; 6496 } 6497 if (start > end) 6498 start = end; 6499 /* copy slice */ 6500 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 6501 end - start); 6502} 6503 6504PyObject *PyUnicode_Split(PyObject *s, 6505 PyObject *sep, 6506 Py_ssize_t maxsplit) 6507{ 6508 PyObject *result; 6509 6510 s = PyUnicode_FromObject(s); 6511 if (s == NULL) 6512 return NULL; 6513 if (sep != NULL) { 6514 sep = PyUnicode_FromObject(sep); 6515 if (sep == NULL) { 6516 Py_DECREF(s); 6517 return NULL; 6518 } 6519 } 6520 6521 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 6522 6523 Py_DECREF(s); 6524 Py_XDECREF(sep); 6525 return result; 6526} 6527 6528PyDoc_STRVAR(split__doc__, 6529"S.split([sep [,maxsplit]]) -> list of strings\n\ 6530\n\ 6531Return a list of the words in S, using sep as the\n\ 6532delimiter string. If maxsplit is given, at most maxsplit\n\ 6533splits are done. 
If sep is not specified or is None,\n\ 6534any whitespace string is a separator."); 6535 6536static PyObject* 6537unicode_split(PyUnicodeObject *self, PyObject *args) 6538{ 6539 PyObject *substring = Py_None; 6540 Py_ssize_t maxcount = -1; 6541 6542 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 6543 return NULL; 6544 6545 if (substring == Py_None) 6546 return split(self, NULL, maxcount); 6547 else if (PyUnicode_Check(substring)) 6548 return split(self, (PyUnicodeObject *)substring, maxcount); 6549 else 6550 return PyUnicode_Split((PyObject *)self, substring, maxcount); 6551} 6552 6553PyObject * 6554PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 6555{ 6556 PyObject* str_obj; 6557 PyObject* sep_obj; 6558 PyObject* out; 6559 6560 str_obj = PyUnicode_FromObject(str_in); 6561 if (!str_obj) 6562 return NULL; 6563 sep_obj = PyUnicode_FromObject(sep_in); 6564 if (!sep_obj) { 6565 Py_DECREF(str_obj); 6566 return NULL; 6567 } 6568 6569 out = stringlib_partition( 6570 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 6571 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 6572 ); 6573 6574 Py_DECREF(sep_obj); 6575 Py_DECREF(str_obj); 6576 6577 return out; 6578} 6579 6580 6581PyObject * 6582PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 6583{ 6584 PyObject* str_obj; 6585 PyObject* sep_obj; 6586 PyObject* out; 6587 6588 str_obj = PyUnicode_FromObject(str_in); 6589 if (!str_obj) 6590 return NULL; 6591 sep_obj = PyUnicode_FromObject(sep_in); 6592 if (!sep_obj) { 6593 Py_DECREF(str_obj); 6594 return NULL; 6595 } 6596 6597 out = stringlib_rpartition( 6598 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 6599 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 6600 ); 6601 6602 Py_DECREF(sep_obj); 6603 Py_DECREF(str_obj); 6604 6605 return out; 6606} 6607 6608PyDoc_STRVAR(partition__doc__, 6609"S.partition(sep) -> (head, sep, tail)\n\ 6610\n\ 6611Searches for the separator 
sep in S, and returns the part before it,\n\ 6612the separator itself, and the part after it. If the separator is not\n\ 6613found, returns S and two empty strings."); 6614 6615static PyObject* 6616unicode_partition(PyUnicodeObject *self, PyObject *separator) 6617{ 6618 return PyUnicode_Partition((PyObject *)self, separator); 6619} 6620 6621PyDoc_STRVAR(rpartition__doc__, 6622"S.rpartition(sep) -> (head, sep, tail)\n\ 6623\n\ 6624Searches for the separator sep in S, starting at the end of S, and returns\n\ 6625the part before it, the separator itself, and the part after it. If the\n\ 6626separator is not found, returns S and two empty strings."); 6627 6628static PyObject* 6629unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 6630{ 6631 return PyUnicode_RPartition((PyObject *)self, separator); 6632} 6633 6634PyObject *PyUnicode_RSplit(PyObject *s, 6635 PyObject *sep, 6636 Py_ssize_t maxsplit) 6637{ 6638 PyObject *result; 6639 6640 s = PyUnicode_FromObject(s); 6641 if (s == NULL) 6642 return NULL; 6643 if (sep != NULL) { 6644 sep = PyUnicode_FromObject(sep); 6645 if (sep == NULL) { 6646 Py_DECREF(s); 6647 return NULL; 6648 } 6649 } 6650 6651 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 6652 6653 Py_DECREF(s); 6654 Py_XDECREF(sep); 6655 return result; 6656} 6657 6658PyDoc_STRVAR(rsplit__doc__, 6659"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 6660\n\ 6661Return a list of the words in S, using sep as the\n\ 6662delimiter string, starting at the end of the string and\n\ 6663working to the front. If maxsplit is given, at most maxsplit\n\ 6664splits are done. 
If sep is not specified, any whitespace string\n\ 6665is a separator."); 6666 6667static PyObject* 6668unicode_rsplit(PyUnicodeObject *self, PyObject *args) 6669{ 6670 PyObject *substring = Py_None; 6671 Py_ssize_t maxcount = -1; 6672 6673 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 6674 return NULL; 6675 6676 if (substring == Py_None) 6677 return rsplit(self, NULL, maxcount); 6678 else if (PyUnicode_Check(substring)) 6679 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 6680 else 6681 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 6682} 6683 6684PyDoc_STRVAR(splitlines__doc__, 6685"S.splitlines([keepends]]) -> list of strings\n\ 6686\n\ 6687Return a list of the lines in S, breaking at line boundaries.\n\ 6688Line breaks are not included in the resulting list unless keepends\n\ 6689is given and true."); 6690 6691static PyObject* 6692unicode_splitlines(PyUnicodeObject *self, PyObject *args) 6693{ 6694 int keepends = 0; 6695 6696 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 6697 return NULL; 6698 6699 return PyUnicode_Splitlines((PyObject *)self, keepends); 6700} 6701 6702static 6703PyObject *unicode_str(PyUnicodeObject *self) 6704{ 6705 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 6706} 6707 6708PyDoc_STRVAR(swapcase__doc__, 6709"S.swapcase() -> unicode\n\ 6710\n\ 6711Return a copy of S with uppercase characters converted to lowercase\n\ 6712and vice versa."); 6713 6714static PyObject* 6715unicode_swapcase(PyUnicodeObject *self) 6716{ 6717 return fixup(self, fixswapcase); 6718} 6719 6720PyDoc_STRVAR(translate__doc__, 6721"S.translate(table) -> unicode\n\ 6722\n\ 6723Return a copy of the string S, where all characters have been mapped\n\ 6724through the given translation table, which must be a mapping of\n\ 6725Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 6726Unmapped characters are left untouched. 
Characters mapped to None\n\ 6727are deleted."); 6728 6729static PyObject* 6730unicode_translate(PyUnicodeObject *self, PyObject *table) 6731{ 6732 return PyUnicode_TranslateCharmap(self->str, 6733 self->length, 6734 table, 6735 "ignore"); 6736} 6737 6738PyDoc_STRVAR(upper__doc__, 6739"S.upper() -> unicode\n\ 6740\n\ 6741Return a copy of S converted to uppercase."); 6742 6743static PyObject* 6744unicode_upper(PyUnicodeObject *self) 6745{ 6746 return fixup(self, fixupper); 6747} 6748 6749PyDoc_STRVAR(zfill__doc__, 6750"S.zfill(width) -> unicode\n\ 6751\n\ 6752Pad a numeric string x with zeros on the left, to fill a field\n\ 6753of the specified width. The string x is never truncated."); 6754 6755static PyObject * 6756unicode_zfill(PyUnicodeObject *self, PyObject *args) 6757{ 6758 Py_ssize_t fill; 6759 PyUnicodeObject *u; 6760 6761 Py_ssize_t width; 6762 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 6763 return NULL; 6764 6765 if (self->length >= width) { 6766 if (PyUnicode_CheckExact(self)) { 6767 Py_INCREF(self); 6768 return (PyObject*) self; 6769 } 6770 else 6771 return PyUnicode_FromUnicode( 6772 PyUnicode_AS_UNICODE(self), 6773 PyUnicode_GET_SIZE(self) 6774 ); 6775 } 6776 6777 fill = width - self->length; 6778 6779 u = pad(self, fill, 0, '0'); 6780 6781 if (u == NULL) 6782 return NULL; 6783 6784 if (u->str[fill] == '+' || u->str[fill] == '-') { 6785 /* move sign to beginning of string */ 6786 u->str[0] = u->str[fill]; 6787 u->str[fill] = '0'; 6788 } 6789 6790 return (PyObject*) u; 6791} 6792 6793#if 0 6794static PyObject* 6795unicode_freelistsize(PyUnicodeObject *self) 6796{ 6797 return PyInt_FromLong(unicode_freelist_size); 6798} 6799#endif 6800 6801PyDoc_STRVAR(startswith__doc__, 6802"S.startswith(prefix[, start[, end]]) -> bool\n\ 6803\n\ 6804Return True if S starts with the specified prefix, False otherwise.\n\ 6805With optional start, test S beginning at that position.\n\ 6806With optional end, stop comparing S at that position.\n\ 6807prefix can also be 
a tuple of strings to try."); 6808 6809static PyObject * 6810unicode_startswith(PyUnicodeObject *self, 6811 PyObject *args) 6812{ 6813 PyObject *subobj; 6814 PyUnicodeObject *substring; 6815 Py_ssize_t start = 0; 6816 Py_ssize_t end = PY_SSIZE_T_MAX; 6817 int result; 6818 6819 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 6820 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6821 return NULL; 6822 if (PyTuple_Check(subobj)) { 6823 Py_ssize_t i; 6824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 6825 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6826 PyTuple_GET_ITEM(subobj, i)); 6827 if (substring == NULL) 6828 return NULL; 6829 result = tailmatch(self, substring, start, end, -1); 6830 Py_DECREF(substring); 6831 if (result) { 6832 Py_RETURN_TRUE; 6833 } 6834 } 6835 /* nothing matched */ 6836 Py_RETURN_FALSE; 6837 } 6838 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 6839 if (substring == NULL) 6840 return NULL; 6841 result = tailmatch(self, substring, start, end, -1); 6842 Py_DECREF(substring); 6843 return PyBool_FromLong(result); 6844} 6845 6846 6847PyDoc_STRVAR(endswith__doc__, 6848"S.endswith(suffix[, start[, end]]) -> bool\n\ 6849\n\ 6850Return True if S ends with the specified suffix, False otherwise.\n\ 6851With optional start, test S beginning at that position.\n\ 6852With optional end, stop comparing S at that position.\n\ 6853suffix can also be a tuple of strings to try."); 6854 6855static PyObject * 6856unicode_endswith(PyUnicodeObject *self, 6857 PyObject *args) 6858{ 6859 PyObject *subobj; 6860 PyUnicodeObject *substring; 6861 Py_ssize_t start = 0; 6862 Py_ssize_t end = PY_SSIZE_T_MAX; 6863 int result; 6864 6865 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 6866 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6867 return NULL; 6868 if (PyTuple_Check(subobj)) { 6869 Py_ssize_t i; 6870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 6871 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6872 
PyTuple_GET_ITEM(subobj, i)); 6873 if (substring == NULL) 6874 return NULL; 6875 result = tailmatch(self, substring, start, end, +1); 6876 Py_DECREF(substring); 6877 if (result) { 6878 Py_RETURN_TRUE; 6879 } 6880 } 6881 Py_RETURN_FALSE; 6882 } 6883 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 6884 if (substring == NULL) 6885 return NULL; 6886 6887 result = tailmatch(self, substring, start, end, +1); 6888 Py_DECREF(substring); 6889 return PyBool_FromLong(result); 6890} 6891 6892 6893 6894static PyObject * 6895unicode_getnewargs(PyUnicodeObject *v) 6896{ 6897 return Py_BuildValue("(u#)", v->str, v->length); 6898} 6899 6900 6901static PyMethodDef unicode_methods[] = { 6902 6903 /* Order is according to common usage: often used methods should 6904 appear first, since lookup is done sequentially. */ 6905 6906 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 6907 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 6908 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 6909 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 6910 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 6911 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 6912 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 6913 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 6914 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 6915 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 6916 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 6917 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 6918 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 6919 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 6920 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 6921 {"lstrip", (PyCFunction) 
/* Method table of the unicode type.

   Each entry is {python name, C implementation, calling-convention flag,
   docstring}.  METH_VARARGS functions receive an argument tuple, METH_O
   functions receive the single argument directly and METH_NOARGS
   functions take no arguments.  The {NULL, NULL} entry terminates the
   table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring: the fourth field is left to zero-initialisation. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};

/* nb_remainder slot: implements "format % args".

   Returns Py_NotImplemented (new reference) when the left operand is not a
   unicode object; otherwise formatting is done by PyUnicode_Format(). */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (!PyUnicode_Check(v)) {
        Py_INCREF(Py_NotImplemented);
        return Py_NotImplemented;
    }
    return PyUnicode_Format(v, w);
}

/* Number protocol: only the remainder slot is filled in; the slots after
   it are omitted from the initialiser and are therefore zero. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    unicode_mod,                /*nb_remainder*/
};

/* Sequence protocol.  The casts adapt functions that take a
   PyUnicodeObject* first argument to the generic slot signatures; the two
   assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
(Py_UNICODE *)PyMem_MALLOC(slicelength* 7014 sizeof(Py_UNICODE)); 7015 7016 if (result_buf == NULL) 7017 return PyErr_NoMemory(); 7018 7019 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 7020 result_buf[i] = source_buf[cur]; 7021 } 7022 7023 result = PyUnicode_FromUnicode(result_buf, slicelength); 7024 PyMem_FREE(result_buf); 7025 return result; 7026 } 7027 } else { 7028 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 7029 return NULL; 7030 } 7031} 7032 7033static PyMappingMethods unicode_as_mapping = { 7034 (lenfunc)unicode_length, /* mp_length */ 7035 (binaryfunc)unicode_subscript, /* mp_subscript */ 7036 (objobjargproc)0, /* mp_ass_subscript */ 7037}; 7038 7039static Py_ssize_t 7040unicode_buffer_getreadbuf(PyUnicodeObject *self, 7041 Py_ssize_t index, 7042 const void **ptr) 7043{ 7044 if (index != 0) { 7045 PyErr_SetString(PyExc_SystemError, 7046 "accessing non-existent unicode segment"); 7047 return -1; 7048 } 7049 *ptr = (void *) self->str; 7050 return PyUnicode_GET_DATA_SIZE(self); 7051} 7052 7053static Py_ssize_t 7054unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index, 7055 const void **ptr) 7056{ 7057 PyErr_SetString(PyExc_TypeError, 7058 "cannot use unicode as modifiable buffer"); 7059 return -1; 7060} 7061 7062static int 7063unicode_buffer_getsegcount(PyUnicodeObject *self, 7064 Py_ssize_t *lenp) 7065{ 7066 if (lenp) 7067 *lenp = PyUnicode_GET_DATA_SIZE(self); 7068 return 1; 7069} 7070 7071static Py_ssize_t 7072unicode_buffer_getcharbuf(PyUnicodeObject *self, 7073 Py_ssize_t index, 7074 const void **ptr) 7075{ 7076 PyObject *str; 7077 7078 if (index != 0) { 7079 PyErr_SetString(PyExc_SystemError, 7080 "accessing non-existent unicode segment"); 7081 return -1; 7082 } 7083 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 7084 if (str == NULL) 7085 return -1; 7086 *ptr = (void *) PyString_AS_STRING(str); 7087 return PyString_GET_SIZE(str); 7088} 7089 7090/* Helpers for PyUnicode_Format() */ 
7091 7092static PyObject * 7093getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 7094{ 7095 Py_ssize_t argidx = *p_argidx; 7096 if (argidx < arglen) { 7097 (*p_argidx)++; 7098 if (arglen < 0) 7099 return args; 7100 else 7101 return PyTuple_GetItem(args, argidx); 7102 } 7103 PyErr_SetString(PyExc_TypeError, 7104 "not enough arguments for format string"); 7105 return NULL; 7106} 7107 7108#define F_LJUST (1<<0) 7109#define F_SIGN (1<<1) 7110#define F_BLANK (1<<2) 7111#define F_ALT (1<<3) 7112#define F_ZERO (1<<4) 7113 7114static Py_ssize_t 7115strtounicode(Py_UNICODE *buffer, const char *charbuffer) 7116{ 7117 register Py_ssize_t i; 7118 Py_ssize_t len = strlen(charbuffer); 7119 for (i = len - 1; i >= 0; i--) 7120 buffer[i] = (Py_UNICODE) charbuffer[i]; 7121 7122 return len; 7123} 7124 7125static int 7126doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 7127{ 7128 Py_ssize_t result; 7129 7130 PyOS_ascii_formatd((char *)buffer, len, format, x); 7131 result = strtounicode(buffer, (char *)buffer); 7132 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 7133} 7134 7135static int 7136longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 7137{ 7138 Py_ssize_t result; 7139 7140 PyOS_snprintf((char *)buffer, len, format, x); 7141 result = strtounicode(buffer, (char *)buffer); 7142 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 7143} 7144 7145/* XXX To save some code duplication, formatfloat/long/int could have been 7146 shared with stringobject.c, converting from 8-bit to Unicode after the 7147 formatting is done. */ 7148 7149static int 7150formatfloat(Py_UNICODE *buf, 7151 size_t buflen, 7152 int flags, 7153 int prec, 7154 int type, 7155 PyObject *v) 7156{ 7157 /* fmt = '%#.' 
+ `prec` + `type` 7158 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 7159 char fmt[20]; 7160 double x; 7161 7162 x = PyFloat_AsDouble(v); 7163 if (x == -1.0 && PyErr_Occurred()) 7164 return -1; 7165 if (prec < 0) 7166 prec = 6; 7167 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 7168 type = 'g'; 7169 /* Worst case length calc to ensure no buffer overrun: 7170 7171 'g' formats: 7172 fmt = %#.<prec>g 7173 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 7174 for any double rep.) 7175 len = 1 + prec + 1 + 2 + 5 = 9 + prec 7176 7177 'f' formats: 7178 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 7179 len = 1 + 50 + 1 + prec = 52 + prec 7180 7181 If prec=0 the effective precision is 1 (the leading digit is 7182 always given), therefore increase the length by one. 7183 7184 */ 7185 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) || 7186 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 7187 PyErr_SetString(PyExc_OverflowError, 7188 "formatted float is too long (precision too large?)"); 7189 return -1; 7190 } 7191 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 7192 (flags&F_ALT) ? "#" : "", 7193 prec, type); 7194 return doubletounicode(buf, buflen, fmt, x); 7195} 7196 7197static PyObject* 7198formatlong(PyObject *val, int flags, int prec, int type) 7199{ 7200 char *buf; 7201 int i, len; 7202 PyObject *str; /* temporary string object. */ 7203 PyUnicodeObject *result; 7204 7205 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 7206 if (!str) 7207 return NULL; 7208 result = _PyUnicode_New(len); 7209 if (!result) { 7210 Py_DECREF(str); 7211 return NULL; 7212 } 7213 for (i = 0; i < len; i++) 7214 result->str[i] = buf[i]; 7215 result->str[len] = 0; 7216 Py_DECREF(str); 7217 return (PyObject*)result; 7218} 7219 7220static int 7221formatint(Py_UNICODE *buf, 7222 size_t buflen, 7223 int flags, 7224 int prec, 7225 int type, 7226 PyObject *v) 7227{ 7228 /* fmt = '%#.' 
+ `prec` + 'l' + `type` 7229 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 7230 * + 1 + 1 7231 * = 24 7232 */ 7233 char fmt[64]; /* plenty big enough! */ 7234 char *sign; 7235 long x; 7236 7237 x = PyInt_AsLong(v); 7238 if (x == -1 && PyErr_Occurred()) 7239 return -1; 7240 if (x < 0 && type == 'u') { 7241 type = 'd'; 7242 } 7243 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 7244 sign = "-"; 7245 else 7246 sign = ""; 7247 if (prec < 0) 7248 prec = 1; 7249 7250 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 7251 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 7252 */ 7253 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 7254 PyErr_SetString(PyExc_OverflowError, 7255 "formatted integer is too long (precision too large?)"); 7256 return -1; 7257 } 7258 7259 if ((flags & F_ALT) && 7260 (type == 'x' || type == 'X')) { 7261 /* When converting under %#x or %#X, there are a number 7262 * of issues that cause pain: 7263 * - when 0 is being converted, the C standard leaves off 7264 * the '0x' or '0X', which is inconsistent with other 7265 * %#x/%#X conversions and inconsistent with Python's 7266 * hex() function 7267 * - there are platforms that violate the standard and 7268 * convert 0 with the '0x' or '0X' 7269 * (Metrowerks, Compaq Tru64) 7270 * - there are platforms that give '0x' when converting 7271 * under %#X, but convert 0 in accordance with the 7272 * standard (OS/2 EMX) 7273 * 7274 * We can achieve the desired consistency by inserting our 7275 * own '0x' or '0X' prefix, and substituting %x/%X in place 7276 * of %#x/%#X. 7277 * 7278 * Note that this is the same approach as used in 7279 * formatint() in stringobject.c 7280 */ 7281 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 7282 sign, type, prec, type); 7283 } 7284 else { 7285 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 7286 sign, (flags&F_ALT) ? 
"#" : "", 7287 prec, type); 7288 } 7289 if (sign[0]) 7290 return longtounicode(buf, buflen, fmt, -x); 7291 else 7292 return longtounicode(buf, buflen, fmt, x); 7293} 7294 7295static int 7296formatchar(Py_UNICODE *buf, 7297 size_t buflen, 7298 PyObject *v) 7299{ 7300 /* presume that the buffer is at least 2 characters long */ 7301 if (PyUnicode_Check(v)) { 7302 if (PyUnicode_GET_SIZE(v) != 1) 7303 goto onError; 7304 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 7305 } 7306 7307 else if (PyString_Check(v)) { 7308 if (PyString_GET_SIZE(v) != 1) 7309 goto onError; 7310 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 7311 } 7312 7313 else { 7314 /* Integer input truncated to a character */ 7315 long x; 7316 x = PyInt_AsLong(v); 7317 if (x == -1 && PyErr_Occurred()) 7318 goto onError; 7319#ifdef Py_UNICODE_WIDE 7320 if (x < 0 || x > 0x10ffff) { 7321 PyErr_SetString(PyExc_OverflowError, 7322 "%c arg not in range(0x110000) " 7323 "(wide Python build)"); 7324 return -1; 7325 } 7326#else 7327 if (x < 0 || x > 0xffff) { 7328 PyErr_SetString(PyExc_OverflowError, 7329 "%c arg not in range(0x10000) " 7330 "(narrow Python build)"); 7331 return -1; 7332 } 7333#endif 7334 buf[0] = (Py_UNICODE) x; 7335 } 7336 buf[1] = '\0'; 7337 return 1; 7338 7339 onError: 7340 PyErr_SetString(PyExc_TypeError, 7341 "%c requires int or char"); 7342 return -1; 7343} 7344 7345/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 7346 7347 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 7348 chars are formatted. XXX This is a magic number. Each formatting 7349 routine does bounds checking to ensure no overflow, but a better 7350 solution may be to malloc a buffer of appropriate size for each 7351 format. For now, the current solution is sufficient. 
7352*/ 7353#define FORMATBUFLEN (size_t)120 7354 7355PyObject *PyUnicode_Format(PyObject *format, 7356 PyObject *args) 7357{ 7358 Py_UNICODE *fmt, *res; 7359 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 7360 int args_owned = 0; 7361 PyUnicodeObject *result = NULL; 7362 PyObject *dict = NULL; 7363 PyObject *uformat; 7364 7365 if (format == NULL || args == NULL) { 7366 PyErr_BadInternalCall(); 7367 return NULL; 7368 } 7369 uformat = PyUnicode_FromObject(format); 7370 if (uformat == NULL) 7371 return NULL; 7372 fmt = PyUnicode_AS_UNICODE(uformat); 7373 fmtcnt = PyUnicode_GET_SIZE(uformat); 7374 7375 reslen = rescnt = fmtcnt + 100; 7376 result = _PyUnicode_New(reslen); 7377 if (result == NULL) 7378 goto onError; 7379 res = PyUnicode_AS_UNICODE(result); 7380 7381 if (PyTuple_Check(args)) { 7382 arglen = PyTuple_Size(args); 7383 argidx = 0; 7384 } 7385 else { 7386 arglen = -1; 7387 argidx = -2; 7388 } 7389 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 7390 !PyObject_TypeCheck(args, &PyBaseString_Type)) 7391 dict = args; 7392 7393 while (--fmtcnt >= 0) { 7394 if (*fmt != '%') { 7395 if (--rescnt < 0) { 7396 rescnt = fmtcnt + 100; 7397 reslen += rescnt; 7398 if (_PyUnicode_Resize(&result, reslen) < 0) 7399 goto onError; 7400 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 7401 --rescnt; 7402 } 7403 *res++ = *fmt++; 7404 } 7405 else { 7406 /* Got a format specifier */ 7407 int flags = 0; 7408 Py_ssize_t width = -1; 7409 int prec = -1; 7410 Py_UNICODE c = '\0'; 7411 Py_UNICODE fill; 7412 PyObject *v = NULL; 7413 PyObject *temp = NULL; 7414 Py_UNICODE *pbuf; 7415 Py_UNICODE sign; 7416 Py_ssize_t len; 7417 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 7418 7419 fmt++; 7420 if (*fmt == '(') { 7421 Py_UNICODE *keystart; 7422 Py_ssize_t keylen; 7423 PyObject *key; 7424 int pcount = 1; 7425 7426 if (dict == NULL) { 7427 PyErr_SetString(PyExc_TypeError, 7428 "format requires a mapping"); 7429 goto onError; 7430 } 7431 ++fmt; 7432 
--fmtcnt; 7433 keystart = fmt; 7434 /* Skip over balanced parentheses */ 7435 while (pcount > 0 && --fmtcnt >= 0) { 7436 if (*fmt == ')') 7437 --pcount; 7438 else if (*fmt == '(') 7439 ++pcount; 7440 fmt++; 7441 } 7442 keylen = fmt - keystart - 1; 7443 if (fmtcnt < 0 || pcount > 0) { 7444 PyErr_SetString(PyExc_ValueError, 7445 "incomplete format key"); 7446 goto onError; 7447 } 7448#if 0 7449 /* keys are converted to strings using UTF-8 and 7450 then looked up since Python uses strings to hold 7451 variables names etc. in its namespaces and we 7452 wouldn't want to break common idioms. */ 7453 key = PyUnicode_EncodeUTF8(keystart, 7454 keylen, 7455 NULL); 7456#else 7457 key = PyUnicode_FromUnicode(keystart, keylen); 7458#endif 7459 if (key == NULL) 7460 goto onError; 7461 if (args_owned) { 7462 Py_DECREF(args); 7463 args_owned = 0; 7464 } 7465 args = PyObject_GetItem(dict, key); 7466 Py_DECREF(key); 7467 if (args == NULL) { 7468 goto onError; 7469 } 7470 args_owned = 1; 7471 arglen = -1; 7472 argidx = -2; 7473 } 7474 while (--fmtcnt >= 0) { 7475 switch (c = *fmt++) { 7476 case '-': flags |= F_LJUST; continue; 7477 case '+': flags |= F_SIGN; continue; 7478 case ' ': flags |= F_BLANK; continue; 7479 case '#': flags |= F_ALT; continue; 7480 case '0': flags |= F_ZERO; continue; 7481 } 7482 break; 7483 } 7484 if (c == '*') { 7485 v = getnextarg(args, arglen, &argidx); 7486 if (v == NULL) 7487 goto onError; 7488 if (!PyInt_Check(v)) { 7489 PyErr_SetString(PyExc_TypeError, 7490 "* wants int"); 7491 goto onError; 7492 } 7493 width = PyInt_AsLong(v); 7494 if (width < 0) { 7495 flags |= F_LJUST; 7496 width = -width; 7497 } 7498 if (--fmtcnt >= 0) 7499 c = *fmt++; 7500 } 7501 else if (c >= '0' && c <= '9') { 7502 width = c - '0'; 7503 while (--fmtcnt >= 0) { 7504 c = *fmt++; 7505 if (c < '0' || c > '9') 7506 break; 7507 if ((width*10) / 10 != width) { 7508 PyErr_SetString(PyExc_ValueError, 7509 "width too big"); 7510 goto onError; 7511 } 7512 width = width*10 + (c - '0'); 7513 
} 7514 } 7515 if (c == '.') { 7516 prec = 0; 7517 if (--fmtcnt >= 0) 7518 c = *fmt++; 7519 if (c == '*') { 7520 v = getnextarg(args, arglen, &argidx); 7521 if (v == NULL) 7522 goto onError; 7523 if (!PyInt_Check(v)) { 7524 PyErr_SetString(PyExc_TypeError, 7525 "* wants int"); 7526 goto onError; 7527 } 7528 prec = PyInt_AsLong(v); 7529 if (prec < 0) 7530 prec = 0; 7531 if (--fmtcnt >= 0) 7532 c = *fmt++; 7533 } 7534 else if (c >= '0' && c <= '9') { 7535 prec = c - '0'; 7536 while (--fmtcnt >= 0) { 7537 c = Py_CHARMASK(*fmt++); 7538 if (c < '0' || c > '9') 7539 break; 7540 if ((prec*10) / 10 != prec) { 7541 PyErr_SetString(PyExc_ValueError, 7542 "prec too big"); 7543 goto onError; 7544 } 7545 prec = prec*10 + (c - '0'); 7546 } 7547 } 7548 } /* prec */ 7549 if (fmtcnt >= 0) { 7550 if (c == 'h' || c == 'l' || c == 'L') { 7551 if (--fmtcnt >= 0) 7552 c = *fmt++; 7553 } 7554 } 7555 if (fmtcnt < 0) { 7556 PyErr_SetString(PyExc_ValueError, 7557 "incomplete format"); 7558 goto onError; 7559 } 7560 if (c != '%') { 7561 v = getnextarg(args, arglen, &argidx); 7562 if (v == NULL) 7563 goto onError; 7564 } 7565 sign = 0; 7566 fill = ' '; 7567 switch (c) { 7568 7569 case '%': 7570 pbuf = formatbuf; 7571 /* presume that buffer length is at least 1 */ 7572 pbuf[0] = '%'; 7573 len = 1; 7574 break; 7575 7576 case 's': 7577 case 'r': 7578 if (PyUnicode_Check(v) && c == 's') { 7579 temp = v; 7580 Py_INCREF(temp); 7581 } 7582 else { 7583 PyObject *unicode; 7584 if (c == 's') 7585 temp = PyObject_Unicode(v); 7586 else 7587 temp = PyObject_Repr(v); 7588 if (temp == NULL) 7589 goto onError; 7590 if (PyUnicode_Check(temp)) 7591 /* nothing to do */; 7592 else if (PyString_Check(temp)) { 7593 /* convert to string to Unicode */ 7594 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 7595 PyString_GET_SIZE(temp), 7596 NULL, 7597 "strict"); 7598 Py_DECREF(temp); 7599 temp = unicode; 7600 if (temp == NULL) 7601 goto onError; 7602 } 7603 else { 7604 Py_DECREF(temp); 7605 
PyErr_SetString(PyExc_TypeError, 7606 "%s argument has non-string str()"); 7607 goto onError; 7608 } 7609 } 7610 pbuf = PyUnicode_AS_UNICODE(temp); 7611 len = PyUnicode_GET_SIZE(temp); 7612 if (prec >= 0 && len > prec) 7613 len = prec; 7614 break; 7615 7616 case 'i': 7617 case 'd': 7618 case 'u': 7619 case 'o': 7620 case 'x': 7621 case 'X': 7622 if (c == 'i') 7623 c = 'd'; 7624 if (PyLong_Check(v)) { 7625 temp = formatlong(v, flags, prec, c); 7626 if (!temp) 7627 goto onError; 7628 pbuf = PyUnicode_AS_UNICODE(temp); 7629 len = PyUnicode_GET_SIZE(temp); 7630 sign = 1; 7631 } 7632 else { 7633 pbuf = formatbuf; 7634 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 7635 flags, prec, c, v); 7636 if (len < 0) 7637 goto onError; 7638 sign = 1; 7639 } 7640 if (flags & F_ZERO) 7641 fill = '0'; 7642 break; 7643 7644 case 'e': 7645 case 'E': 7646 case 'f': 7647 case 'F': 7648 case 'g': 7649 case 'G': 7650 if (c == 'F') 7651 c = 'f'; 7652 pbuf = formatbuf; 7653 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 7654 flags, prec, c, v); 7655 if (len < 0) 7656 goto onError; 7657 sign = 1; 7658 if (flags & F_ZERO) 7659 fill = '0'; 7660 break; 7661 7662 case 'c': 7663 pbuf = formatbuf; 7664 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 7665 if (len < 0) 7666 goto onError; 7667 break; 7668 7669 default: 7670 PyErr_Format(PyExc_ValueError, 7671 "unsupported format character '%c' (0x%x) " 7672 "at index %i", 7673 (31<=c && c<=126) ? 
(char)c : '?', 7674 (int)c, 7675 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat))); 7676 goto onError; 7677 } 7678 if (sign) { 7679 if (*pbuf == '-' || *pbuf == '+') { 7680 sign = *pbuf++; 7681 len--; 7682 } 7683 else if (flags & F_SIGN) 7684 sign = '+'; 7685 else if (flags & F_BLANK) 7686 sign = ' '; 7687 else 7688 sign = 0; 7689 } 7690 if (width < len) 7691 width = len; 7692 if (rescnt - (sign != 0) < width) { 7693 reslen -= rescnt; 7694 rescnt = width + fmtcnt + 100; 7695 reslen += rescnt; 7696 if (reslen < 0) { 7697 Py_XDECREF(temp); 7698 PyErr_NoMemory(); 7699 goto onError; 7700 } 7701 if (_PyUnicode_Resize(&result, reslen) < 0) { 7702 Py_XDECREF(temp); 7703 goto onError; 7704 } 7705 res = PyUnicode_AS_UNICODE(result) 7706 + reslen - rescnt; 7707 } 7708 if (sign) { 7709 if (fill != ' ') 7710 *res++ = sign; 7711 rescnt--; 7712 if (width > len) 7713 width--; 7714 } 7715 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 7716 assert(pbuf[0] == '0'); 7717 assert(pbuf[1] == c); 7718 if (fill != ' ') { 7719 *res++ = *pbuf++; 7720 *res++ = *pbuf++; 7721 } 7722 rescnt -= 2; 7723 width -= 2; 7724 if (width < 0) 7725 width = 0; 7726 len -= 2; 7727 } 7728 if (width > len && !(flags & F_LJUST)) { 7729 do { 7730 --rescnt; 7731 *res++ = fill; 7732 } while (--width > len); 7733 } 7734 if (fill == ' ') { 7735 if (sign) 7736 *res++ = sign; 7737 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 7738 assert(pbuf[0] == '0'); 7739 assert(pbuf[1] == c); 7740 *res++ = *pbuf++; 7741 *res++ = *pbuf++; 7742 } 7743 } 7744 Py_UNICODE_COPY(res, pbuf, len); 7745 res += len; 7746 rescnt -= len; 7747 while (--width >= len) { 7748 --rescnt; 7749 *res++ = ' '; 7750 } 7751 if (dict && (argidx < arglen) && c != '%') { 7752 PyErr_SetString(PyExc_TypeError, 7753 "not all arguments converted during string formatting"); 7754 Py_XDECREF(temp); 7755 goto onError; 7756 } 7757 Py_XDECREF(temp); 7758 } /* '%' */ 7759 } /* until end */ 7760 if (argidx < arglen && !dict) { 7761 PyErr_SetString(PyExc_TypeError, 
7762 "not all arguments converted during string formatting"); 7763 goto onError; 7764 } 7765 7766 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 7767 goto onError; 7768 if (args_owned) { 7769 Py_DECREF(args); 7770 } 7771 Py_DECREF(uformat); 7772 return (PyObject *)result; 7773 7774 onError: 7775 Py_XDECREF(result); 7776 Py_DECREF(uformat); 7777 if (args_owned) { 7778 Py_DECREF(args); 7779 } 7780 return NULL; 7781} 7782 7783static PyBufferProcs unicode_as_buffer = { 7784 (readbufferproc) unicode_buffer_getreadbuf, 7785 (writebufferproc) unicode_buffer_getwritebuf, 7786 (segcountproc) unicode_buffer_getsegcount, 7787 (charbufferproc) unicode_buffer_getcharbuf, 7788}; 7789 7790static PyObject * 7791unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 7792 7793static PyObject * 7794unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 7795{ 7796 PyObject *x = NULL; 7797 static char *kwlist[] = {"string", "encoding", "errors", 0}; 7798 char *encoding = NULL; 7799 char *errors = NULL; 7800 7801 if (type != &PyUnicode_Type) 7802 return unicode_subtype_new(type, args, kwds); 7803 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 7804 kwlist, &x, &encoding, &errors)) 7805 return NULL; 7806 if (x == NULL) 7807 return (PyObject *)_PyUnicode_New(0); 7808 if (encoding == NULL && errors == NULL) 7809 return PyObject_Unicode(x); 7810 else 7811 return PyUnicode_FromEncodedObject(x, encoding, errors); 7812} 7813 7814static PyObject * 7815unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 7816{ 7817 PyUnicodeObject *tmp, *pnew; 7818 Py_ssize_t n; 7819 7820 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 7821 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 7822 if (tmp == NULL) 7823 return NULL; 7824 assert(PyUnicode_Check(tmp)); 7825 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 7826 if (pnew == NULL) { 7827 Py_DECREF(tmp); 7828 return NULL; 7829 } 7830 pnew->str = 
PyMem_NEW(Py_UNICODE, n+1); 7831 if (pnew->str == NULL) { 7832 _Py_ForgetReference((PyObject *)pnew); 7833 PyObject_Del(pnew); 7834 Py_DECREF(tmp); 7835 return PyErr_NoMemory(); 7836 } 7837 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 7838 pnew->length = n; 7839 pnew->hash = tmp->hash; 7840 Py_DECREF(tmp); 7841 return (PyObject *)pnew; 7842} 7843 7844PyDoc_STRVAR(unicode_doc, 7845"unicode(string [, encoding[, errors]]) -> object\n\ 7846\n\ 7847Create a new Unicode object from the given encoded string.\n\ 7848encoding defaults to the current default string encoding.\n\ 7849errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 7850 7851PyTypeObject PyUnicode_Type = { 7852 PyObject_HEAD_INIT(&PyType_Type) 7853 0, /* ob_size */ 7854 "unicode", /* tp_name */ 7855 sizeof(PyUnicodeObject), /* tp_size */ 7856 0, /* tp_itemsize */ 7857 /* Slots */ 7858 (destructor)unicode_dealloc, /* tp_dealloc */ 7859 0, /* tp_print */ 7860 0, /* tp_getattr */ 7861 0, /* tp_setattr */ 7862 (cmpfunc) unicode_compare, /* tp_compare */ 7863 unicode_repr, /* tp_repr */ 7864 &unicode_as_number, /* tp_as_number */ 7865 &unicode_as_sequence, /* tp_as_sequence */ 7866 &unicode_as_mapping, /* tp_as_mapping */ 7867 (hashfunc) unicode_hash, /* tp_hash*/ 7868 0, /* tp_call*/ 7869 (reprfunc) unicode_str, /* tp_str */ 7870 PyObject_GenericGetAttr, /* tp_getattro */ 7871 0, /* tp_setattro */ 7872 &unicode_as_buffer, /* tp_as_buffer */ 7873 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 7874 unicode_doc, /* tp_doc */ 7875 0, /* tp_traverse */ 7876 0, /* tp_clear */ 7877 0, /* tp_richcompare */ 7878 0, /* tp_weaklistoffset */ 7879 0, /* tp_iter */ 7880 0, /* tp_iternext */ 7881 unicode_methods, /* tp_methods */ 7882 0, /* tp_members */ 7883 0, /* tp_getset */ 7884 &PyBaseString_Type, /* tp_base */ 7885 0, /* tp_dict */ 7886 0, /* tp_descr_get */ 7887 0, /* tp_descr_set */ 7888 0, /* tp_dictoffset */ 7889 0, /* tp_init */ 7890 0, /* tp_alloc */ 7891 unicode_new, /* tp_new */ 
7892 PyObject_Del, /* tp_free */ 7893}; 7894 7895/* Initialize the Unicode implementation */ 7896 7897void _PyUnicode_Init(void) 7898{ 7899 int i; 7900 7901 /* XXX - move this array to unicodectype.c ? */ 7902 Py_UNICODE linebreak[] = { 7903 0x000A, /* LINE FEED */ 7904 0x000D, /* CARRIAGE RETURN */ 7905 0x001C, /* FILE SEPARATOR */ 7906 0x001D, /* GROUP SEPARATOR */ 7907 0x001E, /* RECORD SEPARATOR */ 7908 0x0085, /* NEXT LINE */ 7909 0x2028, /* LINE SEPARATOR */ 7910 0x2029, /* PARAGRAPH SEPARATOR */ 7911 }; 7912 7913 /* Init the implementation */ 7914 unicode_freelist = NULL; 7915 unicode_freelist_size = 0; 7916 unicode_empty = _PyUnicode_New(0); 7917 if (!unicode_empty) 7918 return; 7919 7920 strcpy(unicode_default_encoding, "ascii"); 7921 for (i = 0; i < 256; i++) 7922 unicode_latin1[i] = NULL; 7923 if (PyType_Ready(&PyUnicode_Type) < 0) 7924 Py_FatalError("Can't initialize 'unicode'"); 7925 7926 /* initialize the linebreak bloom filter */ 7927 bloom_linebreak = make_bloom_mask( 7928 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 7929 ); 7930 7931 PyType_Ready(&EncodingMapType); 7932} 7933 7934/* Finalize the Unicode implementation */ 7935 7936void 7937_PyUnicode_Fini(void) 7938{ 7939 PyUnicodeObject *u; 7940 int i; 7941 7942 Py_XDECREF(unicode_empty); 7943 unicode_empty = NULL; 7944 7945 for (i = 0; i < 256; i++) { 7946 if (unicode_latin1[i]) { 7947 Py_DECREF(unicode_latin1[i]); 7948 unicode_latin1[i] = NULL; 7949 } 7950 } 7951 7952 for (u = unicode_freelist; u != NULL;) { 7953 PyUnicodeObject *v = u; 7954 u = *(PyUnicodeObject **)u; 7955 if (v->str) 7956 PyMem_DEL(v->str); 7957 Py_XDECREF(v->defenc); 7958 PyObject_Del(v); 7959 } 7960 unicode_freelist = NULL; 7961 unicode_freelist_size = 0; 7962} 7963 7964#ifdef __cplusplus 7965} 7966#endif 7967 7968 7969/* 7970Local variables: 7971c-basic-offset: 4 7972indent-tabs-mode: nil 7973End: 7974*/ 7975