unicodeobject.c revision 782afc5927c5d37c3de1a082b6363a79e4bd5962
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WINDOWS 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 /* MvL said unicode->str[] may be signed. Python generally assumes 136 * an int contains at least 32 bits, and we don't use more than 137 * 32 bits even in a UCS4 build, so casting to unsigned int should 138 * be correct. 139 */ 140 (unsigned int)unicode->str[0] < 256U && 141 unicode_latin1[unicode->str[0]] == unicode)) { 142 PyErr_SetString(PyExc_SystemError, 143 "can't resize shared unicode objects"); 144 return -1; 145 } 146 147 /* We allocate one more byte to make sure the string is 148 Ux0000 terminated -- XXX is this needed ? */ 149 oldstr = unicode->str; 150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 151 if (!unicode->str) { 152 unicode->str = oldstr; 153 PyErr_NoMemory(); 154 return -1; 155 } 156 unicode->str[length] = 0; 157 unicode->length = length; 158 159 reset: 160 /* Reset the object caches */ 161 if (unicode->defenc) { 162 Py_DECREF(unicode->defenc); 163 unicode->defenc = NULL; 164 } 165 unicode->hash = -1; 166 167 return 0; 168} 169 170/* We allocate one more byte to make sure the string is 171 Ux0000 terminated -- XXX is this needed ? 172 173 XXX This allocator could further be enhanced by assuring that the 174 free list never reduces its size below 1. 175 176*/ 177 178static 179PyUnicodeObject *_PyUnicode_New(int length) 180{ 181 register PyUnicodeObject *unicode; 182 183 /* Optimization fo empty strings */ 184 if (length == 0 && unicode_empty != NULL) { 185 Py_INCREF(unicode_empty); 186 return unicode_empty; 187 } 188 189 /* Unicode freelist & memory allocation */ 190 if (unicode_freelist) { 191 unicode = unicode_freelist; 192 unicode_freelist = *(PyUnicodeObject **)unicode; 193 unicode_freelist_size--; 194 if (unicode->str) { 195 /* Keep-Alive optimization: we only upsize the buffer, 196 never downsize it. */ 197 if ((unicode->length < length) && 198 unicode_resize(unicode, length) < 0) { 199 PyMem_DEL(unicode->str); 200 goto onError; 201 } 202 } 203 else { 204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 205 } 206 PyObject_INIT(unicode, &PyUnicode_Type); 207 } 208 else { 209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 210 if (unicode == NULL) 211 return NULL; 212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 213 } 214 215 if (!unicode->str) { 216 PyErr_NoMemory(); 217 goto onError; 218 } 219 /* Initialize the first element to guard against cases where 220 * the caller fails before initializing str -- unicode_resize() 221 * reads str[0], and the Keep-Alive optimization can keep memory 222 * allocated for str alive across a call to unicode_dealloc(unicode). 223 * We don't want unicode_resize to read uninitialized memory in 224 * that case. 225 */ 226 unicode->str[0] = 0; 227 unicode->str[length] = 0; 228 unicode->length = length; 229 unicode->hash = -1; 230 unicode->defenc = NULL; 231 return unicode; 232 233 onError: 234 _Py_ForgetReference((PyObject *)unicode); 235 PyObject_Del(unicode); 236 return NULL; 237} 238 239static 240void unicode_dealloc(register PyUnicodeObject *unicode) 241{ 242 if (PyUnicode_CheckExact(unicode) && 243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 244 /* Keep-Alive optimization */ 245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 246 PyMem_DEL(unicode->str); 247 unicode->str = NULL; 248 unicode->length = 0; 249 } 250 if (unicode->defenc) { 251 Py_DECREF(unicode->defenc); 252 unicode->defenc = NULL; 253 } 254 /* Add to free list */ 255 *(PyUnicodeObject **)unicode = unicode_freelist; 256 unicode_freelist = unicode; 257 unicode_freelist_size++; 258 } 259 else { 260 PyMem_DEL(unicode->str); 261 Py_XDECREF(unicode->defenc); 262 unicode->ob_type->tp_free((PyObject *)unicode); 263 } 264} 265 266int PyUnicode_Resize(PyObject **unicode, int length) 267{ 268 register PyUnicodeObject *v; 269 270 /* Argument checks */ 271 if (unicode == NULL) { 272 PyErr_BadInternalCall(); 273 return -1; 274 } 275 v = (PyUnicodeObject *)*unicode; 276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 277 PyErr_BadInternalCall(); 278 return -1; 279 } 280 281 /* Resizing unicode_empty and single character objects is not 282 possible since these are being shared. We simply return a fresh 283 copy with the same Unicode content. */ 284 if (v->length != length && 285 (v == unicode_empty || v->length == 1)) { 286 PyUnicodeObject *w = _PyUnicode_New(length); 287 if (w == NULL) 288 return -1; 289 Py_UNICODE_COPY(w->str, v->str, 290 length < v->length ? length : v->length); 291 Py_DECREF(*unicode); 292 *unicode = (PyObject *)w; 293 return 0; 294 } 295 296 /* Note that we don't have to modify *unicode for unshared Unicode 297 objects, since we can modify them in-place. */ 298 return unicode_resize(v, length); 299} 300 301/* Internal API for use in unicodeobject.c only ! */ 302#define _PyUnicode_Resize(unicodevar, length) \ 303 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 304 305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 306 int size) 307{ 308 PyUnicodeObject *unicode; 309 310 /* If the Unicode data is known at construction time, we can apply 311 some optimizations which share commonly used objects. */ 312 if (u != NULL) { 313 314 /* Optimization for empty strings */ 315 if (size == 0 && unicode_empty != NULL) { 316 Py_INCREF(unicode_empty); 317 return (PyObject *)unicode_empty; 318 } 319 320 /* Single character Unicode objects in the Latin-1 range are 321 shared when using this constructor */ 322 if (size == 1 && *u < 256) { 323 unicode = unicode_latin1[*u]; 324 if (!unicode) { 325 unicode = _PyUnicode_New(1); 326 if (!unicode) 327 return NULL; 328 unicode->str[0] = *u; 329 unicode_latin1[*u] = unicode; 330 } 331 Py_INCREF(unicode); 332 return (PyObject *)unicode; 333 } 334 } 335 336 unicode = _PyUnicode_New(size); 337 if (!unicode) 338 return NULL; 339 340 /* Copy the Unicode data into the new object */ 341 if (u != NULL) 342 Py_UNICODE_COPY(unicode->str, u, size); 343 344 return (PyObject *)unicode; 345} 346 347#ifdef HAVE_WCHAR_H 348 349PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 350 int size) 351{ 352 PyUnicodeObject *unicode; 353 354 if (w == NULL) { 355 PyErr_BadInternalCall(); 356 return NULL; 357 } 358 359 unicode = _PyUnicode_New(size); 360 if (!unicode) 361 return NULL; 362 363 /* Copy the wchar_t data into the new object */ 364#ifdef HAVE_USABLE_WCHAR_T 365 memcpy(unicode->str, w, size * sizeof(wchar_t)); 366#else 367 { 368 register Py_UNICODE *u; 369 register int i; 370 u = PyUnicode_AS_UNICODE(unicode); 371 for (i = size; i >= 0; i--) 372 *u++ = *w++; 373 } 374#endif 375 376 return (PyObject *)unicode; 377} 378 379int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 380 register wchar_t *w, 381 int size) 382{ 383 if (unicode == NULL) { 384 PyErr_BadInternalCall(); 385 return -1; 386 } 387 if (size > PyUnicode_GET_SIZE(unicode)) 388 size = PyUnicode_GET_SIZE(unicode); 389#ifdef HAVE_USABLE_WCHAR_T 390 memcpy(w, unicode->str, size * sizeof(wchar_t)); 391#else 392 { 393 register Py_UNICODE *u; 394 register int i; 395 u = PyUnicode_AS_UNICODE(unicode); 396 for (i = size; i >= 0; i--) 397 *w++ = *u++; 398 } 399#endif 400 401 return size; 402} 403 404#endif 405 406PyObject *PyUnicode_FromOrdinal(int ordinal) 407{ 408 Py_UNICODE s[1]; 409 410#ifdef Py_UNICODE_WIDE 411 if (ordinal < 0 || ordinal > 0x10ffff) { 412 PyErr_SetString(PyExc_ValueError, 413 "unichr() arg not in range(0x110000) " 414 "(wide Python build)"); 415 return NULL; 416 } 417#else 418 if (ordinal < 0 || ordinal > 0xffff) { 419 PyErr_SetString(PyExc_ValueError, 420 "unichr() arg not in range(0x10000) " 421 "(narrow Python build)"); 422 return NULL; 423 } 424#endif 425 426 s[0] = (Py_UNICODE)ordinal; 427 return PyUnicode_FromUnicode(s, 1); 428} 429 430PyObject *PyUnicode_FromObject(register PyObject *obj) 431{ 432 /* XXX Perhaps we should make this API an alias of 433 PyObject_Unicode() instead ?! */ 434 if (PyUnicode_CheckExact(obj)) { 435 Py_INCREF(obj); 436 return obj; 437 } 438 if (PyUnicode_Check(obj)) { 439 /* For a Unicode subtype that's not a Unicode object, 440 return a true Unicode object with the same data. */ 441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 442 PyUnicode_GET_SIZE(obj)); 443 } 444 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 445} 446 447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 448 const char *encoding, 449 const char *errors) 450{ 451 const char *s = NULL; 452 int len; 453 PyObject *v; 454 455 if (obj == NULL) { 456 PyErr_BadInternalCall(); 457 return NULL; 458 } 459 460#if 0 461 /* For b/w compatibility we also accept Unicode objects provided 462 that no encodings is given and then redirect to 463 PyObject_Unicode() which then applies the additional logic for 464 Unicode subclasses. 465 466 NOTE: This API should really only be used for object which 467 represent *encoded* Unicode ! 468 469 */ 470 if (PyUnicode_Check(obj)) { 471 if (encoding) { 472 PyErr_SetString(PyExc_TypeError, 473 "decoding Unicode is not supported"); 474 return NULL; 475 } 476 return PyObject_Unicode(obj); 477 } 478#else 479 if (PyUnicode_Check(obj)) { 480 PyErr_SetString(PyExc_TypeError, 481 "decoding Unicode is not supported"); 482 return NULL; 483 } 484#endif 485 486 /* Coerce object */ 487 if (PyString_Check(obj)) { 488 s = PyString_AS_STRING(obj); 489 len = PyString_GET_SIZE(obj); 490 } 491 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 492 /* Overwrite the error message with something more useful in 493 case of a TypeError. */ 494 if (PyErr_ExceptionMatches(PyExc_TypeError)) 495 PyErr_Format(PyExc_TypeError, 496 "coercing to Unicode: need string or buffer, " 497 "%.80s found", 498 obj->ob_type->tp_name); 499 goto onError; 500 } 501 502 /* Convert to Unicode */ 503 if (len == 0) { 504 Py_INCREF(unicode_empty); 505 v = (PyObject *)unicode_empty; 506 } 507 else 508 v = PyUnicode_Decode(s, len, encoding, errors); 509 510 return v; 511 512 onError: 513 return NULL; 514} 515 516PyObject *PyUnicode_Decode(const char *s, 517 int size, 518 const char *encoding, 519 const char *errors) 520{ 521 PyObject *buffer = NULL, *unicode; 522 523 if (encoding == NULL) 524 encoding = PyUnicode_GetDefaultEncoding(); 525 526 /* Shortcuts for common default encodings */ 527 if (strcmp(encoding, "utf-8") == 0) 528 return PyUnicode_DecodeUTF8(s, size, errors); 529 else if (strcmp(encoding, "latin-1") == 0) 530 return PyUnicode_DecodeLatin1(s, size, errors); 531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 532 else if (strcmp(encoding, "mbcs") == 0) 533 return PyUnicode_DecodeMBCS(s, size, errors); 534#endif 535 else if (strcmp(encoding, "ascii") == 0) 536 return PyUnicode_DecodeASCII(s, size, errors); 537 538 /* Decode via the codec registry */ 539 buffer = PyBuffer_FromMemory((void *)s, size); 540 if (buffer == NULL) 541 goto onError; 542 unicode = PyCodec_Decode(buffer, encoding, errors); 543 if (unicode == NULL) 544 goto onError; 545 if (!PyUnicode_Check(unicode)) { 546 PyErr_Format(PyExc_TypeError, 547 "decoder did not return an unicode object (type=%.400s)", 548 unicode->ob_type->tp_name); 549 Py_DECREF(unicode); 550 goto onError; 551 } 552 Py_DECREF(buffer); 553 return unicode; 554 555 onError: 556 Py_XDECREF(buffer); 557 return NULL; 558} 559 560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 561 const char *encoding, 562 const char *errors) 563{ 564 PyObject *v; 565 566 if (!PyUnicode_Check(unicode)) { 567 PyErr_BadArgument(); 568 goto onError; 569 } 570 571 if (encoding == NULL) 572 encoding = PyUnicode_GetDefaultEncoding(); 573 574 /* Decode via the codec registry */ 575 v = PyCodec_Decode(unicode, encoding, errors); 576 if (v == NULL) 577 goto onError; 578 return v; 579 580 onError: 581 return NULL; 582} 583 584PyObject *PyUnicode_Encode(const Py_UNICODE *s, 585 int size, 586 const char *encoding, 587 const char *errors) 588{ 589 PyObject *v, *unicode; 590 591 unicode = PyUnicode_FromUnicode(s, size); 592 if (unicode == NULL) 593 return NULL; 594 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 595 Py_DECREF(unicode); 596 return v; 597} 598 599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 600 const char *encoding, 601 const char *errors) 602{ 603 PyObject *v; 604 605 if (!PyUnicode_Check(unicode)) { 606 PyErr_BadArgument(); 607 goto onError; 608 } 609 610 if (encoding == NULL) 611 encoding = PyUnicode_GetDefaultEncoding(); 612 613 /* Encode via the codec registry */ 614 v = PyCodec_Encode(unicode, encoding, errors); 615 if (v == NULL) 616 goto onError; 617 return v; 618 619 onError: 620 return NULL; 621} 622 623PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 624 const char *encoding, 625 const char *errors) 626{ 627 PyObject *v; 628 629 if (!PyUnicode_Check(unicode)) { 630 PyErr_BadArgument(); 631 goto onError; 632 } 633 634 if (encoding == NULL) 635 encoding = PyUnicode_GetDefaultEncoding(); 636 637 /* Shortcuts for common default encodings */ 638 if (errors == NULL) { 639 if (strcmp(encoding, "utf-8") == 0) 640 return PyUnicode_AsUTF8String(unicode); 641 else if (strcmp(encoding, "latin-1") == 0) 642 return PyUnicode_AsLatin1String(unicode); 643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 644 else if (strcmp(encoding, "mbcs") == 0) 645 return PyUnicode_AsMBCSString(unicode); 646#endif 647 else if (strcmp(encoding, "ascii") == 0) 648 return PyUnicode_AsASCIIString(unicode); 649 } 650 651 /* Encode via the codec registry */ 652 v = PyCodec_Encode(unicode, encoding, errors); 653 if (v == NULL) 654 goto onError; 655 if (!PyString_Check(v)) { 656 PyErr_Format(PyExc_TypeError, 657 "encoder did not return a string object (type=%.400s)", 658 v->ob_type->tp_name); 659 Py_DECREF(v); 660 goto onError; 661 } 662 return v; 663 664 onError: 665 return NULL; 666} 667 668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 669 const char *errors) 670{ 671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 672 673 if (v) 674 return v; 675 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 676 if (v && errors == NULL) 677 ((PyUnicodeObject *)unicode)->defenc = v; 678 return v; 679} 680 681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 682{ 683 if (!PyUnicode_Check(unicode)) { 684 PyErr_BadArgument(); 685 goto onError; 686 } 687 return PyUnicode_AS_UNICODE(unicode); 688 689 onError: 690 return NULL; 691} 692 693int PyUnicode_GetSize(PyObject *unicode) 694{ 695 if (!PyUnicode_Check(unicode)) { 696 PyErr_BadArgument(); 697 goto onError; 698 } 699 return PyUnicode_GET_SIZE(unicode); 700 701 onError: 702 return -1; 703} 704 705const char *PyUnicode_GetDefaultEncoding(void) 706{ 707 return unicode_default_encoding; 708} 709 710int PyUnicode_SetDefaultEncoding(const char *encoding) 711{ 712 PyObject *v; 713 714 /* Make sure the encoding is valid. As side effect, this also 715 loads the encoding into the codec registry cache. */ 716 v = _PyCodec_Lookup(encoding); 717 if (v == NULL) 718 goto onError; 719 Py_DECREF(v); 720 strncpy(unicode_default_encoding, 721 encoding, 722 sizeof(unicode_default_encoding)); 723 return 0; 724 725 onError: 726 return -1; 727} 728 729/* error handling callback helper: 730 build arguments, call the callback and check the arguments, 731 if no exception occured, copy the replacement to the output 732 and adjust various state variables. 733 return 0 on success, -1 on error 734*/ 735 736static 737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 738 const char *encoding, const char *reason, 739 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, 740 PyObject **output, int *outpos, Py_UNICODE **outptr) 741{ 742 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; 743 744 PyObject *restuple = NULL; 745 PyObject *repunicode = NULL; 746 int outsize = PyUnicode_GET_SIZE(*output); 747 int requiredsize; 748 int newpos; 749 Py_UNICODE *repptr; 750 int repsize; 751 int res = -1; 752 753 if (*errorHandler == NULL) { 754 *errorHandler = PyCodec_LookupError(errors); 755 if (*errorHandler == NULL) 756 goto onError; 757 } 758 759 if (*exceptionObject == NULL) { 760 *exceptionObject = PyUnicodeDecodeError_Create( 761 encoding, input, insize, *startinpos, *endinpos, reason); 762 if (*exceptionObject == NULL) 763 goto onError; 764 } 765 else { 766 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 767 goto onError; 768 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 769 goto onError; 770 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 771 goto onError; 772 } 773 774 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 775 if (restuple == NULL) 776 goto onError; 777 if (!PyTuple_Check(restuple)) { 778 PyErr_Format(PyExc_TypeError, &argparse[4]); 779 goto onError; 780 } 781 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 782 goto onError; 783 if (newpos<0) 784 newpos = insize+newpos; 785 if (newpos<0 || newpos>insize) { 786 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos); 787 goto onError; 788 } 789 790 /* need more space? (at least enough for what we 791 have+the replacement+the rest of the string (starting 792 at the new input position), so we won't have to check space 793 when there are no errors in the rest of the string) */ 794 repptr = PyUnicode_AS_UNICODE(repunicode); 795 repsize = PyUnicode_GET_SIZE(repunicode); 796 requiredsize = *outpos + repsize + insize-newpos; 797 if (requiredsize > outsize) { 798 if (requiredsize<2*outsize) 799 requiredsize = 2*outsize; 800 if (PyUnicode_Resize(output, requiredsize) < 0) 801 goto onError; 802 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 803 } 804 *endinpos = newpos; 805 *inptr = input + newpos; 806 Py_UNICODE_COPY(*outptr, repptr, repsize); 807 *outptr += repsize; 808 *outpos += repsize; 809 /* we made it! */ 810 res = 0; 811 812 onError: 813 Py_XDECREF(restuple); 814 return res; 815} 816 817/* --- UTF-7 Codec -------------------------------------------------------- */ 818 819/* see RFC2152 for details */ 820 821static 822char utf7_special[128] = { 823 /* indicate whether a UTF-7 character is special i.e. cannot be directly 824 encoded: 825 0 - not special 826 1 - special 827 2 - whitespace (optional) 828 3 - RFC2152 Set O (optional) */ 829 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 830 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 831 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 832 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 833 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 834 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 835 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 837 838}; 839 840#define SPECIAL(c, encodeO, encodeWS) \ 841 (((c)>127 || utf7_special[(c)] == 1) || \ 842 (encodeWS && (utf7_special[(c)] == 2)) || \ 843 (encodeO && (utf7_special[(c)] == 3))) 844 845#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 846#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 847#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 848 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) 849 850#define ENCODE(out, ch, bits) \ 851 while (bits >= 6) { \ 852 *out++ = B64(ch >> (bits-6)); \ 853 bits -= 6; \ 854 } 855 856#define DECODE(out, ch, bits, surrogate) \ 857 while (bits >= 16) { \ 858 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 859 bits -= 16; \ 860 if (surrogate) { \ 861 /* We have already generated an error for the high surrogate 862 so let's not bother seeing if the low surrogate is correct or not */\ 863 surrogate = 0; \ 864 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 865 /* This is a surrogate pair. Unfortunately we can't represent \ 866 it in a 16-bit character */ \ 867 surrogate = 1; \ 868 errmsg = "code pairs are not supported"; \ 869 goto utf7Error; \ 870 } else { \ 871 *out++ = outCh; \ 872 } \ 873 } \ 874 875PyObject *PyUnicode_DecodeUTF7(const char *s, 876 int size, 877 const char *errors) 878{ 879 const char *starts = s; 880 int startinpos; 881 int endinpos; 882 int outpos; 883 const char *e; 884 PyUnicodeObject *unicode; 885 Py_UNICODE *p; 886 const char *errmsg = ""; 887 int inShift = 0; 888 unsigned int bitsleft = 0; 889 unsigned long charsleft = 0; 890 int surrogate = 0; 891 PyObject *errorHandler = NULL; 892 PyObject *exc = NULL; 893 894 unicode = _PyUnicode_New(size); 895 if (!unicode) 896 return NULL; 897 if (size == 0) 898 return (PyObject *)unicode; 899 900 p = unicode->str; 901 e = s + size; 902 903 while (s < e) { 904 Py_UNICODE ch; 905 restart: 906 ch = *s; 907 908 if (inShift) { 909 if ((ch == '-') || !B64CHAR(ch)) { 910 inShift = 0; 911 s++; 912 913 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 914 if (bitsleft >= 6) { 915 /* The shift sequence has a partial character in it. If 916 bitsleft < 6 then we could just classify it as padding 917 but that is not the case here */ 918 919 errmsg = "partial character in shift sequence"; 920 goto utf7Error; 921 } 922 /* According to RFC2152 the remaining bits should be zero. We 923 choose to signal an error/insert a replacement character 924 here so indicate the potential of a misencoded character. */ 925 926 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 927 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 928 errmsg = "non-zero padding bits in shift sequence"; 929 goto utf7Error; 930 } 931 932 if (ch == '-') { 933 if ((s < e) && (*(s) == '-')) { 934 *p++ = '-'; 935 inShift = 1; 936 } 937 } else if (SPECIAL(ch,0,0)) { 938 errmsg = "unexpected special character"; 939 goto utf7Error; 940 } else { 941 *p++ = ch; 942 } 943 } else { 944 charsleft = (charsleft << 6) | UB64(ch); 945 bitsleft += 6; 946 s++; 947 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 948 } 949 } 950 else if ( ch == '+' ) { 951 startinpos = s-starts; 952 s++; 953 if (s < e && *s == '-') { 954 s++; 955 *p++ = '+'; 956 } else 957 { 958 inShift = 1; 959 bitsleft = 0; 960 } 961 } 962 else if (SPECIAL(ch,0,0)) { 963 errmsg = "unexpected special character"; 964 s++; 965 goto utf7Error; 966 } 967 else { 968 *p++ = ch; 969 s++; 970 } 971 continue; 972 utf7Error: 973 outpos = p-PyUnicode_AS_UNICODE(unicode); 974 endinpos = s-starts; 975 if (unicode_decode_call_errorhandler( 976 errors, &errorHandler, 977 "utf7", errmsg, 978 starts, size, &startinpos, &endinpos, &exc, &s, 979 (PyObject **)&unicode, &outpos, &p)) 980 goto onError; 981 } 982 983 if (inShift) { 984 outpos = p-PyUnicode_AS_UNICODE(unicode); 985 endinpos = size; 986 if (unicode_decode_call_errorhandler( 987 errors, &errorHandler, 988 "utf7", "unterminated shift sequence", 989 starts, size, &startinpos, &endinpos, &exc, &s, 990 (PyObject **)&unicode, &outpos, &p)) 991 goto onError; 992 if (s < e) 993 goto restart; 994 } 995 996 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 997 goto onError; 998 999 Py_XDECREF(errorHandler); 1000 Py_XDECREF(exc); 1001 return (PyObject *)unicode; 1002 1003onError: 1004 Py_XDECREF(errorHandler); 1005 Py_XDECREF(exc); 1006 Py_DECREF(unicode); 1007 return NULL; 1008} 1009 1010 1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1012 int size, 1013 int encodeSetO, 1014 int encodeWhiteSpace, 1015 const char *errors) 1016{ 1017 PyObject *v; 1018 /* It might be possible to tighten this worst case */ 1019 unsigned int cbAllocated = 5 * size; 1020 int inShift = 0; 1021 int i = 0; 1022 unsigned int bitsleft = 0; 1023 unsigned long charsleft = 0; 1024 char * out; 1025 char * start; 1026 1027 if (size == 0) 1028 return PyString_FromStringAndSize(NULL, 0); 1029 1030 v = PyString_FromStringAndSize(NULL, cbAllocated); 1031 if (v == NULL) 1032 return NULL; 1033 1034 start = out = PyString_AS_STRING(v); 1035 for (;i < size; ++i) { 1036 Py_UNICODE ch = s[i]; 1037 1038 if (!inShift) { 1039 if (ch == '+') { 1040 *out++ = '+'; 1041 *out++ = '-'; 1042 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1043 charsleft = ch; 1044 bitsleft = 16; 1045 *out++ = '+'; 1046 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1047 inShift = bitsleft > 0; 1048 } else { 1049 *out++ = (char) ch; 1050 } 1051 } else { 1052 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1053 *out++ = B64(charsleft << (6-bitsleft)); 1054 charsleft = 0; 1055 bitsleft = 0; 1056 /* Characters not in the BASE64 set implicitly unshift the sequence 1057 so no '-' is required, except if the character is itself a '-' */ 1058 if (B64CHAR(ch) || ch == '-') { 1059 *out++ = '-'; 1060 } 1061 inShift = 0; 1062 *out++ = (char) ch; 1063 } else { 1064 bitsleft += 16; 1065 charsleft = (charsleft << 16) | ch; 1066 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1067 1068 /* If the next character is special then we dont' need to terminate 1069 the shift sequence. If the next character is not a BASE64 character 1070 or '-' then the shift sequence will be terminated implicitly and we 1071 don't have to insert a '-'. */ 1072 1073 if (bitsleft == 0) { 1074 if (i + 1 < size) { 1075 Py_UNICODE ch2 = s[i+1]; 1076 1077 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1078 1079 } else if (B64CHAR(ch2) || ch2 == '-') { 1080 *out++ = '-'; 1081 inShift = 0; 1082 } else { 1083 inShift = 0; 1084 } 1085 1086 } 1087 else { 1088 *out++ = '-'; 1089 inShift = 0; 1090 } 1091 } 1092 } 1093 } 1094 } 1095 if (bitsleft) { 1096 *out++= B64(charsleft << (6-bitsleft) ); 1097 *out++ = '-'; 1098 } 1099 1100 _PyString_Resize(&v, out - start); 1101 return v; 1102} 1103 1104#undef SPECIAL 1105#undef B64 1106#undef B64CHAR 1107#undef UB64 1108#undef ENCODE 1109#undef DECODE 1110 1111/* --- UTF-8 Codec -------------------------------------------------------- */ 1112 1113static 1114char utf8_code_length[256] = { 1115 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1116 illegal prefix. see RFC 2279 for details */ 1117 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1118 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1120 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1121 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1124 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1126 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1129 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1130 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1131 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1132 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1133}; 1134 1135PyObject *PyUnicode_DecodeUTF8(const char *s, 1136 int size, 1137 const char *errors) 1138{ 1139 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1140} 1141 1142PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1143 int size, 1144 const char *errors, 1145 int *consumed) 1146{ 1147 const char *starts = s; 1148 int n; 1149 int startinpos; 1150 int endinpos; 1151 int outpos; 1152 const char *e; 1153 PyUnicodeObject *unicode; 1154 Py_UNICODE *p; 1155 const char *errmsg = ""; 1156 PyObject *errorHandler = NULL; 1157 PyObject *exc = NULL; 1158 1159 /* Note: size will always be longer than the resulting Unicode 1160 character count */ 1161 unicode = _PyUnicode_New(size); 1162 if (!unicode) 1163 return NULL; 1164 if (size == 0) { 1165 if (consumed) 1166 *consumed = 0; 1167 return (PyObject *)unicode; 1168 } 1169 1170 /* Unpack UTF-8 encoded data */ 1171 p = unicode->str; 1172 e = s + size; 1173 1174 while (s < e) { 1175 Py_UCS4 ch = (unsigned char)*s; 1176 1177 if (ch < 0x80) { 1178 *p++ = (Py_UNICODE)ch; 1179 s++; 1180 continue; 1181 } 1182 1183 n = utf8_code_length[ch]; 1184 1185 if (s + n > e) { 1186 if (consumed) 1187 break; 1188 else { 1189 errmsg = "unexpected end of data"; 1190 startinpos = s-starts; 1191 endinpos = size; 1192 goto utf8Error; 1193 } 1194 } 1195 1196 switch (n) { 1197 1198 case 0: 1199 errmsg = "unexpected code byte"; 1200 startinpos = s-starts; 1201 endinpos = startinpos+1; 1202 goto utf8Error; 1203 1204 case 1: 1205 errmsg = "internal error"; 1206 startinpos = s-starts; 1207 endinpos = startinpos+1; 1208 goto utf8Error; 1209 1210 case 2: 1211 if ((s[1] & 0xc0) != 0x80) { 1212 errmsg = "invalid data"; 1213 startinpos = s-starts; 1214 endinpos = startinpos+2; 1215 goto utf8Error; 1216 } 1217 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1218 if (ch < 0x80) { 1219 startinpos = s-starts; 1220 endinpos = startinpos+2; 1221 errmsg = "illegal encoding"; 1222 goto utf8Error; 1223 } 1224 else 1225 *p++ = (Py_UNICODE)ch; 1226 break; 1227 1228 case 3: 1229 if ((s[1] & 0xc0) != 0x80 || 1230 (s[2] & 0xc0) != 0x80) { 1231 errmsg = "invalid data"; 1232 startinpos = s-starts; 1233 endinpos = startinpos+3; 1234 goto utf8Error; 1235 } 1236 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1237 if (ch < 0x0800) { 1238 /* Note: UTF-8 encodings of surrogates are considered 1239 legal UTF-8 sequences; 1240 1241 XXX For wide builds (UCS-4) we should probably try 1242 to recombine the surrogates into a single code 1243 unit. 1244 */ 1245 errmsg = "illegal encoding"; 1246 startinpos = s-starts; 1247 endinpos = startinpos+3; 1248 goto utf8Error; 1249 } 1250 else 1251 *p++ = (Py_UNICODE)ch; 1252 break; 1253 1254 case 4: 1255 if ((s[1] & 0xc0) != 0x80 || 1256 (s[2] & 0xc0) != 0x80 || 1257 (s[3] & 0xc0) != 0x80) { 1258 errmsg = "invalid data"; 1259 startinpos = s-starts; 1260 endinpos = startinpos+4; 1261 goto utf8Error; 1262 } 1263 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1264 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1265 /* validate and convert to UTF-16 */ 1266 if ((ch < 0x10000) /* minimum value allowed for 4 1267 byte encoding */ 1268 || (ch > 0x10ffff)) /* maximum value allowed for 1269 UTF-16 */ 1270 { 1271 errmsg = "illegal encoding"; 1272 startinpos = s-starts; 1273 endinpos = startinpos+4; 1274 goto utf8Error; 1275 } 1276#ifdef Py_UNICODE_WIDE 1277 *p++ = (Py_UNICODE)ch; 1278#else 1279 /* compute and append the two surrogates: */ 1280 1281 /* translate from 10000..10FFFF to 0..FFFF */ 1282 ch -= 0x10000; 1283 1284 /* high surrogate = top 10 bits added to D800 */ 1285 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1286 1287 /* low surrogate = bottom 10 bits added to DC00 */ 1288 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1289#endif 1290 break; 1291 1292 default: 1293 /* Other sizes are only needed for UCS-4 */ 1294 errmsg = "unsupported Unicode code range"; 1295 startinpos = s-starts; 1296 endinpos = startinpos+n; 1297 goto utf8Error; 1298 } 1299 s += n; 1300 continue; 1301 1302 utf8Error: 1303 outpos = p-PyUnicode_AS_UNICODE(unicode); 1304 if (unicode_decode_call_errorhandler( 1305 errors, &errorHandler, 1306 "utf8", errmsg, 1307 starts, size, &startinpos, &endinpos, &exc, &s, 1308 (PyObject **)&unicode, &outpos, &p)) 1309 goto onError; 1310 } 1311 if (consumed) 1312 *consumed = s-starts; 1313 1314 /* Adjust length */ 1315 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1316 goto onError; 1317 1318 Py_XDECREF(errorHandler); 1319 Py_XDECREF(exc); 1320 return (PyObject *)unicode; 1321 1322onError: 1323 Py_XDECREF(errorHandler); 1324 Py_XDECREF(exc); 1325 Py_DECREF(unicode); 1326 return NULL; 1327} 1328 1329/* Allocation strategy: if the string is short, convert into a stack buffer 1330 and allocate exactly as much space needed at the end. Else allocate the 1331 maximum possible needed (4 result bytes per Unicode character), and return 1332 the excess memory at the end. 1333*/ 1334PyObject * 1335PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1336 int size, 1337 const char *errors) 1338{ 1339#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1340 1341 int i; /* index into s of next input byte */ 1342 PyObject *v; /* result string object */ 1343 char *p; /* next free byte in output buffer */ 1344 int nallocated; /* number of result bytes allocated */ 1345 int nneeded; /* number of result bytes needed */ 1346 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1347 1348 assert(s != NULL); 1349 assert(size >= 0); 1350 1351 if (size <= MAX_SHORT_UNICHARS) { 1352 /* Write into the stack buffer; nallocated can't overflow. 1353 * At the end, we'll allocate exactly as much heap space as it 1354 * turns out we need. 1355 */ 1356 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1357 v = NULL; /* will allocate after we're done */ 1358 p = stackbuf; 1359 } 1360 else { 1361 /* Overallocate on the heap, and give the excess back at the end. */ 1362 nallocated = size * 4; 1363 if (nallocated / 4 != size) /* overflow! */ 1364 return PyErr_NoMemory(); 1365 v = PyString_FromStringAndSize(NULL, nallocated); 1366 if (v == NULL) 1367 return NULL; 1368 p = PyString_AS_STRING(v); 1369 } 1370 1371 for (i = 0; i < size;) { 1372 Py_UCS4 ch = s[i++]; 1373 1374 if (ch < 0x80) 1375 /* Encode ASCII */ 1376 *p++ = (char) ch; 1377 1378 else if (ch < 0x0800) { 1379 /* Encode Latin-1 */ 1380 *p++ = (char)(0xc0 | (ch >> 6)); 1381 *p++ = (char)(0x80 | (ch & 0x3f)); 1382 } 1383 else { 1384 /* Encode UCS2 Unicode ordinals */ 1385 if (ch < 0x10000) { 1386 /* Special case: check for high surrogate */ 1387 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1388 Py_UCS4 ch2 = s[i]; 1389 /* Check for low surrogate and combine the two to 1390 form a UCS4 value */ 1391 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1392 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1393 i++; 1394 goto encodeUCS4; 1395 } 1396 /* Fall through: handles isolated high surrogates */ 1397 } 1398 *p++ = (char)(0xe0 | (ch >> 12)); 1399 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1400 *p++ = (char)(0x80 | (ch & 0x3f)); 1401 continue; 1402 } 1403encodeUCS4: 1404 /* Encode UCS4 Unicode ordinals */ 1405 *p++ = (char)(0xf0 | (ch >> 18)); 1406 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1407 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1408 *p++ = (char)(0x80 | (ch & 0x3f)); 1409 } 1410 } 1411 1412 if (v == NULL) { 1413 /* This was stack allocated. */ 1414 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int); 1415 assert(nneeded <= nallocated); 1416 v = PyString_FromStringAndSize(stackbuf, nneeded); 1417 } 1418 else { 1419 /* Cut back to size actually needed. */ 1420 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); 1421 assert(nneeded <= nallocated); 1422 _PyString_Resize(&v, nneeded); 1423 } 1424 return v; 1425 1426#undef MAX_SHORT_UNICHARS 1427} 1428 1429PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1430{ 1431 if (!PyUnicode_Check(unicode)) { 1432 PyErr_BadArgument(); 1433 return NULL; 1434 } 1435 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1436 PyUnicode_GET_SIZE(unicode), 1437 NULL); 1438} 1439 1440/* --- UTF-16 Codec ------------------------------------------------------- */ 1441 1442PyObject * 1443PyUnicode_DecodeUTF16(const char *s, 1444 int size, 1445 const char *errors, 1446 int *byteorder) 1447{ 1448 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 1449} 1450 1451PyObject * 1452PyUnicode_DecodeUTF16Stateful(const char *s, 1453 int size, 1454 const char *errors, 1455 int *byteorder, 1456 int *consumed) 1457{ 1458 const char *starts = s; 1459 int startinpos; 1460 int endinpos; 1461 int outpos; 1462 PyUnicodeObject *unicode; 1463 Py_UNICODE *p; 1464 const unsigned char *q, *e; 1465 int bo = 0; /* assume native ordering by default */ 1466 const char *errmsg = ""; 1467 /* Offsets from q for retrieving byte pairs in the right order. */ 1468#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1469 int ihi = 1, ilo = 0; 1470#else 1471 int ihi = 0, ilo = 1; 1472#endif 1473 PyObject *errorHandler = NULL; 1474 PyObject *exc = NULL; 1475 1476 /* Note: size will always be longer than the resulting Unicode 1477 character count */ 1478 unicode = _PyUnicode_New(size); 1479 if (!unicode) 1480 return NULL; 1481 if (size == 0) 1482 return (PyObject *)unicode; 1483 1484 /* Unpack UTF-16 encoded data */ 1485 p = unicode->str; 1486 q = (unsigned char *)s; 1487 e = q + size; 1488 1489 if (byteorder) 1490 bo = *byteorder; 1491 1492 /* Check for BOM marks (U+FEFF) in the input and adjust current 1493 byte order setting accordingly. In native mode, the leading BOM 1494 mark is skipped, in all other modes, it is copied to the output 1495 stream as-is (giving a ZWNBSP character). */ 1496 if (bo == 0) { 1497 if (size >= 2) { 1498 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1499#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1500 if (bom == 0xFEFF) { 1501 q += 2; 1502 bo = -1; 1503 } 1504 else if (bom == 0xFFFE) { 1505 q += 2; 1506 bo = 1; 1507 } 1508#else 1509 if (bom == 0xFEFF) { 1510 q += 2; 1511 bo = 1; 1512 } 1513 else if (bom == 0xFFFE) { 1514 q += 2; 1515 bo = -1; 1516 } 1517#endif 1518 } 1519 } 1520 1521 if (bo == -1) { 1522 /* force LE */ 1523 ihi = 1; 1524 ilo = 0; 1525 } 1526 else if (bo == 1) { 1527 /* force BE */ 1528 ihi = 0; 1529 ilo = 1; 1530 } 1531 1532 while (q < e) { 1533 Py_UNICODE ch; 1534 /* remaining bytes at the end? (size should be even) */ 1535 if (e-q<2) { 1536 if (consumed) 1537 break; 1538 errmsg = "truncated data"; 1539 startinpos = ((const char *)q)-starts; 1540 endinpos = ((const char *)e)-starts; 1541 goto utf16Error; 1542 /* The remaining input chars are ignored if the callback 1543 chooses to skip the input */ 1544 } 1545 ch = (q[ihi] << 8) | q[ilo]; 1546 1547 q += 2; 1548 1549 if (ch < 0xD800 || ch > 0xDFFF) { 1550 *p++ = ch; 1551 continue; 1552 } 1553 1554 /* UTF-16 code pair: */ 1555 if (q >= e) { 1556 errmsg = "unexpected end of data"; 1557 startinpos = (((const char *)q)-2)-starts; 1558 endinpos = ((const char *)e)-starts; 1559 goto utf16Error; 1560 } 1561 if (0xD800 <= ch && ch <= 0xDBFF) { 1562 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1563 q += 2; 1564 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1565#ifndef Py_UNICODE_WIDE 1566 *p++ = ch; 1567 *p++ = ch2; 1568#else 1569 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1570#endif 1571 continue; 1572 } 1573 else { 1574 errmsg = "illegal UTF-16 surrogate"; 1575 startinpos = (((const char *)q)-4)-starts; 1576 endinpos = startinpos+2; 1577 goto utf16Error; 1578 } 1579 1580 } 1581 errmsg = "illegal encoding"; 1582 startinpos = (((const char *)q)-2)-starts; 1583 endinpos = startinpos+2; 1584 /* Fall through to report the error */ 1585 1586 utf16Error: 1587 outpos = p-PyUnicode_AS_UNICODE(unicode); 1588 if (unicode_decode_call_errorhandler( 1589 errors, &errorHandler, 1590 "utf16", errmsg, 1591 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 1592 (PyObject **)&unicode, &outpos, &p)) 1593 goto onError; 1594 } 1595 1596 if (byteorder) 1597 *byteorder = bo; 1598 1599 if (consumed) 1600 *consumed = (const char *)q-starts; 1601 1602 /* Adjust length */ 1603 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1604 goto onError; 1605 1606 Py_XDECREF(errorHandler); 1607 Py_XDECREF(exc); 1608 return (PyObject *)unicode; 1609 1610onError: 1611 Py_DECREF(unicode); 1612 Py_XDECREF(errorHandler); 1613 Py_XDECREF(exc); 1614 return NULL; 1615} 1616 1617PyObject * 1618PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1619 int size, 1620 const char *errors, 1621 int byteorder) 1622{ 1623 PyObject *v; 1624 unsigned char *p; 1625#ifdef Py_UNICODE_WIDE 1626 int i, pairs; 1627#else 1628 const int pairs = 0; 1629#endif 1630 /* Offsets from p for storing byte pairs in the right order. */ 1631#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1632 int ihi = 1, ilo = 0; 1633#else 1634 int ihi = 0, ilo = 1; 1635#endif 1636 1637#define STORECHAR(CH) \ 1638 do { \ 1639 p[ihi] = ((CH) >> 8) & 0xff; \ 1640 p[ilo] = (CH) & 0xff; \ 1641 p += 2; \ 1642 } while(0) 1643 1644#ifdef Py_UNICODE_WIDE 1645 for (i = pairs = 0; i < size; i++) 1646 if (s[i] >= 0x10000) 1647 pairs++; 1648#endif 1649 v = PyString_FromStringAndSize(NULL, 1650 2 * (size + pairs + (byteorder == 0))); 1651 if (v == NULL) 1652 return NULL; 1653 1654 p = (unsigned char *)PyString_AS_STRING(v); 1655 if (byteorder == 0) 1656 STORECHAR(0xFEFF); 1657 if (size == 0) 1658 return v; 1659 1660 if (byteorder == -1) { 1661 /* force LE */ 1662 ihi = 1; 1663 ilo = 0; 1664 } 1665 else if (byteorder == 1) { 1666 /* force BE */ 1667 ihi = 0; 1668 ilo = 1; 1669 } 1670 1671 while (size-- > 0) { 1672 Py_UNICODE ch = *s++; 1673 Py_UNICODE ch2 = 0; 1674#ifdef Py_UNICODE_WIDE 1675 if (ch >= 0x10000) { 1676 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1677 ch = 0xD800 | ((ch-0x10000) >> 10); 1678 } 1679#endif 1680 STORECHAR(ch); 1681 if (ch2) 1682 STORECHAR(ch2); 1683 } 1684 return v; 1685#undef STORECHAR 1686} 1687 1688PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1689{ 1690 if (!PyUnicode_Check(unicode)) { 1691 PyErr_BadArgument(); 1692 return NULL; 1693 } 1694 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1695 PyUnicode_GET_SIZE(unicode), 1696 NULL, 1697 0); 1698} 1699 1700/* --- Unicode Escape Codec ----------------------------------------------- */ 1701 1702static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1703 1704PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1705 int size, 1706 const char *errors) 1707{ 1708 const char *starts = s; 1709 int startinpos; 1710 int endinpos; 1711 int outpos; 1712 int i; 1713 PyUnicodeObject *v; 1714 Py_UNICODE *p; 1715 const char *end; 1716 char* message; 1717 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1718 PyObject *errorHandler = NULL; 1719 PyObject *exc = NULL; 1720 1721 /* Escaped strings will always be longer than the resulting 1722 Unicode string, so we start with size here and then reduce the 1723 length after conversion to the true value. 1724 (but if the error callback returns a long replacement string 1725 we'll have to allocate more space) */ 1726 v = _PyUnicode_New(size); 1727 if (v == NULL) 1728 goto onError; 1729 if (size == 0) 1730 return (PyObject *)v; 1731 1732 p = PyUnicode_AS_UNICODE(v); 1733 end = s + size; 1734 1735 while (s < end) { 1736 unsigned char c; 1737 Py_UNICODE x; 1738 int digits; 1739 1740 /* Non-escape characters are interpreted as Unicode ordinals */ 1741 if (*s != '\\') { 1742 *p++ = (unsigned char) *s++; 1743 continue; 1744 } 1745 1746 startinpos = s-starts; 1747 /* \ - Escapes */ 1748 s++; 1749 switch (*s++) { 1750 1751 /* \x escapes */ 1752 case '\n': break; 1753 case '\\': *p++ = '\\'; break; 1754 case '\'': *p++ = '\''; break; 1755 case '\"': *p++ = '\"'; break; 1756 case 'b': *p++ = '\b'; break; 1757 case 'f': *p++ = '\014'; break; /* FF */ 1758 case 't': *p++ = '\t'; break; 1759 case 'n': *p++ = '\n'; break; 1760 case 'r': *p++ = '\r'; break; 1761 case 'v': *p++ = '\013'; break; /* VT */ 1762 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1763 1764 /* \OOO (octal) escapes */ 1765 case '0': case '1': case '2': case '3': 1766 case '4': case '5': case '6': case '7': 1767 x = s[-1] - '0'; 1768 if ('0' <= *s && *s <= '7') { 1769 x = (x<<3) + *s++ - '0'; 1770 if ('0' <= *s && *s <= '7') 1771 x = (x<<3) + *s++ - '0'; 1772 } 1773 *p++ = x; 1774 break; 1775 1776 /* hex escapes */ 1777 /* \xXX */ 1778 case 'x': 1779 digits = 2; 1780 message = "truncated \\xXX escape"; 1781 goto hexescape; 1782 1783 /* \uXXXX */ 1784 case 'u': 1785 digits = 4; 1786 message = "truncated \\uXXXX escape"; 1787 goto hexescape; 1788 1789 /* \UXXXXXXXX */ 1790 case 'U': 1791 digits = 8; 1792 message = "truncated \\UXXXXXXXX escape"; 1793 hexescape: 1794 chr = 0; 1795 outpos = p-PyUnicode_AS_UNICODE(v); 1796 if (s+digits>end) { 1797 endinpos = size; 1798 if (unicode_decode_call_errorhandler( 1799 errors, &errorHandler, 1800 "unicodeescape", "end of string in escape sequence", 1801 starts, size, &startinpos, &endinpos, &exc, &s, 1802 (PyObject **)&v, &outpos, &p)) 1803 goto onError; 1804 goto nextByte; 1805 } 1806 for (i = 0; i < digits; ++i) { 1807 c = (unsigned char) s[i]; 1808 if (!isxdigit(c)) { 1809 endinpos = (s+i+1)-starts; 1810 if (unicode_decode_call_errorhandler( 1811 errors, &errorHandler, 1812 "unicodeescape", message, 1813 starts, size, &startinpos, &endinpos, &exc, &s, 1814 (PyObject **)&v, &outpos, &p)) 1815 goto onError; 1816 goto nextByte; 1817 } 1818 chr = (chr<<4) & ~0xF; 1819 if (c >= '0' && c <= '9') 1820 chr += c - '0'; 1821 else if (c >= 'a' && c <= 'f') 1822 chr += 10 + c - 'a'; 1823 else 1824 chr += 10 + c - 'A'; 1825 } 1826 s += i; 1827 if (chr == 0xffffffff && PyErr_Occurred()) 1828 /* _decoding_error will have already written into the 1829 target buffer. */ 1830 break; 1831 store: 1832 /* when we get here, chr is a 32-bit unicode character */ 1833 if (chr <= 0xffff) 1834 /* UCS-2 character */ 1835 *p++ = (Py_UNICODE) chr; 1836 else if (chr <= 0x10ffff) { 1837 /* UCS-4 character. Either store directly, or as 1838 surrogate pair. */ 1839#ifdef Py_UNICODE_WIDE 1840 *p++ = chr; 1841#else 1842 chr -= 0x10000L; 1843 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1844 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1845#endif 1846 } else { 1847 endinpos = s-starts; 1848 outpos = p-PyUnicode_AS_UNICODE(v); 1849 if (unicode_decode_call_errorhandler( 1850 errors, &errorHandler, 1851 "unicodeescape", "illegal Unicode character", 1852 starts, size, &startinpos, &endinpos, &exc, &s, 1853 (PyObject **)&v, &outpos, &p)) 1854 goto onError; 1855 } 1856 break; 1857 1858 /* \N{name} */ 1859 case 'N': 1860 message = "malformed \\N character escape"; 1861 if (ucnhash_CAPI == NULL) { 1862 /* load the unicode data module */ 1863 PyObject *m, *v; 1864 m = PyImport_ImportModule("unicodedata"); 1865 if (m == NULL) 1866 goto ucnhashError; 1867 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1868 Py_DECREF(m); 1869 if (v == NULL) 1870 goto ucnhashError; 1871 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1872 Py_DECREF(v); 1873 if (ucnhash_CAPI == NULL) 1874 goto ucnhashError; 1875 } 1876 if (*s == '{') { 1877 const char *start = s+1; 1878 /* look for the closing brace */ 1879 while (*s != '}' && s < end) 1880 s++; 1881 if (s > start && s < end && *s == '}') { 1882 /* found a name. look it up in the unicode database */ 1883 message = "unknown Unicode character name"; 1884 s++; 1885 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1886 goto store; 1887 } 1888 } 1889 endinpos = s-starts; 1890 outpos = p-PyUnicode_AS_UNICODE(v); 1891 if (unicode_decode_call_errorhandler( 1892 errors, &errorHandler, 1893 "unicodeescape", message, 1894 starts, size, &startinpos, &endinpos, &exc, &s, 1895 (PyObject **)&v, &outpos, &p)) 1896 goto onError; 1897 break; 1898 1899 default: 1900 if (s > end) { 1901 message = "\\ at end of string"; 1902 s--; 1903 endinpos = s-starts; 1904 outpos = p-PyUnicode_AS_UNICODE(v); 1905 if (unicode_decode_call_errorhandler( 1906 errors, &errorHandler, 1907 "unicodeescape", message, 1908 starts, size, &startinpos, &endinpos, &exc, &s, 1909 (PyObject **)&v, &outpos, &p)) 1910 goto onError; 1911 } 1912 else { 1913 *p++ = '\\'; 1914 *p++ = (unsigned char)s[-1]; 1915 } 1916 break; 1917 } 1918 nextByte: 1919 ; 1920 } 1921 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) 1922 goto onError; 1923 Py_XDECREF(errorHandler); 1924 Py_XDECREF(exc); 1925 return (PyObject *)v; 1926 1927ucnhashError: 1928 PyErr_SetString( 1929 PyExc_UnicodeError, 1930 "\\N escapes not supported (can't load unicodedata module)" 1931 ); 1932 Py_XDECREF(errorHandler); 1933 Py_XDECREF(exc); 1934 return NULL; 1935 1936onError: 1937 Py_XDECREF(v); 1938 Py_XDECREF(errorHandler); 1939 Py_XDECREF(exc); 1940 return NULL; 1941} 1942 1943/* Return a Unicode-Escape string version of the Unicode object. 1944 1945 If quotes is true, the string is enclosed in u"" or u'' quotes as 1946 appropriate. 1947 1948*/ 1949 1950static const Py_UNICODE *findchar(const Py_UNICODE *s, 1951 int size, 1952 Py_UNICODE ch); 1953 1954static 1955PyObject *unicodeescape_string(const Py_UNICODE *s, 1956 int size, 1957 int quotes) 1958{ 1959 PyObject *repr; 1960 char *p; 1961 1962 static const char *hexdigit = "0123456789abcdef"; 1963 1964 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1965 if (repr == NULL) 1966 return NULL; 1967 1968 p = PyString_AS_STRING(repr); 1969 1970 if (quotes) { 1971 *p++ = 'u'; 1972 *p++ = (findchar(s, size, '\'') && 1973 !findchar(s, size, '"')) ? '"' : '\''; 1974 } 1975 while (size-- > 0) { 1976 Py_UNICODE ch = *s++; 1977 1978 /* Escape quotes */ 1979 if (quotes && 1980 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1981 *p++ = '\\'; 1982 *p++ = (char) ch; 1983 continue; 1984 } 1985 1986#ifdef Py_UNICODE_WIDE 1987 /* Map 21-bit characters to '\U00xxxxxx' */ 1988 else if (ch >= 0x10000) { 1989 int offset = p - PyString_AS_STRING(repr); 1990 1991 /* Resize the string if necessary */ 1992 if (offset + 12 > PyString_GET_SIZE(repr)) { 1993 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1994 return NULL; 1995 p = PyString_AS_STRING(repr) + offset; 1996 } 1997 1998 *p++ = '\\'; 1999 *p++ = 'U'; 2000 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 2001 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 2002 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 2003 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 2004 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 2005 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 2006 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 2007 *p++ = hexdigit[ch & 0x0000000F]; 2008 continue; 2009 } 2010#endif 2011 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 2012 else if (ch >= 0xD800 && ch < 0xDC00) { 2013 Py_UNICODE ch2; 2014 Py_UCS4 ucs; 2015 2016 ch2 = *s++; 2017 size--; 2018 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 2019 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 2020 *p++ = '\\'; 2021 *p++ = 'U'; 2022 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 2023 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 2024 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 2025 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 2026 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 2027 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 2028 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 2029 *p++ = hexdigit[ucs & 0x0000000F]; 2030 continue; 2031 } 2032 /* Fall through: isolated surrogates are copied as-is */ 2033 s--; 2034 size++; 2035 } 2036 2037 /* Map 16-bit characters to '\uxxxx' */ 2038 if (ch >= 256) { 2039 *p++ = '\\'; 2040 *p++ = 'u'; 2041 *p++ = hexdigit[(ch >> 12) & 0x000F]; 2042 *p++ = hexdigit[(ch >> 8) & 0x000F]; 2043 *p++ = hexdigit[(ch >> 4) & 0x000F]; 2044 *p++ = hexdigit[ch & 0x000F]; 2045 } 2046 2047 /* Map special whitespace to '\t', \n', '\r' */ 2048 else if (ch == '\t') { 2049 *p++ = '\\'; 2050 *p++ = 't'; 2051 } 2052 else if (ch == '\n') { 2053 *p++ = '\\'; 2054 *p++ = 'n'; 2055 } 2056 else if (ch == '\r') { 2057 *p++ = '\\'; 2058 *p++ = 'r'; 2059 } 2060 2061 /* Map non-printable US ASCII to '\xhh' */ 2062 else if (ch < ' ' || ch >= 0x7F) { 2063 *p++ = '\\'; 2064 *p++ = 'x'; 2065 *p++ = hexdigit[(ch >> 4) & 0x000F]; 2066 *p++ = hexdigit[ch & 0x000F]; 2067 } 2068 2069 /* Copy everything else as-is */ 2070 else 2071 *p++ = (char) ch; 2072 } 2073 if (quotes) 2074 *p++ = PyString_AS_STRING(repr)[1]; 2075 2076 *p = '\0'; 2077 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 2078 return repr; 2079} 2080 2081PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2082 int size) 2083{ 2084 return unicodeescape_string(s, size, 0); 2085} 2086 2087PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 2088{ 2089 if (!PyUnicode_Check(unicode)) { 2090 PyErr_BadArgument(); 2091 return NULL; 2092 } 2093 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2094 PyUnicode_GET_SIZE(unicode)); 2095} 2096 2097/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2098 2099PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2100 int size, 2101 const char *errors) 2102{ 2103 const char *starts = s; 2104 int startinpos; 2105 int endinpos; 2106 int outpos; 2107 PyUnicodeObject *v; 2108 Py_UNICODE *p; 2109 const char *end; 2110 const char *bs; 2111 PyObject *errorHandler = NULL; 2112 PyObject *exc = NULL; 2113 2114 /* Escaped strings will always be longer than the resulting 2115 Unicode string, so we start with size here and then reduce the 2116 length after conversion to the true value. (But decoding error 2117 handler might have to resize the string) */ 2118 v = _PyUnicode_New(size); 2119 if (v == NULL) 2120 goto onError; 2121 if (size == 0) 2122 return (PyObject *)v; 2123 p = PyUnicode_AS_UNICODE(v); 2124 end = s + size; 2125 while (s < end) { 2126 unsigned char c; 2127 Py_UCS4 x; 2128 int i; 2129 int count; 2130 2131 /* Non-escape characters are interpreted as Unicode ordinals */ 2132 if (*s != '\\') { 2133 *p++ = (unsigned char)*s++; 2134 continue; 2135 } 2136 startinpos = s-starts; 2137 2138 /* \u-escapes are only interpreted iff the number of leading 2139 backslashes if odd */ 2140 bs = s; 2141 for (;s < end;) { 2142 if (*s != '\\') 2143 break; 2144 *p++ = (unsigned char)*s++; 2145 } 2146 if (((s - bs) & 1) == 0 || 2147 s >= end || 2148 (*s != 'u' && *s != 'U')) { 2149 continue; 2150 } 2151 p--; 2152 count = *s=='u' ? 4 : 8; 2153 s++; 2154 2155 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 2156 outpos = p-PyUnicode_AS_UNICODE(v); 2157 for (x = 0, i = 0; i < count; ++i, ++s) { 2158 c = (unsigned char)*s; 2159 if (!isxdigit(c)) { 2160 endinpos = s-starts; 2161 if (unicode_decode_call_errorhandler( 2162 errors, &errorHandler, 2163 "rawunicodeescape", "truncated \\uXXXX", 2164 starts, size, &startinpos, &endinpos, &exc, &s, 2165 (PyObject **)&v, &outpos, &p)) 2166 goto onError; 2167 goto nextByte; 2168 } 2169 x = (x<<4) & ~0xF; 2170 if (c >= '0' && c <= '9') 2171 x += c - '0'; 2172 else if (c >= 'a' && c <= 'f') 2173 x += 10 + c - 'a'; 2174 else 2175 x += 10 + c - 'A'; 2176 } 2177#ifndef Py_UNICODE_WIDE 2178 if (x > 0x10000) { 2179 if (unicode_decode_call_errorhandler( 2180 errors, &errorHandler, 2181 "rawunicodeescape", "\\Uxxxxxxxx out of range", 2182 starts, size, &startinpos, &endinpos, &exc, &s, 2183 (PyObject **)&v, &outpos, &p)) 2184 goto onError; 2185 } 2186#endif 2187 *p++ = x; 2188 nextByte: 2189 ; 2190 } 2191 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) 2192 goto onError; 2193 Py_XDECREF(errorHandler); 2194 Py_XDECREF(exc); 2195 return (PyObject *)v; 2196 2197 onError: 2198 Py_XDECREF(v); 2199 Py_XDECREF(errorHandler); 2200 Py_XDECREF(exc); 2201 return NULL; 2202} 2203 2204PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2205 int size) 2206{ 2207 PyObject *repr; 2208 char *p; 2209 char *q; 2210 2211 static const char *hexdigit = "0123456789abcdef"; 2212 2213#ifdef Py_UNICODE_WIDE 2214 repr = PyString_FromStringAndSize(NULL, 10 * size); 2215#else 2216 repr = PyString_FromStringAndSize(NULL, 6 * size); 2217#endif 2218 if (repr == NULL) 2219 return NULL; 2220 if (size == 0) 2221 return repr; 2222 2223 p = q = PyString_AS_STRING(repr); 2224 while (size-- > 0) { 2225 Py_UNICODE ch = *s++; 2226#ifdef Py_UNICODE_WIDE 2227 /* Map 32-bit characters to '\Uxxxxxxxx' */ 2228 if (ch >= 0x10000) { 2229 *p++ = '\\'; 2230 *p++ = 'U'; 2231 *p++ = hexdigit[(ch >> 28) & 0xf]; 2232 *p++ = hexdigit[(ch >> 24) & 0xf]; 2233 *p++ = hexdigit[(ch >> 20) & 0xf]; 2234 *p++ = hexdigit[(ch >> 16) & 0xf]; 2235 *p++ = hexdigit[(ch >> 12) & 0xf]; 2236 *p++ = hexdigit[(ch >> 8) & 0xf]; 2237 *p++ = hexdigit[(ch >> 4) & 0xf]; 2238 *p++ = hexdigit[ch & 15]; 2239 } 2240 else 2241#endif 2242 /* Map 16-bit characters to '\uxxxx' */ 2243 if (ch >= 256) { 2244 *p++ = '\\'; 2245 *p++ = 'u'; 2246 *p++ = hexdigit[(ch >> 12) & 0xf]; 2247 *p++ = hexdigit[(ch >> 8) & 0xf]; 2248 *p++ = hexdigit[(ch >> 4) & 0xf]; 2249 *p++ = hexdigit[ch & 15]; 2250 } 2251 /* Copy everything else as-is */ 2252 else 2253 *p++ = (char) ch; 2254 } 2255 *p = '\0'; 2256 _PyString_Resize(&repr, p - q); 2257 return repr; 2258} 2259 2260PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2261{ 2262 if (!PyUnicode_Check(unicode)) { 2263 PyErr_BadArgument(); 2264 return NULL; 2265 } 2266 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2267 PyUnicode_GET_SIZE(unicode)); 2268} 2269 2270/* --- Latin-1 Codec ------------------------------------------------------ */ 2271 2272PyObject *PyUnicode_DecodeLatin1(const char *s, 2273 int size, 2274 const char *errors) 2275{ 2276 PyUnicodeObject *v; 2277 Py_UNICODE *p; 2278 2279 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2280 if (size == 1) { 2281 Py_UNICODE r = *(unsigned char*)s; 2282 return PyUnicode_FromUnicode(&r, 1); 2283 } 2284 2285 v = _PyUnicode_New(size); 2286 if (v == NULL) 2287 goto onError; 2288 if (size == 0) 2289 return (PyObject *)v; 2290 p = PyUnicode_AS_UNICODE(v); 2291 while (size-- > 0) 2292 *p++ = (unsigned char)*s++; 2293 return (PyObject *)v; 2294 2295 onError: 2296 Py_XDECREF(v); 2297 return NULL; 2298} 2299 2300/* create or adjust a UnicodeEncodeError */ 2301static void make_encode_exception(PyObject **exceptionObject, 2302 const char *encoding, 2303 const Py_UNICODE *unicode, int size, 2304 int startpos, int endpos, 2305 const char *reason) 2306{ 2307 if (*exceptionObject == NULL) { 2308 *exceptionObject = PyUnicodeEncodeError_Create( 2309 encoding, unicode, size, startpos, endpos, reason); 2310 } 2311 else { 2312 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2313 goto onError; 2314 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2315 goto onError; 2316 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2317 goto onError; 2318 return; 2319 onError: 2320 Py_DECREF(*exceptionObject); 2321 *exceptionObject = NULL; 2322 } 2323} 2324 2325/* raises a UnicodeEncodeError */ 2326static void raise_encode_exception(PyObject **exceptionObject, 2327 const char *encoding, 2328 const Py_UNICODE *unicode, int size, 2329 int startpos, int endpos, 2330 const char *reason) 2331{ 2332 make_encode_exception(exceptionObject, 2333 encoding, unicode, size, startpos, endpos, reason); 2334 if (*exceptionObject != NULL) 2335 PyCodec_StrictErrors(*exceptionObject); 2336} 2337 2338/* error handling callback helper: 2339 build arguments, call the callback and check the arguments, 2340 put the result into newpos and return the replacement string, which 2341 has to be freed by the caller */ 2342static PyObject *unicode_encode_call_errorhandler(const char *errors, 2343 PyObject **errorHandler, 2344 const char *encoding, const char *reason, 2345 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 2346 int startpos, int endpos, 2347 int *newpos) 2348{ 2349 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; 2350 2351 PyObject *restuple; 2352 PyObject *resunicode; 2353 2354 if (*errorHandler == NULL) { 2355 *errorHandler = PyCodec_LookupError(errors); 2356 if (*errorHandler == NULL) 2357 return NULL; 2358 } 2359 2360 make_encode_exception(exceptionObject, 2361 encoding, unicode, size, startpos, endpos, reason); 2362 if (*exceptionObject == NULL) 2363 return NULL; 2364 2365 restuple = PyObject_CallFunctionObjArgs( 2366 *errorHandler, *exceptionObject, NULL); 2367 if (restuple == NULL) 2368 return NULL; 2369 if (!PyTuple_Check(restuple)) { 2370 PyErr_Format(PyExc_TypeError, &argparse[4]); 2371 Py_DECREF(restuple); 2372 return NULL; 2373 } 2374 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 2375 &resunicode, newpos)) { 2376 Py_DECREF(restuple); 2377 return NULL; 2378 } 2379 if (*newpos<0) 2380 *newpos = size+*newpos; 2381 if (*newpos<0 || *newpos>size) { 2382 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); 2383 Py_DECREF(restuple); 2384 return NULL; 2385 } 2386 Py_INCREF(resunicode); 2387 Py_DECREF(restuple); 2388 return resunicode; 2389} 2390 2391static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 2392 int size, 2393 const char *errors, 2394 int limit) 2395{ 2396 /* output object */ 2397 PyObject *res; 2398 /* pointers to the beginning and end+1 of input */ 2399 const Py_UNICODE *startp = p; 2400 const Py_UNICODE *endp = p + size; 2401 /* pointer to the beginning of the unencodable characters */ 2402 /* const Py_UNICODE *badp = NULL; */ 2403 /* pointer into the output */ 2404 char *str; 2405 /* current output position */ 2406 int respos = 0; 2407 int ressize; 2408 char *encoding = (limit == 256) ? "latin-1" : "ascii"; 2409 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 2410 PyObject *errorHandler = NULL; 2411 PyObject *exc = NULL; 2412 /* the following variable is used for caching string comparisons 2413 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 2414 int known_errorHandler = -1; 2415 2416 /* allocate enough for a simple encoding without 2417 replacements, if we need more, we'll resize */ 2418 res = PyString_FromStringAndSize(NULL, size); 2419 if (res == NULL) 2420 goto onError; 2421 if (size == 0) 2422 return res; 2423 str = PyString_AS_STRING(res); 2424 ressize = size; 2425 2426 while (p<endp) { 2427 Py_UNICODE c = *p; 2428 2429 /* can we encode this? */ 2430 if (c<limit) { 2431 /* no overflow check, because we know that the space is enough */ 2432 *str++ = (char)c; 2433 ++p; 2434 } 2435 else { 2436 int unicodepos = p-startp; 2437 int requiredsize; 2438 PyObject *repunicode; 2439 int repsize; 2440 int newpos; 2441 int respos; 2442 Py_UNICODE *uni2; 2443 /* startpos for collecting unencodable chars */ 2444 const Py_UNICODE *collstart = p; 2445 const Py_UNICODE *collend = p; 2446 /* find all unecodable characters */ 2447 while ((collend < endp) && ((*collend)>=limit)) 2448 ++collend; 2449 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 2450 if (known_errorHandler==-1) { 2451 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2452 known_errorHandler = 1; 2453 else if (!strcmp(errors, "replace")) 2454 known_errorHandler = 2; 2455 else if (!strcmp(errors, "ignore")) 2456 known_errorHandler = 3; 2457 else if (!strcmp(errors, "xmlcharrefreplace")) 2458 known_errorHandler = 4; 2459 else 2460 known_errorHandler = 0; 2461 } 2462 switch (known_errorHandler) { 2463 case 1: /* strict */ 2464 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 2465 goto onError; 2466 case 2: /* replace */ 2467 while (collstart++<collend) 2468 *str++ = '?'; /* fall through */ 2469 case 3: /* ignore */ 2470 p = collend; 2471 break; 2472 case 4: /* xmlcharrefreplace */ 2473 respos = str-PyString_AS_STRING(res); 2474 /* determine replacement size (temporarily (mis)uses p) */ 2475 for (p = collstart, repsize = 0; p < collend; ++p) { 2476 if (*p<10) 2477 repsize += 2+1+1; 2478 else if (*p<100) 2479 repsize += 2+2+1; 2480 else if (*p<1000) 2481 repsize += 2+3+1; 2482 else if (*p<10000) 2483 repsize += 2+4+1; 2484#ifndef Py_UNICODE_WIDE 2485 else 2486 repsize += 2+5+1; 2487#else 2488 else if (*p<100000) 2489 repsize += 2+5+1; 2490 else if (*p<1000000) 2491 repsize += 2+6+1; 2492 else 2493 repsize += 2+7+1; 2494#endif 2495 } 2496 requiredsize = respos+repsize+(endp-collend); 2497 if (requiredsize > ressize) { 2498 if (requiredsize<2*ressize) 2499 requiredsize = 2*ressize; 2500 if (_PyString_Resize(&res, requiredsize)) 2501 goto onError; 2502 str = PyString_AS_STRING(res) + respos; 2503 ressize = requiredsize; 2504 } 2505 /* generate replacement (temporarily (mis)uses p) */ 2506 for (p = collstart; p < collend; ++p) { 2507 str += sprintf(str, "&#%d;", (int)*p); 2508 } 2509 p = collend; 2510 break; 2511 default: 2512 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2513 encoding, reason, startp, size, &exc, 2514 collstart-startp, collend-startp, &newpos); 2515 if (repunicode == NULL) 2516 goto onError; 2517 /* need more space? (at least enough for what we 2518 have+the replacement+the rest of the string, so 2519 we won't have to check space for encodable characters) */ 2520 respos = str-PyString_AS_STRING(res); 2521 repsize = PyUnicode_GET_SIZE(repunicode); 2522 requiredsize = respos+repsize+(endp-collend); 2523 if (requiredsize > ressize) { 2524 if (requiredsize<2*ressize) 2525 requiredsize = 2*ressize; 2526 if (_PyString_Resize(&res, requiredsize)) { 2527 Py_DECREF(repunicode); 2528 goto onError; 2529 } 2530 str = PyString_AS_STRING(res) + respos; 2531 ressize = requiredsize; 2532 } 2533 /* check if there is anything unencodable in the replacement 2534 and copy it to the output */ 2535 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 2536 c = *uni2; 2537 if (c >= limit) { 2538 raise_encode_exception(&exc, encoding, startp, size, 2539 unicodepos, unicodepos+1, reason); 2540 Py_DECREF(repunicode); 2541 goto onError; 2542 } 2543 *str = (char)c; 2544 } 2545 p = startp + newpos; 2546 Py_DECREF(repunicode); 2547 } 2548 } 2549 } 2550 /* Resize if we allocated to much */ 2551 respos = str-PyString_AS_STRING(res); 2552 if (respos<ressize) 2553 /* If this falls res will be NULL */ 2554 _PyString_Resize(&res, respos); 2555 Py_XDECREF(errorHandler); 2556 Py_XDECREF(exc); 2557 return res; 2558 2559 onError: 2560 Py_XDECREF(res); 2561 Py_XDECREF(errorHandler); 2562 Py_XDECREF(exc); 2563 return NULL; 2564} 2565 2566PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2567 int size, 2568 const char *errors) 2569{ 2570 return unicode_encode_ucs1(p, size, errors, 256); 2571} 2572 2573PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2574{ 2575 if (!PyUnicode_Check(unicode)) { 2576 PyErr_BadArgument(); 2577 return NULL; 2578 } 2579 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2580 PyUnicode_GET_SIZE(unicode), 2581 NULL); 2582} 2583 2584/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2585 2586PyObject *PyUnicode_DecodeASCII(const char *s, 2587 int size, 2588 const char *errors) 2589{ 2590 const char *starts = s; 2591 PyUnicodeObject *v; 2592 Py_UNICODE *p; 2593 int startinpos; 2594 int endinpos; 2595 int outpos; 2596 const char *e; 2597 PyObject *errorHandler = NULL; 2598 PyObject *exc = NULL; 2599 2600 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2601 if (size == 1 && *(unsigned char*)s < 128) { 2602 Py_UNICODE r = *(unsigned char*)s; 2603 return PyUnicode_FromUnicode(&r, 1); 2604 } 2605 2606 v = _PyUnicode_New(size); 2607 if (v == NULL) 2608 goto onError; 2609 if (size == 0) 2610 return (PyObject *)v; 2611 p = PyUnicode_AS_UNICODE(v); 2612 e = s + size; 2613 while (s < e) { 2614 register unsigned char c = (unsigned char)*s; 2615 if (c < 128) { 2616 *p++ = c; 2617 ++s; 2618 } 2619 else { 2620 startinpos = s-starts; 2621 endinpos = startinpos + 1; 2622 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 2623 if (unicode_decode_call_errorhandler( 2624 errors, &errorHandler, 2625 "ascii", "ordinal not in range(128)", 2626 starts, size, &startinpos, &endinpos, &exc, &s, 2627 (PyObject **)&v, &outpos, &p)) 2628 goto onError; 2629 } 2630 } 2631 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2632 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) 2633 goto onError; 2634 Py_XDECREF(errorHandler); 2635 Py_XDECREF(exc); 2636 return (PyObject *)v; 2637 2638 onError: 2639 Py_XDECREF(v); 2640 Py_XDECREF(errorHandler); 2641 Py_XDECREF(exc); 2642 return NULL; 2643} 2644 2645PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2646 int size, 2647 const char *errors) 2648{ 2649 return unicode_encode_ucs1(p, size, errors, 128); 2650} 2651 2652PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2653{ 2654 if (!PyUnicode_Check(unicode)) { 2655 PyErr_BadArgument(); 2656 return NULL; 2657 } 2658 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2659 PyUnicode_GET_SIZE(unicode), 2660 NULL); 2661} 2662 2663#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 2664 2665/* --- MBCS codecs for Windows -------------------------------------------- */ 2666 2667PyObject *PyUnicode_DecodeMBCS(const char *s, 2668 int size, 2669 const char *errors) 2670{ 2671 PyUnicodeObject *v; 2672 Py_UNICODE *p; 2673 2674 /* First get the size of the result */ 2675 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2676 if (size > 0 && usize==0) 2677 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2678 2679 v = _PyUnicode_New(usize); 2680 if (v == NULL) 2681 return NULL; 2682 if (usize == 0) 2683 return (PyObject *)v; 2684 p = PyUnicode_AS_UNICODE(v); 2685 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2686 Py_DECREF(v); 2687 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2688 } 2689 2690 return (PyObject *)v; 2691} 2692 2693PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2694 int size, 2695 const char *errors) 2696{ 2697 PyObject *repr; 2698 char *s; 2699 DWORD mbcssize; 2700 2701 /* If there are no characters, bail now! */ 2702 if (size==0) 2703 return PyString_FromString(""); 2704 2705 /* First get the size of the result */ 2706 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2707 if (mbcssize==0) 2708 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2709 2710 repr = PyString_FromStringAndSize(NULL, mbcssize); 2711 if (repr == NULL) 2712 return NULL; 2713 if (mbcssize == 0) 2714 return repr; 2715 2716 /* Do the conversion */ 2717 s = PyString_AS_STRING(repr); 2718 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2719 Py_DECREF(repr); 2720 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2721 } 2722 return repr; 2723} 2724 2725PyObject *PyUnicode_AsMBCSString(PyObject *unicode) 2726{ 2727 if (!PyUnicode_Check(unicode)) { 2728 PyErr_BadArgument(); 2729 return NULL; 2730 } 2731 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2732 PyUnicode_GET_SIZE(unicode), 2733 NULL); 2734} 2735 2736#endif /* MS_WINDOWS */ 2737 2738/* --- Character Mapping Codec -------------------------------------------- */ 2739 2740PyObject *PyUnicode_DecodeCharmap(const char *s, 2741 int size, 2742 PyObject *mapping, 2743 const char *errors) 2744{ 2745 const char *starts = s; 2746 int startinpos; 2747 int endinpos; 2748 int outpos; 2749 const char *e; 2750 PyUnicodeObject *v; 2751 Py_UNICODE *p; 2752 int extrachars = 0; 2753 PyObject *errorHandler = NULL; 2754 PyObject *exc = NULL; 2755 2756 /* Default to Latin-1 */ 2757 if (mapping == NULL) 2758 return PyUnicode_DecodeLatin1(s, size, errors); 2759 2760 v = _PyUnicode_New(size); 2761 if (v == NULL) 2762 goto onError; 2763 if (size == 0) 2764 return (PyObject *)v; 2765 p = PyUnicode_AS_UNICODE(v); 2766 e = s + size; 2767 while (s < e) { 2768 unsigned char ch = *s; 2769 PyObject *w, *x; 2770 2771 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2772 w = PyInt_FromLong((long)ch); 2773 if (w == NULL) 2774 goto onError; 2775 x = PyObject_GetItem(mapping, w); 2776 Py_DECREF(w); 2777 if (x == NULL) { 2778 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2779 /* No mapping found means: mapping is undefined. */ 2780 PyErr_Clear(); 2781 x = Py_None; 2782 Py_INCREF(x); 2783 } else 2784 goto onError; 2785 } 2786 2787 /* Apply mapping */ 2788 if (PyInt_Check(x)) { 2789 long value = PyInt_AS_LONG(x); 2790 if (value < 0 || value > 65535) { 2791 PyErr_SetString(PyExc_TypeError, 2792 "character mapping must be in range(65536)"); 2793 Py_DECREF(x); 2794 goto onError; 2795 } 2796 *p++ = (Py_UNICODE)value; 2797 } 2798 else if (x == Py_None) { 2799 /* undefined mapping */ 2800 outpos = p-PyUnicode_AS_UNICODE(v); 2801 startinpos = s-starts; 2802 endinpos = startinpos+1; 2803 if (unicode_decode_call_errorhandler( 2804 errors, &errorHandler, 2805 "charmap", "character maps to <undefined>", 2806 starts, size, &startinpos, &endinpos, &exc, &s, 2807 (PyObject **)&v, &outpos, &p)) { 2808 Py_DECREF(x); 2809 goto onError; 2810 } 2811 continue; 2812 } 2813 else if (PyUnicode_Check(x)) { 2814 int targetsize = PyUnicode_GET_SIZE(x); 2815 2816 if (targetsize == 1) 2817 /* 1-1 mapping */ 2818 *p++ = *PyUnicode_AS_UNICODE(x); 2819 2820 else if (targetsize > 1) { 2821 /* 1-n mapping */ 2822 if (targetsize > extrachars) { 2823 /* resize first */ 2824 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2825 int needed = (targetsize - extrachars) + \ 2826 (targetsize << 2); 2827 extrachars += needed; 2828 if (_PyUnicode_Resize(&v, 2829 PyUnicode_GET_SIZE(v) + needed) < 0) { 2830 Py_DECREF(x); 2831 goto onError; 2832 } 2833 p = PyUnicode_AS_UNICODE(v) + oldpos; 2834 } 2835 Py_UNICODE_COPY(p, 2836 PyUnicode_AS_UNICODE(x), 2837 targetsize); 2838 p += targetsize; 2839 extrachars -= targetsize; 2840 } 2841 /* 1-0 mapping: skip the character */ 2842 } 2843 else { 2844 /* wrong return value */ 2845 PyErr_SetString(PyExc_TypeError, 2846 "character mapping must return integer, None or unicode"); 2847 Py_DECREF(x); 2848 goto onError; 2849 } 2850 Py_DECREF(x); 2851 ++s; 2852 } 2853 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2854 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) 2855 goto onError; 2856 Py_XDECREF(errorHandler); 2857 Py_XDECREF(exc); 2858 return (PyObject *)v; 2859 2860 onError: 2861 Py_XDECREF(errorHandler); 2862 Py_XDECREF(exc); 2863 Py_XDECREF(v); 2864 return NULL; 2865} 2866 2867/* Lookup the character ch in the mapping. If the character 2868 can't be found, Py_None is returned (or NULL, if another 2869 error occured). */ 2870static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 2871{ 2872 PyObject *w = PyInt_FromLong((long)c); 2873 PyObject *x; 2874 2875 if (w == NULL) 2876 return NULL; 2877 x = PyObject_GetItem(mapping, w); 2878 Py_DECREF(w); 2879 if (x == NULL) { 2880 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2881 /* No mapping found means: mapping is undefined. */ 2882 PyErr_Clear(); 2883 x = Py_None; 2884 Py_INCREF(x); 2885 return x; 2886 } else 2887 return NULL; 2888 } 2889 else if (x == Py_None) 2890 return x; 2891 else if (PyInt_Check(x)) { 2892 long value = PyInt_AS_LONG(x); 2893 if (value < 0 || value > 255) { 2894 PyErr_SetString(PyExc_TypeError, 2895 "character mapping must be in range(256)"); 2896 Py_DECREF(x); 2897 return NULL; 2898 } 2899 return x; 2900 } 2901 else if (PyString_Check(x)) 2902 return x; 2903 else { 2904 /* wrong return value */ 2905 PyErr_SetString(PyExc_TypeError, 2906 "character mapping must return integer, None or str"); 2907 Py_DECREF(x); 2908 return NULL; 2909 } 2910} 2911 2912/* lookup the character, put the result in the output string and adjust 2913 various state variables. Reallocate the output string if not enough 2914 space is available. Return a new reference to the object that 2915 was put in the output buffer, or Py_None, if the mapping was undefined 2916 (in which case no character was written) or NULL, if a 2917 reallocation error ocurred. The called must decref the result */ 2918static 2919PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, 2920 PyObject **outobj, int *outpos) 2921{ 2922 PyObject *rep = charmapencode_lookup(c, mapping); 2923 2924 if (rep==NULL) 2925 return NULL; 2926 else if (rep==Py_None) 2927 return rep; 2928 else { 2929 char *outstart = PyString_AS_STRING(*outobj); 2930 int outsize = PyString_GET_SIZE(*outobj); 2931 if (PyInt_Check(rep)) { 2932 int requiredsize = *outpos+1; 2933 if (outsize<requiredsize) { 2934 /* exponentially overallocate to minimize reallocations */ 2935 if (requiredsize < 2*outsize) 2936 requiredsize = 2*outsize; 2937 if (_PyString_Resize(outobj, requiredsize)) { 2938 Py_DECREF(rep); 2939 return NULL; 2940 } 2941 outstart = PyString_AS_STRING(*outobj); 2942 } 2943 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 2944 } 2945 else { 2946 const char *repchars = PyString_AS_STRING(rep); 2947 int repsize = PyString_GET_SIZE(rep); 2948 int requiredsize = *outpos+repsize; 2949 if (outsize<requiredsize) { 2950 /* exponentially overallocate to minimize reallocations */ 2951 if (requiredsize < 2*outsize) 2952 requiredsize = 2*outsize; 2953 if (_PyString_Resize(outobj, requiredsize)) { 2954 Py_DECREF(rep); 2955 return NULL; 2956 } 2957 outstart = PyString_AS_STRING(*outobj); 2958 } 2959 memcpy(outstart + *outpos, repchars, repsize); 2960 *outpos += repsize; 2961 } 2962 } 2963 return rep; 2964} 2965 2966/* handle an error in PyUnicode_EncodeCharmap 2967 Return 0 on success, -1 on error */ 2968static 2969int charmap_encoding_error( 2970 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping, 2971 PyObject **exceptionObject, 2972 int *known_errorHandler, PyObject **errorHandler, const char *errors, 2973 PyObject **res, int *respos) 2974{ 2975 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 2976 int repsize; 2977 int newpos; 2978 Py_UNICODE *uni2; 2979 /* startpos for collecting unencodable chars */ 2980 int collstartpos = *inpos; 2981 int collendpos = *inpos+1; 2982 int collpos; 2983 char *encoding = "charmap"; 2984 char *reason = "character maps to <undefined>"; 2985 2986 PyObject *x; 2987 /* find all unencodable characters */ 2988 while (collendpos < size) { 2989 x = charmapencode_lookup(p[collendpos], mapping); 2990 if (x==NULL) 2991 return -1; 2992 else if (x!=Py_None) { 2993 Py_DECREF(x); 2994 break; 2995 } 2996 Py_DECREF(x); 2997 ++collendpos; 2998 } 2999 /* cache callback name lookup 3000 * (if not done yet, i.e. it's the first error) */ 3001 if (*known_errorHandler==-1) { 3002 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3003 *known_errorHandler = 1; 3004 else if (!strcmp(errors, "replace")) 3005 *known_errorHandler = 2; 3006 else if (!strcmp(errors, "ignore")) 3007 *known_errorHandler = 3; 3008 else if (!strcmp(errors, "xmlcharrefreplace")) 3009 *known_errorHandler = 4; 3010 else 3011 *known_errorHandler = 0; 3012 } 3013 switch (*known_errorHandler) { 3014 case 1: /* strict */ 3015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3016 return -1; 3017 case 2: /* replace */ 3018 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 3019 x = charmapencode_output('?', mapping, res, respos); 3020 if (x==NULL) { 3021 return -1; 3022 } 3023 else if (x==Py_None) { 3024 Py_DECREF(x); 3025 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3026 return -1; 3027 } 3028 Py_DECREF(x); 3029 } 3030 /* fall through */ 3031 case 3: /* ignore */ 3032 *inpos = collendpos; 3033 break; 3034 case 4: /* xmlcharrefreplace */ 3035 /* generate replacement (temporarily (mis)uses p) */ 3036 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 3037 char buffer[2+29+1+1]; 3038 char *cp; 3039 sprintf(buffer, "&#%d;", (int)p[collpos]); 3040 for (cp = buffer; *cp; ++cp) { 3041 x = charmapencode_output(*cp, mapping, res, respos); 3042 if (x==NULL) 3043 return -1; 3044 else if (x==Py_None) { 3045 Py_DECREF(x); 3046 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3047 return -1; 3048 } 3049 Py_DECREF(x); 3050 } 3051 } 3052 *inpos = collendpos; 3053 break; 3054 default: 3055 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 3056 encoding, reason, p, size, exceptionObject, 3057 collstartpos, collendpos, &newpos); 3058 if (repunicode == NULL) 3059 return -1; 3060 /* generate replacement */ 3061 repsize = PyUnicode_GET_SIZE(repunicode); 3062 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 3063 x = charmapencode_output(*uni2, mapping, res, respos); 3064 if (x==NULL) { 3065 Py_DECREF(repunicode); 3066 return -1; 3067 } 3068 else if (x==Py_None) { 3069 Py_DECREF(repunicode); 3070 Py_DECREF(x); 3071 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 3072 return -1; 3073 } 3074 Py_DECREF(x); 3075 } 3076 *inpos = newpos; 3077 Py_DECREF(repunicode); 3078 } 3079 return 0; 3080} 3081 3082PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 3083 int size, 3084 PyObject *mapping, 3085 const char *errors) 3086{ 3087 /* output object */ 3088 PyObject *res = NULL; 3089 /* current input position */ 3090 int inpos = 0; 3091 /* current output position */ 3092 int respos = 0; 3093 PyObject *errorHandler = NULL; 3094 PyObject *exc = NULL; 3095 /* the following variable is used for caching string comparisons 3096 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3097 * 3=ignore, 4=xmlcharrefreplace */ 3098 int known_errorHandler = -1; 3099 3100 /* Default to Latin-1 */ 3101 if (mapping == NULL) 3102 return PyUnicode_EncodeLatin1(p, size, errors); 3103 3104 /* allocate enough for a simple encoding without 3105 replacements, if we need more, we'll resize */ 3106 res = PyString_FromStringAndSize(NULL, size); 3107 if (res == NULL) 3108 goto onError; 3109 if (size == 0) 3110 return res; 3111 3112 while (inpos<size) { 3113 /* try to encode it */ 3114 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos); 3115 if (x==NULL) /* error */ 3116 goto onError; 3117 if (x==Py_None) { /* unencodable character */ 3118 if (charmap_encoding_error(p, size, &inpos, mapping, 3119 &exc, 3120 &known_errorHandler, &errorHandler, errors, 3121 &res, &respos)) { 3122 Py_DECREF(x); 3123 goto onError; 3124 } 3125 } 3126 else 3127 /* done with this character => adjust input position */ 3128 ++inpos; 3129 Py_DECREF(x); 3130 } 3131 3132 /* Resize if we allocated to much */ 3133 if (respos<PyString_GET_SIZE(res)) { 3134 if (_PyString_Resize(&res, respos)) 3135 goto onError; 3136 } 3137 Py_XDECREF(exc); 3138 Py_XDECREF(errorHandler); 3139 return res; 3140 3141 onError: 3142 Py_XDECREF(res); 3143 Py_XDECREF(exc); 3144 Py_XDECREF(errorHandler); 3145 return NULL; 3146} 3147 3148PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 3149 PyObject *mapping) 3150{ 3151 if (!PyUnicode_Check(unicode) || mapping == NULL) { 3152 PyErr_BadArgument(); 3153 return NULL; 3154 } 3155 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 3156 PyUnicode_GET_SIZE(unicode), 3157 mapping, 3158 NULL); 3159} 3160 3161/* create or adjust a UnicodeTranslateError */ 3162static void make_translate_exception(PyObject **exceptionObject, 3163 const Py_UNICODE *unicode, int size, 3164 int startpos, int endpos, 3165 const char *reason) 3166{ 3167 if (*exceptionObject == NULL) { 3168 *exceptionObject = PyUnicodeTranslateError_Create( 3169 unicode, size, startpos, endpos, reason); 3170 } 3171 else { 3172 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 3173 goto onError; 3174 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 3175 goto onError; 3176 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 3177 goto onError; 3178 return; 3179 onError: 3180 Py_DECREF(*exceptionObject); 3181 *exceptionObject = NULL; 3182 } 3183} 3184 3185/* raises a UnicodeTranslateError */ 3186static void raise_translate_exception(PyObject **exceptionObject, 3187 const Py_UNICODE *unicode, int size, 3188 int startpos, int endpos, 3189 const char *reason) 3190{ 3191 make_translate_exception(exceptionObject, 3192 unicode, size, startpos, endpos, reason); 3193 if (*exceptionObject != NULL) 3194 PyCodec_StrictErrors(*exceptionObject); 3195} 3196 3197/* error handling callback helper: 3198 build arguments, call the callback and check the arguments, 3199 put the result into newpos and return the replacement string, which 3200 has to be freed by the caller */ 3201static PyObject *unicode_translate_call_errorhandler(const char *errors, 3202 PyObject **errorHandler, 3203 const char *reason, 3204 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 3205 int startpos, int endpos, 3206 int *newpos) 3207{ 3208 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple"; 3209 3210 PyObject *restuple; 3211 PyObject *resunicode; 3212 3213 if (*errorHandler == NULL) { 3214 *errorHandler = PyCodec_LookupError(errors); 3215 if (*errorHandler == NULL) 3216 return NULL; 3217 } 3218 3219 make_translate_exception(exceptionObject, 3220 unicode, size, startpos, endpos, reason); 3221 if (*exceptionObject == NULL) 3222 return NULL; 3223 3224 restuple = PyObject_CallFunctionObjArgs( 3225 *errorHandler, *exceptionObject, NULL); 3226 if (restuple == NULL) 3227 return NULL; 3228 if (!PyTuple_Check(restuple)) { 3229 PyErr_Format(PyExc_TypeError, &argparse[4]); 3230 Py_DECREF(restuple); 3231 return NULL; 3232 } 3233 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3234 &resunicode, newpos)) { 3235 Py_DECREF(restuple); 3236 return NULL; 3237 } 3238 if (*newpos<0) 3239 *newpos = size+*newpos; 3240 if (*newpos<0 || *newpos>size) { 3241 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); 3242 Py_DECREF(restuple); 3243 return NULL; 3244 } 3245 Py_INCREF(resunicode); 3246 Py_DECREF(restuple); 3247 return resunicode; 3248} 3249 3250/* Lookup the character ch in the mapping and put the result in result, 3251 which must be decrefed by the caller. 3252 Return 0 on success, -1 on error */ 3253static 3254int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 3255{ 3256 PyObject *w = PyInt_FromLong((long)c); 3257 PyObject *x; 3258 3259 if (w == NULL) 3260 return -1; 3261 x = PyObject_GetItem(mapping, w); 3262 Py_DECREF(w); 3263 if (x == NULL) { 3264 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3265 /* No mapping found means: use 1:1 mapping. */ 3266 PyErr_Clear(); 3267 *result = NULL; 3268 return 0; 3269 } else 3270 return -1; 3271 } 3272 else if (x == Py_None) { 3273 *result = x; 3274 return 0; 3275 } 3276 else if (PyInt_Check(x)) { 3277 long value = PyInt_AS_LONG(x); 3278 long max = PyUnicode_GetMax(); 3279 if (value < 0 || value > max) { 3280 PyErr_Format(PyExc_TypeError, 3281 "character mapping must be in range(0x%lx)", max+1); 3282 Py_DECREF(x); 3283 return -1; 3284 } 3285 *result = x; 3286 return 0; 3287 } 3288 else if (PyUnicode_Check(x)) { 3289 *result = x; 3290 return 0; 3291 } 3292 else { 3293 /* wrong return value */ 3294 PyErr_SetString(PyExc_TypeError, 3295 "character mapping must return integer, None or unicode"); 3296 Py_DECREF(x); 3297 return -1; 3298 } 3299} 3300/* ensure that *outobj is at least requiredsize characters long, 3301if not reallocate and adjust various state variables. 3302Return 0 on success, -1 on error */ 3303static 3304int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 3305 int requiredsize) 3306{ 3307 int oldsize = PyUnicode_GET_SIZE(*outobj); 3308 if (requiredsize > oldsize) { 3309 /* remember old output position */ 3310 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 3311 /* exponentially overallocate to minimize reallocations */ 3312 if (requiredsize < 2 * oldsize) 3313 requiredsize = 2 * oldsize; 3314 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 3315 return -1; 3316 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 3317 } 3318 return 0; 3319} 3320/* lookup the character, put the result in the output string and adjust 3321 various state variables. Return a new reference to the object that 3322 was put in the output buffer in *result, or Py_None, if the mapping was 3323 undefined (in which case no character was written). 3324 The called must decref result. 3325 Return 0 on success, -1 on error. */ 3326static 3327int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 3328 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 3329 PyObject **res) 3330{ 3331 if (charmaptranslate_lookup(*curinp, mapping, res)) 3332 return -1; 3333 if (*res==NULL) { 3334 /* not found => default to 1:1 mapping */ 3335 *(*outp)++ = *curinp; 3336 } 3337 else if (*res==Py_None) 3338 ; 3339 else if (PyInt_Check(*res)) { 3340 /* no overflow check, because we know that the space is enough */ 3341 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 3342 } 3343 else if (PyUnicode_Check(*res)) { 3344 int repsize = PyUnicode_GET_SIZE(*res); 3345 if (repsize==1) { 3346 /* no overflow check, because we know that the space is enough */ 3347 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 3348 } 3349 else if (repsize!=0) { 3350 /* more than one character */ 3351 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 3352 (insize - (curinp-startinp)) + 3353 repsize - 1; 3354 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 3355 return -1; 3356 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 3357 *outp += repsize; 3358 } 3359 } 3360 else 3361 return -1; 3362 return 0; 3363} 3364 3365PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 3366 int size, 3367 PyObject *mapping, 3368 const char *errors) 3369{ 3370 /* output object */ 3371 PyObject *res = NULL; 3372 /* pointers to the beginning and end+1 of input */ 3373 const Py_UNICODE *startp = p; 3374 const Py_UNICODE *endp = p + size; 3375 /* pointer into the output */ 3376 Py_UNICODE *str; 3377 /* current output position */ 3378 int respos = 0; 3379 char *reason = "character maps to <undefined>"; 3380 PyObject *errorHandler = NULL; 3381 PyObject *exc = NULL; 3382 /* the following variable is used for caching string comparisons 3383 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3384 * 3=ignore, 4=xmlcharrefreplace */ 3385 int known_errorHandler = -1; 3386 3387 if (mapping == NULL) { 3388 PyErr_BadArgument(); 3389 return NULL; 3390 } 3391 3392 /* allocate enough for a simple 1:1 translation without 3393 replacements, if we need more, we'll resize */ 3394 res = PyUnicode_FromUnicode(NULL, size); 3395 if (res == NULL) 3396 goto onError; 3397 if (size == 0) 3398 return res; 3399 str = PyUnicode_AS_UNICODE(res); 3400 3401 while (p<endp) { 3402 /* try to encode it */ 3403 PyObject *x = NULL; 3404 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 3405 Py_XDECREF(x); 3406 goto onError; 3407 } 3408 Py_XDECREF(x); 3409 if (x!=Py_None) /* it worked => adjust input pointer */ 3410 ++p; 3411 else { /* untranslatable character */ 3412 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 3413 int repsize; 3414 int newpos; 3415 Py_UNICODE *uni2; 3416 /* startpos for collecting untranslatable chars */ 3417 const Py_UNICODE *collstart = p; 3418 const Py_UNICODE *collend = p+1; 3419 const Py_UNICODE *coll; 3420 3421 /* find all untranslatable characters */ 3422 while (collend < endp) { 3423 if (charmaptranslate_lookup(*collend, mapping, &x)) 3424 goto onError; 3425 Py_XDECREF(x); 3426 if (x!=Py_None) 3427 break; 3428 ++collend; 3429 } 3430 /* cache callback name lookup 3431 * (if not done yet, i.e. it's the first error) */ 3432 if (known_errorHandler==-1) { 3433 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3434 known_errorHandler = 1; 3435 else if (!strcmp(errors, "replace")) 3436 known_errorHandler = 2; 3437 else if (!strcmp(errors, "ignore")) 3438 known_errorHandler = 3; 3439 else if (!strcmp(errors, "xmlcharrefreplace")) 3440 known_errorHandler = 4; 3441 else 3442 known_errorHandler = 0; 3443 } 3444 switch (known_errorHandler) { 3445 case 1: /* strict */ 3446 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 3447 goto onError; 3448 case 2: /* replace */ 3449 /* No need to check for space, this is a 1:1 replacement */ 3450 for (coll = collstart; coll<collend; ++coll) 3451 *str++ = '?'; 3452 /* fall through */ 3453 case 3: /* ignore */ 3454 p = collend; 3455 break; 3456 case 4: /* xmlcharrefreplace */ 3457 /* generate replacement (temporarily (mis)uses p) */ 3458 for (p = collstart; p < collend; ++p) { 3459 char buffer[2+29+1+1]; 3460 char *cp; 3461 sprintf(buffer, "&#%d;", (int)*p); 3462 if (charmaptranslate_makespace(&res, &str, 3463 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 3464 goto onError; 3465 for (cp = buffer; *cp; ++cp) 3466 *str++ = *cp; 3467 } 3468 p = collend; 3469 break; 3470 default: 3471 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 3472 reason, startp, size, &exc, 3473 collstart-startp, collend-startp, &newpos); 3474 if (repunicode == NULL) 3475 goto onError; 3476 /* generate replacement */ 3477 repsize = PyUnicode_GET_SIZE(repunicode); 3478 if (charmaptranslate_makespace(&res, &str, 3479 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 3480 Py_DECREF(repunicode); 3481 goto onError; 3482 } 3483 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 3484 *str++ = *uni2; 3485 p = startp + newpos; 3486 Py_DECREF(repunicode); 3487 } 3488 } 3489 } 3490 /* Resize if we allocated to much */ 3491 respos = str-PyUnicode_AS_UNICODE(res); 3492 if (respos<PyUnicode_GET_SIZE(res)) { 3493 if (_PyUnicode_Resize(&res, respos) < 0) 3494 goto onError; 3495 } 3496 Py_XDECREF(exc); 3497 Py_XDECREF(errorHandler); 3498 return res; 3499 3500 onError: 3501 Py_XDECREF(res); 3502 Py_XDECREF(exc); 3503 Py_XDECREF(errorHandler); 3504 return NULL; 3505} 3506 3507PyObject *PyUnicode_Translate(PyObject *str, 3508 PyObject *mapping, 3509 const char *errors) 3510{ 3511 PyObject *result; 3512 3513 str = PyUnicode_FromObject(str); 3514 if (str == NULL) 3515 goto onError; 3516 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 3517 PyUnicode_GET_SIZE(str), 3518 mapping, 3519 errors); 3520 Py_DECREF(str); 3521 return result; 3522 3523 onError: 3524 Py_XDECREF(str); 3525 return NULL; 3526} 3527 3528/* --- Decimal Encoder ---------------------------------------------------- */ 3529 3530int PyUnicode_EncodeDecimal(Py_UNICODE *s, 3531 int length, 3532 char *output, 3533 const char *errors) 3534{ 3535 Py_UNICODE *p, *end; 3536 PyObject *errorHandler = NULL; 3537 PyObject *exc = NULL; 3538 const char *encoding = "decimal"; 3539 const char *reason = "invalid decimal Unicode string"; 3540 /* the following variable is used for caching string comparisons 3541 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3542 int known_errorHandler = -1; 3543 3544 if (output == NULL) { 3545 PyErr_BadArgument(); 3546 return -1; 3547 } 3548 3549 p = s; 3550 end = s + length; 3551 while (p < end) { 3552 register Py_UNICODE ch = *p; 3553 int decimal; 3554 PyObject *repunicode; 3555 int repsize; 3556 int newpos; 3557 Py_UNICODE *uni2; 3558 Py_UNICODE *collstart; 3559 Py_UNICODE *collend; 3560 3561 if (Py_UNICODE_ISSPACE(ch)) { 3562 *output++ = ' '; 3563 ++p; 3564 continue; 3565 } 3566 decimal = Py_UNICODE_TODECIMAL(ch); 3567 if (decimal >= 0) { 3568 *output++ = '0' + decimal; 3569 ++p; 3570 continue; 3571 } 3572 if (0 < ch && ch < 256) { 3573 *output++ = (char)ch; 3574 ++p; 3575 continue; 3576 } 3577 /* All other characters are considered unencodable */ 3578 collstart = p; 3579 collend = p+1; 3580 while (collend < end) { 3581 if ((0 < *collend && *collend < 256) || 3582 !Py_UNICODE_ISSPACE(*collend) || 3583 Py_UNICODE_TODECIMAL(*collend)) 3584 break; 3585 } 3586 /* cache callback name lookup 3587 * (if not done yet, i.e. it's the first error) */ 3588 if (known_errorHandler==-1) { 3589 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3590 known_errorHandler = 1; 3591 else if (!strcmp(errors, "replace")) 3592 known_errorHandler = 2; 3593 else if (!strcmp(errors, "ignore")) 3594 known_errorHandler = 3; 3595 else if (!strcmp(errors, "xmlcharrefreplace")) 3596 known_errorHandler = 4; 3597 else 3598 known_errorHandler = 0; 3599 } 3600 switch (known_errorHandler) { 3601 case 1: /* strict */ 3602 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 3603 goto onError; 3604 case 2: /* replace */ 3605 for (p = collstart; p < collend; ++p) 3606 *output++ = '?'; 3607 /* fall through */ 3608 case 3: /* ignore */ 3609 p = collend; 3610 break; 3611 case 4: /* xmlcharrefreplace */ 3612 /* generate replacement (temporarily (mis)uses p) */ 3613 for (p = collstart; p < collend; ++p) 3614 output += sprintf(output, "&#%d;", (int)*p); 3615 p = collend; 3616 break; 3617 default: 3618 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3619 encoding, reason, s, length, &exc, 3620 collstart-s, collend-s, &newpos); 3621 if (repunicode == NULL) 3622 goto onError; 3623 /* generate replacement */ 3624 repsize = PyUnicode_GET_SIZE(repunicode); 3625 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 3626 Py_UNICODE ch = *uni2; 3627 if (Py_UNICODE_ISSPACE(ch)) 3628 *output++ = ' '; 3629 else { 3630 decimal = Py_UNICODE_TODECIMAL(ch); 3631 if (decimal >= 0) 3632 *output++ = '0' + decimal; 3633 else if (0 < ch && ch < 256) 3634 *output++ = (char)ch; 3635 else { 3636 Py_DECREF(repunicode); 3637 raise_encode_exception(&exc, encoding, 3638 s, length, collstart-s, collend-s, reason); 3639 goto onError; 3640 } 3641 } 3642 } 3643 p = s + newpos; 3644 Py_DECREF(repunicode); 3645 } 3646 } 3647 /* 0-terminate the output string */ 3648 *output++ = '\0'; 3649 Py_XDECREF(exc); 3650 Py_XDECREF(errorHandler); 3651 return 0; 3652 3653 onError: 3654 Py_XDECREF(exc); 3655 Py_XDECREF(errorHandler); 3656 return -1; 3657} 3658 3659/* --- Helpers ------------------------------------------------------------ */ 3660 3661static 3662int count(PyUnicodeObject *self, 3663 int start, 3664 int end, 3665 PyUnicodeObject *substring) 3666{ 3667 int count = 0; 3668 3669 if (start < 0) 3670 start += self->length; 3671 if (start < 0) 3672 start = 0; 3673 if (end > self->length) 3674 end = self->length; 3675 if (end < 0) 3676 end += self->length; 3677 if (end < 0) 3678 end = 0; 3679 3680 if (substring->length == 0) 3681 return (end - start + 1); 3682 3683 end -= substring->length; 3684 3685 while (start <= end) 3686 if (Py_UNICODE_MATCH(self, start, substring)) { 3687 count++; 3688 start += substring->length; 3689 } else 3690 start++; 3691 3692 return count; 3693} 3694 3695int PyUnicode_Count(PyObject *str, 3696 PyObject *substr, 3697 int start, 3698 int end) 3699{ 3700 int result; 3701 3702 str = PyUnicode_FromObject(str); 3703 if (str == NULL) 3704 return -1; 3705 substr = PyUnicode_FromObject(substr); 3706 if (substr == NULL) { 3707 Py_DECREF(str); 3708 return -1; 3709 } 3710 3711 result = count((PyUnicodeObject *)str, 3712 start, end, 3713 (PyUnicodeObject *)substr); 3714 3715 Py_DECREF(str); 3716 Py_DECREF(substr); 3717 return result; 3718} 3719 3720static 3721int findstring(PyUnicodeObject *self, 3722 PyUnicodeObject *substring, 3723 int start, 3724 int end, 3725 int direction) 3726{ 3727 if (start < 0) 3728 start += self->length; 3729 if (start < 0) 3730 start = 0; 3731 3732 if (end > self->length) 3733 end = self->length; 3734 if (end < 0) 3735 end += self->length; 3736 if (end < 0) 3737 end = 0; 3738 3739 if (substring->length == 0) 3740 return (direction > 0) ? start : end; 3741 3742 end -= substring->length; 3743 3744 if (direction < 0) { 3745 for (; end >= start; end--) 3746 if (Py_UNICODE_MATCH(self, end, substring)) 3747 return end; 3748 } else { 3749 for (; start <= end; start++) 3750 if (Py_UNICODE_MATCH(self, start, substring)) 3751 return start; 3752 } 3753 3754 return -1; 3755} 3756 3757int PyUnicode_Find(PyObject *str, 3758 PyObject *substr, 3759 int start, 3760 int end, 3761 int direction) 3762{ 3763 int result; 3764 3765 str = PyUnicode_FromObject(str); 3766 if (str == NULL) 3767 return -2; 3768 substr = PyUnicode_FromObject(substr); 3769 if (substr == NULL) { 3770 Py_DECREF(str); 3771 return -2; 3772 } 3773 3774 result = findstring((PyUnicodeObject *)str, 3775 (PyUnicodeObject *)substr, 3776 start, end, direction); 3777 Py_DECREF(str); 3778 Py_DECREF(substr); 3779 return result; 3780} 3781 3782static 3783int tailmatch(PyUnicodeObject *self, 3784 PyUnicodeObject *substring, 3785 int start, 3786 int end, 3787 int direction) 3788{ 3789 if (start < 0) 3790 start += self->length; 3791 if (start < 0) 3792 start = 0; 3793 3794 if (substring->length == 0) 3795 return 1; 3796 3797 if (end > self->length) 3798 end = self->length; 3799 if (end < 0) 3800 end += self->length; 3801 if (end < 0) 3802 end = 0; 3803 3804 end -= substring->length; 3805 if (end < start) 3806 return 0; 3807 3808 if (direction > 0) { 3809 if (Py_UNICODE_MATCH(self, end, substring)) 3810 return 1; 3811 } else { 3812 if (Py_UNICODE_MATCH(self, start, substring)) 3813 return 1; 3814 } 3815 3816 return 0; 3817} 3818 3819int PyUnicode_Tailmatch(PyObject *str, 3820 PyObject *substr, 3821 int start, 3822 int end, 3823 int direction) 3824{ 3825 int result; 3826 3827 str = PyUnicode_FromObject(str); 3828 if (str == NULL) 3829 return -1; 3830 substr = PyUnicode_FromObject(substr); 3831 if (substr == NULL) { 3832 Py_DECREF(substr); 3833 return -1; 3834 } 3835 3836 result = tailmatch((PyUnicodeObject *)str, 3837 (PyUnicodeObject *)substr, 3838 start, end, direction); 3839 Py_DECREF(str); 3840 Py_DECREF(substr); 3841 return result; 3842} 3843 3844static 3845const Py_UNICODE *findchar(const Py_UNICODE *s, 3846 int size, 3847 Py_UNICODE ch) 3848{ 3849 /* like wcschr, but doesn't stop at NULL characters */ 3850 3851 while (size-- > 0) { 3852 if (*s == ch) 3853 return s; 3854 s++; 3855 } 3856 3857 return NULL; 3858} 3859 3860/* Apply fixfct filter to the Unicode object self and return a 3861 reference to the modified object */ 3862 3863static 3864PyObject *fixup(PyUnicodeObject *self, 3865 int (*fixfct)(PyUnicodeObject *s)) 3866{ 3867 3868 PyUnicodeObject *u; 3869 3870 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3871 if (u == NULL) 3872 return NULL; 3873 3874 Py_UNICODE_COPY(u->str, self->str, self->length); 3875 3876 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3877 /* fixfct should return TRUE if it modified the buffer. If 3878 FALSE, return a reference to the original buffer instead 3879 (to save space, not time) */ 3880 Py_INCREF(self); 3881 Py_DECREF(u); 3882 return (PyObject*) self; 3883 } 3884 return (PyObject*) u; 3885} 3886 3887static 3888int fixupper(PyUnicodeObject *self) 3889{ 3890 int len = self->length; 3891 Py_UNICODE *s = self->str; 3892 int status = 0; 3893 3894 while (len-- > 0) { 3895 register Py_UNICODE ch; 3896 3897 ch = Py_UNICODE_TOUPPER(*s); 3898 if (ch != *s) { 3899 status = 1; 3900 *s = ch; 3901 } 3902 s++; 3903 } 3904 3905 return status; 3906} 3907 3908static 3909int fixlower(PyUnicodeObject *self) 3910{ 3911 int len = self->length; 3912 Py_UNICODE *s = self->str; 3913 int status = 0; 3914 3915 while (len-- > 0) { 3916 register Py_UNICODE ch; 3917 3918 ch = Py_UNICODE_TOLOWER(*s); 3919 if (ch != *s) { 3920 status = 1; 3921 *s = ch; 3922 } 3923 s++; 3924 } 3925 3926 return status; 3927} 3928 3929static 3930int fixswapcase(PyUnicodeObject *self) 3931{ 3932 int len = self->length; 3933 Py_UNICODE *s = self->str; 3934 int status = 0; 3935 3936 while (len-- > 0) { 3937 if (Py_UNICODE_ISUPPER(*s)) { 3938 *s = Py_UNICODE_TOLOWER(*s); 3939 status = 1; 3940 } else if (Py_UNICODE_ISLOWER(*s)) { 3941 *s = Py_UNICODE_TOUPPER(*s); 3942 status = 1; 3943 } 3944 s++; 3945 } 3946 3947 return status; 3948} 3949 3950static 3951int fixcapitalize(PyUnicodeObject *self) 3952{ 3953 int len = self->length; 3954 Py_UNICODE *s = self->str; 3955 int status = 0; 3956 3957 if (len == 0) 3958 return 0; 3959 if (Py_UNICODE_ISLOWER(*s)) { 3960 *s = Py_UNICODE_TOUPPER(*s); 3961 status = 1; 3962 } 3963 s++; 3964 while (--len > 0) { 3965 if (Py_UNICODE_ISUPPER(*s)) { 3966 *s = Py_UNICODE_TOLOWER(*s); 3967 status = 1; 3968 } 3969 s++; 3970 } 3971 return status; 3972} 3973 3974static 3975int fixtitle(PyUnicodeObject *self) 3976{ 3977 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3978 register Py_UNICODE *e; 3979 int previous_is_cased; 3980 3981 /* Shortcut for single character strings */ 3982 if (PyUnicode_GET_SIZE(self) == 1) { 3983 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3984 if (*p != ch) { 3985 *p = ch; 3986 return 1; 3987 } 3988 else 3989 return 0; 3990 } 3991 3992 e = p + PyUnicode_GET_SIZE(self); 3993 previous_is_cased = 0; 3994 for (; p < e; p++) { 3995 register const Py_UNICODE ch = *p; 3996 3997 if (previous_is_cased) 3998 *p = Py_UNICODE_TOLOWER(ch); 3999 else 4000 *p = Py_UNICODE_TOTITLE(ch); 4001 4002 if (Py_UNICODE_ISLOWER(ch) || 4003 Py_UNICODE_ISUPPER(ch) || 4004 Py_UNICODE_ISTITLE(ch)) 4005 previous_is_cased = 1; 4006 else 4007 previous_is_cased = 0; 4008 } 4009 return 1; 4010} 4011 4012PyObject * 4013PyUnicode_Join(PyObject *separator, PyObject *seq) 4014{ 4015 PyObject *internal_separator = NULL; 4016 const Py_UNICODE *sep; 4017 size_t seplen; 4018 PyUnicodeObject *res = NULL; /* the result */ 4019 size_t res_alloc = 100; /* # allocated bytes for string in res */ 4020 size_t res_used; /* # used bytes */ 4021 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ 4022 PyObject *fseq; /* PySequence_Fast(seq) */ 4023 int seqlen; /* len(fseq) -- number of items in sequence */ 4024 const Py_UNICODE blank = ' '; 4025 PyObject *item; 4026 int i; 4027 4028 fseq = PySequence_Fast(seq, ""); 4029 if (fseq == NULL) { 4030 if (PyErr_ExceptionMatches(PyExc_TypeError)) 4031 PyErr_Format(PyExc_TypeError, 4032 "sequence expected, %.80s found", 4033 seq->ob_type->tp_name); 4034 return NULL; 4035 } 4036 4037 /* Grrrr. A codec may be invoked to convert str objects to 4038 * Unicode, and so it's possible to call back into Python code 4039 * during PyUnicode_FromObject(), and so it's possible for a sick 4040 * codec to change the size of fseq (if seq is a list). Therefore 4041 * we have to keep refetching the size -- can't assume seqlen 4042 * is invariant. 4043 */ 4044 seqlen = PySequence_Fast_GET_SIZE(fseq); 4045 /* If empty sequence, return u"". */ 4046 if (seqlen == 0) { 4047 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 4048 goto Done; 4049 } 4050 /* If singleton sequence with an exact Unicode, return that. */ 4051 if (seqlen == 1) { 4052 item = PySequence_Fast_GET_ITEM(fseq, 0); 4053 if (PyUnicode_CheckExact(item)) { 4054 Py_INCREF(item); 4055 res = (PyUnicodeObject *)item; 4056 goto Done; 4057 } 4058 } 4059 4060 /* At least two items to join, or one that isn't exact Unicode. */ 4061 if (seqlen > 1) { 4062 /* Set up sep and seplen -- they're needed. */ 4063 if (separator == NULL) { 4064 sep = ␣ 4065 seplen = 1; 4066 } 4067 else { 4068 internal_separator = PyUnicode_FromObject(separator); 4069 if (internal_separator == NULL) 4070 goto onError; 4071 sep = PyUnicode_AS_UNICODE(internal_separator); 4072 seplen = PyUnicode_GET_SIZE(internal_separator); 4073 /* In case PyUnicode_FromObject() mutated seq. */ 4074 seqlen = PySequence_Fast_GET_SIZE(fseq); 4075 } 4076 } 4077 4078 /* Get space. */ 4079 res = _PyUnicode_New((int)res_alloc); 4080 if (res == NULL) 4081 goto onError; 4082 res_p = PyUnicode_AS_UNICODE(res); 4083 res_used = 0; 4084 4085 for (i = 0; i < seqlen; ++i) { 4086 size_t itemlen; 4087 size_t new_res_used; 4088 4089 item = PySequence_Fast_GET_ITEM(fseq, i); 4090 /* Convert item to Unicode. */ 4091 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { 4092 PyErr_Format(PyExc_TypeError, 4093 "sequence item %i: expected string or Unicode," 4094 " %.80s found", 4095 i, item->ob_type->tp_name); 4096 goto onError; 4097 } 4098 item = PyUnicode_FromObject(item); 4099 if (item == NULL) 4100 goto onError; 4101 /* We own a reference to item from here on. */ 4102 4103 /* In case PyUnicode_FromObject() mutated seq. */ 4104 seqlen = PySequence_Fast_GET_SIZE(fseq); 4105 4106 /* Make sure we have enough space for the separator and the item. */ 4107 itemlen = PyUnicode_GET_SIZE(item); 4108 new_res_used = res_used + itemlen; 4109 if (new_res_used < res_used || new_res_used > INT_MAX) 4110 goto Overflow; 4111 if (i < seqlen - 1) { 4112 new_res_used += seplen; 4113 if (new_res_used < res_used || new_res_used > INT_MAX) 4114 goto Overflow; 4115 } 4116 if (new_res_used > res_alloc) { 4117 /* double allocated size until it's big enough */ 4118 do { 4119 size_t oldsize = res_alloc; 4120 res_alloc += res_alloc; 4121 if (res_alloc < oldsize || res_alloc > INT_MAX) 4122 goto Overflow; 4123 } while (new_res_used > res_alloc); 4124 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) { 4125 Py_DECREF(item); 4126 goto onError; 4127 } 4128 res_p = PyUnicode_AS_UNICODE(res) + res_used; 4129 } 4130 4131 /* Copy item, and maybe the separator. */ 4132 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen); 4133 res_p += itemlen; 4134 if (i < seqlen - 1) { 4135 Py_UNICODE_COPY(res_p, sep, (int)seplen); 4136 res_p += seplen; 4137 } 4138 Py_DECREF(item); 4139 res_used = new_res_used; 4140 } 4141 4142 /* Shrink res to match the used area; this probably can't fail, 4143 * but it's cheap to check. 4144 */ 4145 if (_PyUnicode_Resize(&res, (int)res_used) < 0) 4146 goto onError; 4147 4148 Done: 4149 Py_XDECREF(internal_separator); 4150 Py_DECREF(fseq); 4151 return (PyObject *)res; 4152 4153 Overflow: 4154 PyErr_SetString(PyExc_OverflowError, 4155 "join() is too long for a Python string"); 4156 Py_DECREF(item); 4157 /* fall through */ 4158 4159 onError: 4160 Py_XDECREF(internal_separator); 4161 Py_DECREF(fseq); 4162 Py_XDECREF(res); 4163 return NULL; 4164} 4165 4166static 4167PyUnicodeObject *pad(PyUnicodeObject *self, 4168 int left, 4169 int right, 4170 Py_UNICODE fill) 4171{ 4172 PyUnicodeObject *u; 4173 4174 if (left < 0) 4175 left = 0; 4176 if (right < 0) 4177 right = 0; 4178 4179 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 4180 Py_INCREF(self); 4181 return self; 4182 } 4183 4184 u = _PyUnicode_New(left + self->length + right); 4185 if (u) { 4186 if (left) 4187 Py_UNICODE_FILL(u->str, fill, left); 4188 Py_UNICODE_COPY(u->str + left, self->str, self->length); 4189 if (right) 4190 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 4191 } 4192 4193 return u; 4194} 4195 4196#define SPLIT_APPEND(data, left, right) \ 4197 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 4198 if (!str) \ 4199 goto onError; \ 4200 if (PyList_Append(list, str)) { \ 4201 Py_DECREF(str); \ 4202 goto onError; \ 4203 } \ 4204 else \ 4205 Py_DECREF(str); 4206 4207#define SPLIT_INSERT(data, left, right) \ 4208 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 4209 if (!str) \ 4210 goto onError; \ 4211 if (PyList_Insert(list, 0, str)) { \ 4212 Py_DECREF(str); \ 4213 goto onError; \ 4214 } \ 4215 else \ 4216 Py_DECREF(str); 4217 4218static 4219PyObject *split_whitespace(PyUnicodeObject *self, 4220 PyObject *list, 4221 int maxcount) 4222{ 4223 register int i; 4224 register int j; 4225 int len = self->length; 4226 PyObject *str; 4227 4228 for (i = j = 0; i < len; ) { 4229 /* find a token */ 4230 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4231 i++; 4232 j = i; 4233 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 4234 i++; 4235 if (j < i) { 4236 if (maxcount-- <= 0) 4237 break; 4238 SPLIT_APPEND(self->str, j, i); 4239 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4240 i++; 4241 j = i; 4242 } 4243 } 4244 if (j < len) { 4245 SPLIT_APPEND(self->str, j, len); 4246 } 4247 return list; 4248 4249 onError: 4250 Py_DECREF(list); 4251 return NULL; 4252} 4253 4254PyObject *PyUnicode_Splitlines(PyObject *string, 4255 int keepends) 4256{ 4257 register int i; 4258 register int j; 4259 int len; 4260 PyObject *list; 4261 PyObject *str; 4262 Py_UNICODE *data; 4263 4264 string = PyUnicode_FromObject(string); 4265 if (string == NULL) 4266 return NULL; 4267 data = PyUnicode_AS_UNICODE(string); 4268 len = PyUnicode_GET_SIZE(string); 4269 4270 list = PyList_New(0); 4271 if (!list) 4272 goto onError; 4273 4274 for (i = j = 0; i < len; ) { 4275 int eol; 4276 4277 /* Find a line and append it */ 4278 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 4279 i++; 4280 4281 /* Skip the line break reading CRLF as one line break */ 4282 eol = i; 4283 if (i < len) { 4284 if (data[i] == '\r' && i + 1 < len && 4285 data[i+1] == '\n') 4286 i += 2; 4287 else 4288 i++; 4289 if (keepends) 4290 eol = i; 4291 } 4292 SPLIT_APPEND(data, j, eol); 4293 j = i; 4294 } 4295 if (j < len) { 4296 SPLIT_APPEND(data, j, len); 4297 } 4298 4299 Py_DECREF(string); 4300 return list; 4301 4302 onError: 4303 Py_DECREF(list); 4304 Py_DECREF(string); 4305 return NULL; 4306} 4307 4308static 4309PyObject *split_char(PyUnicodeObject *self, 4310 PyObject *list, 4311 Py_UNICODE ch, 4312 int maxcount) 4313{ 4314 register int i; 4315 register int j; 4316 int len = self->length; 4317 PyObject *str; 4318 4319 for (i = j = 0; i < len; ) { 4320 if (self->str[i] == ch) { 4321 if (maxcount-- <= 0) 4322 break; 4323 SPLIT_APPEND(self->str, j, i); 4324 i = j = i + 1; 4325 } else 4326 i++; 4327 } 4328 if (j <= len) { 4329 SPLIT_APPEND(self->str, j, len); 4330 } 4331 return list; 4332 4333 onError: 4334 Py_DECREF(list); 4335 return NULL; 4336} 4337 4338static 4339PyObject *split_substring(PyUnicodeObject *self, 4340 PyObject *list, 4341 PyUnicodeObject *substring, 4342 int maxcount) 4343{ 4344 register int i; 4345 register int j; 4346 int len = self->length; 4347 int sublen = substring->length; 4348 PyObject *str; 4349 4350 for (i = j = 0; i <= len - sublen; ) { 4351 if (Py_UNICODE_MATCH(self, i, substring)) { 4352 if (maxcount-- <= 0) 4353 break; 4354 SPLIT_APPEND(self->str, j, i); 4355 i = j = i + sublen; 4356 } else 4357 i++; 4358 } 4359 if (j <= len) { 4360 SPLIT_APPEND(self->str, j, len); 4361 } 4362 return list; 4363 4364 onError: 4365 Py_DECREF(list); 4366 return NULL; 4367} 4368 4369static 4370PyObject *rsplit_whitespace(PyUnicodeObject *self, 4371 PyObject *list, 4372 int maxcount) 4373{ 4374 register int i; 4375 register int j; 4376 int len = self->length; 4377 PyObject *str; 4378 4379 for (i = j = len - 1; i >= 0; ) { 4380 /* find a token */ 4381 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 4382 i--; 4383 j = i; 4384 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) 4385 i--; 4386 if (j > i) { 4387 if (maxcount-- <= 0) 4388 break; 4389 SPLIT_INSERT(self->str, i + 1, j + 1); 4390 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 4391 i--; 4392 j = i; 4393 } 4394 } 4395 if (j >= 0) { 4396 SPLIT_INSERT(self->str, 0, j + 1); 4397 } 4398 return list; 4399 4400 onError: 4401 Py_DECREF(list); 4402 return NULL; 4403} 4404 4405static 4406PyObject *rsplit_char(PyUnicodeObject *self, 4407 PyObject *list, 4408 Py_UNICODE ch, 4409 int maxcount) 4410{ 4411 register int i; 4412 register int j; 4413 int len = self->length; 4414 PyObject *str; 4415 4416 for (i = j = len - 1; i >= 0; ) { 4417 if (self->str[i] == ch) { 4418 if (maxcount-- <= 0) 4419 break; 4420 SPLIT_INSERT(self->str, i + 1, j + 1); 4421 j = i = i - 1; 4422 } else 4423 i--; 4424 } 4425 if (j >= -1) { 4426 SPLIT_INSERT(self->str, 0, j + 1); 4427 } 4428 return list; 4429 4430 onError: 4431 Py_DECREF(list); 4432 return NULL; 4433} 4434 4435static 4436PyObject *rsplit_substring(PyUnicodeObject *self, 4437 PyObject *list, 4438 PyUnicodeObject *substring, 4439 int maxcount) 4440{ 4441 register int i; 4442 register int j; 4443 int len = self->length; 4444 int sublen = substring->length; 4445 PyObject *str; 4446 4447 for (i = len - sublen, j = len; i >= 0; ) { 4448 if (Py_UNICODE_MATCH(self, i, substring)) { 4449 if (maxcount-- <= 0) 4450 break; 4451 SPLIT_INSERT(self->str, i + sublen, j); 4452 j = i; 4453 i -= sublen; 4454 } else 4455 i--; 4456 } 4457 if (j >= 0) { 4458 SPLIT_INSERT(self->str, 0, j); 4459 } 4460 return list; 4461 4462 onError: 4463 Py_DECREF(list); 4464 return NULL; 4465} 4466 4467#undef SPLIT_APPEND 4468#undef SPLIT_INSERT 4469 4470static 4471PyObject *split(PyUnicodeObject *self, 4472 PyUnicodeObject *substring, 4473 int maxcount) 4474{ 4475 PyObject *list; 4476 4477 if (maxcount < 0) 4478 maxcount = INT_MAX; 4479 4480 list = PyList_New(0); 4481 if (!list) 4482 return NULL; 4483 4484 if (substring == NULL) 4485 return split_whitespace(self,list,maxcount); 4486 4487 else if (substring->length == 1) 4488 return split_char(self,list,substring->str[0],maxcount); 4489 4490 else if (substring->length == 0) { 4491 Py_DECREF(list); 4492 PyErr_SetString(PyExc_ValueError, "empty separator"); 4493 return NULL; 4494 } 4495 else 4496 return split_substring(self,list,substring,maxcount); 4497} 4498 4499static 4500PyObject *rsplit(PyUnicodeObject *self, 4501 PyUnicodeObject *substring, 4502 int maxcount) 4503{ 4504 PyObject *list; 4505 4506 if (maxcount < 0) 4507 maxcount = INT_MAX; 4508 4509 list = PyList_New(0); 4510 if (!list) 4511 return NULL; 4512 4513 if (substring == NULL) 4514 return rsplit_whitespace(self,list,maxcount); 4515 4516 else if (substring->length == 1) 4517 return rsplit_char(self,list,substring->str[0],maxcount); 4518 4519 else if (substring->length == 0) { 4520 Py_DECREF(list); 4521 PyErr_SetString(PyExc_ValueError, "empty separator"); 4522 return NULL; 4523 } 4524 else 4525 return rsplit_substring(self,list,substring,maxcount); 4526} 4527 4528static 4529PyObject *replace(PyUnicodeObject *self, 4530 PyUnicodeObject *str1, 4531 PyUnicodeObject *str2, 4532 int maxcount) 4533{ 4534 PyUnicodeObject *u; 4535 4536 if (maxcount < 0) 4537 maxcount = INT_MAX; 4538 4539 if (str1->length == 1 && str2->length == 1) { 4540 int i; 4541 4542 /* replace characters */ 4543 if (!findchar(self->str, self->length, str1->str[0]) && 4544 PyUnicode_CheckExact(self)) { 4545 /* nothing to replace, return original string */ 4546 Py_INCREF(self); 4547 u = self; 4548 } else { 4549 Py_UNICODE u1 = str1->str[0]; 4550 Py_UNICODE u2 = str2->str[0]; 4551 4552 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 4553 NULL, 4554 self->length 4555 ); 4556 if (u != NULL) { 4557 Py_UNICODE_COPY(u->str, self->str, 4558 self->length); 4559 for (i = 0; i < u->length; i++) 4560 if (u->str[i] == u1) { 4561 if (--maxcount < 0) 4562 break; 4563 u->str[i] = u2; 4564 } 4565 } 4566 } 4567 4568 } else { 4569 int n, i; 4570 Py_UNICODE *p; 4571 4572 /* replace strings */ 4573 n = count(self, 0, self->length, str1); 4574 if (n > maxcount) 4575 n = maxcount; 4576 if (n == 0) { 4577 /* nothing to replace, return original string */ 4578 if (PyUnicode_CheckExact(self)) { 4579 Py_INCREF(self); 4580 u = self; 4581 } 4582 else { 4583 u = (PyUnicodeObject *) 4584 PyUnicode_FromUnicode(self->str, self->length); 4585 } 4586 } else { 4587 u = _PyUnicode_New( 4588 self->length + n * (str2->length - str1->length)); 4589 if (u) { 4590 i = 0; 4591 p = u->str; 4592 if (str1->length > 0) { 4593 while (i <= self->length - str1->length) 4594 if (Py_UNICODE_MATCH(self, i, str1)) { 4595 /* replace string segment */ 4596 Py_UNICODE_COPY(p, str2->str, str2->length); 4597 p += str2->length; 4598 i += str1->length; 4599 if (--n <= 0) { 4600 /* copy remaining part */ 4601 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4602 break; 4603 } 4604 } else 4605 *p++ = self->str[i++]; 4606 } else { 4607 while (n > 0) { 4608 Py_UNICODE_COPY(p, str2->str, str2->length); 4609 p += str2->length; 4610 if (--n <= 0) 4611 break; 4612 *p++ = self->str[i++]; 4613 } 4614 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4615 } 4616 } 4617 } 4618 } 4619 4620 return (PyObject *) u; 4621} 4622 4623/* --- Unicode Object Methods --------------------------------------------- */ 4624 4625PyDoc_STRVAR(title__doc__, 4626"S.title() -> unicode\n\ 4627\n\ 4628Return a titlecased version of S, i.e. words start with title case\n\ 4629characters, all remaining cased characters have lower case."); 4630 4631static PyObject* 4632unicode_title(PyUnicodeObject *self) 4633{ 4634 return fixup(self, fixtitle); 4635} 4636 4637PyDoc_STRVAR(capitalize__doc__, 4638"S.capitalize() -> unicode\n\ 4639\n\ 4640Return a capitalized version of S, i.e. make the first character\n\ 4641have upper case."); 4642 4643static PyObject* 4644unicode_capitalize(PyUnicodeObject *self) 4645{ 4646 return fixup(self, fixcapitalize); 4647} 4648 4649#if 0 4650PyDoc_STRVAR(capwords__doc__, 4651"S.capwords() -> unicode\n\ 4652\n\ 4653Apply .capitalize() to all words in S and return the result with\n\ 4654normalized whitespace (all whitespace strings are replaced by ' ')."); 4655 4656static PyObject* 4657unicode_capwords(PyUnicodeObject *self) 4658{ 4659 PyObject *list; 4660 PyObject *item; 4661 int i; 4662 4663 /* Split into words */ 4664 list = split(self, NULL, -1); 4665 if (!list) 4666 return NULL; 4667 4668 /* Capitalize each word */ 4669 for (i = 0; i < PyList_GET_SIZE(list); i++) { 4670 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 4671 fixcapitalize); 4672 if (item == NULL) 4673 goto onError; 4674 Py_DECREF(PyList_GET_ITEM(list, i)); 4675 PyList_SET_ITEM(list, i, item); 4676 } 4677 4678 /* Join the words to form a new string */ 4679 item = PyUnicode_Join(NULL, list); 4680 4681onError: 4682 Py_DECREF(list); 4683 return (PyObject *)item; 4684} 4685#endif 4686 4687/* Argument converter. Coerces to a single unicode character */ 4688 4689static int 4690convert_uc(PyObject *obj, void *addr) 4691{ 4692 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 4693 PyObject *uniobj; 4694 Py_UNICODE *unistr; 4695 4696 uniobj = PyUnicode_FromObject(obj); 4697 if (uniobj == NULL) { 4698 PyErr_SetString(PyExc_TypeError, 4699 "The fill character cannot be converted to Unicode"); 4700 return 0; 4701 } 4702 if (PyUnicode_GET_SIZE(uniobj) != 1) { 4703 PyErr_SetString(PyExc_TypeError, 4704 "The fill character must be exactly one character long"); 4705 Py_DECREF(uniobj); 4706 return 0; 4707 } 4708 unistr = PyUnicode_AS_UNICODE(uniobj); 4709 *fillcharloc = unistr[0]; 4710 Py_DECREF(uniobj); 4711 return 1; 4712} 4713 4714PyDoc_STRVAR(center__doc__, 4715"S.center(width[, fillchar]) -> unicode\n\ 4716\n\ 4717Return S centered in a Unicode string of length width. Padding is\n\ 4718done using the specified fill character (default is a space)"); 4719 4720static PyObject * 4721unicode_center(PyUnicodeObject *self, PyObject *args) 4722{ 4723 int marg, left; 4724 int width; 4725 Py_UNICODE fillchar = ' '; 4726 4727 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar)) 4728 return NULL; 4729 4730 if (self->length >= width && PyUnicode_CheckExact(self)) { 4731 Py_INCREF(self); 4732 return (PyObject*) self; 4733 } 4734 4735 marg = width - self->length; 4736 left = marg / 2 + (marg & width & 1); 4737 4738 return (PyObject*) pad(self, left, marg - left, fillchar); 4739} 4740 4741#if 0 4742 4743/* This code should go into some future Unicode collation support 4744 module. The basic comparison should compare ordinals on a naive 4745 basis (this is what Java does and thus JPython too). */ 4746 4747/* speedy UTF-16 code point order comparison */ 4748/* gleaned from: */ 4749/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 4750 4751static short utf16Fixup[32] = 4752{ 4753 0, 0, 0, 0, 0, 0, 0, 0, 4754 0, 0, 0, 0, 0, 0, 0, 0, 4755 0, 0, 0, 0, 0, 0, 0, 0, 4756 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 4757}; 4758 4759static int 4760unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4761{ 4762 int len1, len2; 4763 4764 Py_UNICODE *s1 = str1->str; 4765 Py_UNICODE *s2 = str2->str; 4766 4767 len1 = str1->length; 4768 len2 = str2->length; 4769 4770 while (len1 > 0 && len2 > 0) { 4771 Py_UNICODE c1, c2; 4772 4773 c1 = *s1++; 4774 c2 = *s2++; 4775 4776 if (c1 > (1<<11) * 26) 4777 c1 += utf16Fixup[c1>>11]; 4778 if (c2 > (1<<11) * 26) 4779 c2 += utf16Fixup[c2>>11]; 4780 /* now c1 and c2 are in UTF-32-compatible order */ 4781 4782 if (c1 != c2) 4783 return (c1 < c2) ? -1 : 1; 4784 4785 len1--; len2--; 4786 } 4787 4788 return (len1 < len2) ? -1 : (len1 != len2); 4789} 4790 4791#else 4792 4793static int 4794unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4795{ 4796 register int len1, len2; 4797 4798 Py_UNICODE *s1 = str1->str; 4799 Py_UNICODE *s2 = str2->str; 4800 4801 len1 = str1->length; 4802 len2 = str2->length; 4803 4804 while (len1 > 0 && len2 > 0) { 4805 Py_UNICODE c1, c2; 4806 4807 c1 = *s1++; 4808 c2 = *s2++; 4809 4810 if (c1 != c2) 4811 return (c1 < c2) ? -1 : 1; 4812 4813 len1--; len2--; 4814 } 4815 4816 return (len1 < len2) ? -1 : (len1 != len2); 4817} 4818 4819#endif 4820 4821int PyUnicode_Compare(PyObject *left, 4822 PyObject *right) 4823{ 4824 PyUnicodeObject *u = NULL, *v = NULL; 4825 int result; 4826 4827 /* Coerce the two arguments */ 4828 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4829 if (u == NULL) 4830 goto onError; 4831 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4832 if (v == NULL) 4833 goto onError; 4834 4835 /* Shortcut for empty or interned objects */ 4836 if (v == u) { 4837 Py_DECREF(u); 4838 Py_DECREF(v); 4839 return 0; 4840 } 4841 4842 result = unicode_compare(u, v); 4843 4844 Py_DECREF(u); 4845 Py_DECREF(v); 4846 return result; 4847 4848onError: 4849 Py_XDECREF(u); 4850 Py_XDECREF(v); 4851 return -1; 4852} 4853 4854int PyUnicode_Contains(PyObject *container, 4855 PyObject *element) 4856{ 4857 PyUnicodeObject *u = NULL, *v = NULL; 4858 int result, size; 4859 register const Py_UNICODE *lhs, *end, *rhs; 4860 4861 /* Coerce the two arguments */ 4862 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 4863 if (v == NULL) { 4864 PyErr_SetString(PyExc_TypeError, 4865 "'in <string>' requires string as left operand"); 4866 goto onError; 4867 } 4868 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 4869 if (u == NULL) 4870 goto onError; 4871 4872 size = PyUnicode_GET_SIZE(v); 4873 rhs = PyUnicode_AS_UNICODE(v); 4874 lhs = PyUnicode_AS_UNICODE(u); 4875 4876 result = 0; 4877 if (size == 1) { 4878 end = lhs + PyUnicode_GET_SIZE(u); 4879 while (lhs < end) { 4880 if (*lhs++ == *rhs) { 4881 result = 1; 4882 break; 4883 } 4884 } 4885 } 4886 else { 4887 end = lhs + (PyUnicode_GET_SIZE(u) - size); 4888 while (lhs <= end) { 4889 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) { 4890 result = 1; 4891 break; 4892 } 4893 } 4894 } 4895 4896 Py_DECREF(u); 4897 Py_DECREF(v); 4898 return result; 4899 4900onError: 4901 Py_XDECREF(u); 4902 Py_XDECREF(v); 4903 return -1; 4904} 4905 4906/* Concat to string or Unicode object giving a new Unicode object. */ 4907 4908PyObject *PyUnicode_Concat(PyObject *left, 4909 PyObject *right) 4910{ 4911 PyUnicodeObject *u = NULL, *v = NULL, *w; 4912 4913 /* Coerce the two arguments */ 4914 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4915 if (u == NULL) 4916 goto onError; 4917 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4918 if (v == NULL) 4919 goto onError; 4920 4921 /* Shortcuts */ 4922 if (v == unicode_empty) { 4923 Py_DECREF(v); 4924 return (PyObject *)u; 4925 } 4926 if (u == unicode_empty) { 4927 Py_DECREF(u); 4928 return (PyObject *)v; 4929 } 4930 4931 /* Concat the two Unicode strings */ 4932 w = _PyUnicode_New(u->length + v->length); 4933 if (w == NULL) 4934 goto onError; 4935 Py_UNICODE_COPY(w->str, u->str, u->length); 4936 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 4937 4938 Py_DECREF(u); 4939 Py_DECREF(v); 4940 return (PyObject *)w; 4941 4942onError: 4943 Py_XDECREF(u); 4944 Py_XDECREF(v); 4945 return NULL; 4946} 4947 4948PyDoc_STRVAR(count__doc__, 4949"S.count(sub[, start[, end]]) -> int\n\ 4950\n\ 4951Return the number of occurrences of substring sub in Unicode string\n\ 4952S[start:end]. Optional arguments start and end are\n\ 4953interpreted as in slice notation."); 4954 4955static PyObject * 4956unicode_count(PyUnicodeObject *self, PyObject *args) 4957{ 4958 PyUnicodeObject *substring; 4959 int start = 0; 4960 int end = INT_MAX; 4961 PyObject *result; 4962 4963 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 4964 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4965 return NULL; 4966 4967 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4968 (PyObject *)substring); 4969 if (substring == NULL) 4970 return NULL; 4971 4972 if (start < 0) 4973 start += self->length; 4974 if (start < 0) 4975 start = 0; 4976 if (end > self->length) 4977 end = self->length; 4978 if (end < 0) 4979 end += self->length; 4980 if (end < 0) 4981 end = 0; 4982 4983 result = PyInt_FromLong((long) count(self, start, end, substring)); 4984 4985 Py_DECREF(substring); 4986 return result; 4987} 4988 4989PyDoc_STRVAR(encode__doc__, 4990"S.encode([encoding[,errors]]) -> string or unicode\n\ 4991\n\ 4992Encodes S using the codec registered for encoding. encoding defaults\n\ 4993to the default encoding. errors may be given to set a different error\n\ 4994handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 4995a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 4996'xmlcharrefreplace' as well as any other name registered with\n\ 4997codecs.register_error that can handle UnicodeEncodeErrors."); 4998 4999static PyObject * 5000unicode_encode(PyUnicodeObject *self, PyObject *args) 5001{ 5002 char *encoding = NULL; 5003 char *errors = NULL; 5004 PyObject *v; 5005 5006 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 5007 return NULL; 5008 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 5009 if (v == NULL) 5010 goto onError; 5011 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 5012 PyErr_Format(PyExc_TypeError, 5013 "encoder did not return a string/unicode object " 5014 "(type=%.400s)", 5015 v->ob_type->tp_name); 5016 Py_DECREF(v); 5017 return NULL; 5018 } 5019 return v; 5020 5021 onError: 5022 return NULL; 5023} 5024 5025PyDoc_STRVAR(decode__doc__, 5026"S.decode([encoding[,errors]]) -> string or unicode\n\ 5027\n\ 5028Decodes S using the codec registered for encoding. encoding defaults\n\ 5029to the default encoding. errors may be given to set a different error\n\ 5030handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 5031a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ 5032as well as any other name registerd with codecs.register_error that is\n\ 5033able to handle UnicodeDecodeErrors."); 5034 5035static PyObject * 5036unicode_decode(PyUnicodeObject *self, PyObject *args) 5037{ 5038 char *encoding = NULL; 5039 char *errors = NULL; 5040 PyObject *v; 5041 5042 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) 5043 return NULL; 5044 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); 5045 if (v == NULL) 5046 goto onError; 5047 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 5048 PyErr_Format(PyExc_TypeError, 5049 "decoder did not return a string/unicode object " 5050 "(type=%.400s)", 5051 v->ob_type->tp_name); 5052 Py_DECREF(v); 5053 return NULL; 5054 } 5055 return v; 5056 5057 onError: 5058 return NULL; 5059} 5060 5061PyDoc_STRVAR(expandtabs__doc__, 5062"S.expandtabs([tabsize]) -> unicode\n\ 5063\n\ 5064Return a copy of S where all tab characters are expanded using spaces.\n\ 5065If tabsize is not given, a tab size of 8 characters is assumed."); 5066 5067static PyObject* 5068unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 5069{ 5070 Py_UNICODE *e; 5071 Py_UNICODE *p; 5072 Py_UNICODE *q; 5073 int i, j; 5074 PyUnicodeObject *u; 5075 int tabsize = 8; 5076 5077 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 5078 return NULL; 5079 5080 /* First pass: determine size of output string */ 5081 i = j = 0; 5082 e = self->str + self->length; 5083 for (p = self->str; p < e; p++) 5084 if (*p == '\t') { 5085 if (tabsize > 0) 5086 j += tabsize - (j % tabsize); 5087 } 5088 else { 5089 j++; 5090 if (*p == '\n' || *p == '\r') { 5091 i += j; 5092 j = 0; 5093 } 5094 } 5095 5096 /* Second pass: create output string and fill it */ 5097 u = _PyUnicode_New(i + j); 5098 if (!u) 5099 return NULL; 5100 5101 j = 0; 5102 q = u->str; 5103 5104 for (p = self->str; p < e; p++) 5105 if (*p == '\t') { 5106 if (tabsize > 0) { 5107 i = tabsize - (j % tabsize); 5108 j += i; 5109 while (i--) 5110 *q++ = ' '; 5111 } 5112 } 5113 else { 5114 j++; 5115 *q++ = *p; 5116 if (*p == '\n' || *p == '\r') 5117 j = 0; 5118 } 5119 5120 return (PyObject*) u; 5121} 5122 5123PyDoc_STRVAR(find__doc__, 5124"S.find(sub [,start [,end]]) -> int\n\ 5125\n\ 5126Return the lowest index in S where substring sub is found,\n\ 5127such that sub is contained within s[start,end]. Optional\n\ 5128arguments start and end are interpreted as in slice notation.\n\ 5129\n\ 5130Return -1 on failure."); 5131 5132static PyObject * 5133unicode_find(PyUnicodeObject *self, PyObject *args) 5134{ 5135 PyUnicodeObject *substring; 5136 int start = 0; 5137 int end = INT_MAX; 5138 PyObject *result; 5139 5140 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 5141 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5142 return NULL; 5143 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5144 (PyObject *)substring); 5145 if (substring == NULL) 5146 return NULL; 5147 5148 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 5149 5150 Py_DECREF(substring); 5151 return result; 5152} 5153 5154static PyObject * 5155unicode_getitem(PyUnicodeObject *self, int index) 5156{ 5157 if (index < 0 || index >= self->length) { 5158 PyErr_SetString(PyExc_IndexError, "string index out of range"); 5159 return NULL; 5160 } 5161 5162 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 5163} 5164 5165static long 5166unicode_hash(PyUnicodeObject *self) 5167{ 5168 /* Since Unicode objects compare equal to their ASCII string 5169 counterparts, they should use the individual character values 5170 as basis for their hash value. This is needed to assure that 5171 strings and Unicode objects behave in the same way as 5172 dictionary keys. */ 5173 5174 register int len; 5175 register Py_UNICODE *p; 5176 register long x; 5177 5178 if (self->hash != -1) 5179 return self->hash; 5180 len = PyUnicode_GET_SIZE(self); 5181 p = PyUnicode_AS_UNICODE(self); 5182 x = *p << 7; 5183 while (--len >= 0) 5184 x = (1000003*x) ^ *p++; 5185 x ^= PyUnicode_GET_SIZE(self); 5186 if (x == -1) 5187 x = -2; 5188 self->hash = x; 5189 return x; 5190} 5191 5192PyDoc_STRVAR(index__doc__, 5193"S.index(sub [,start [,end]]) -> int\n\ 5194\n\ 5195Like S.find() but raise ValueError when the substring is not found."); 5196 5197static PyObject * 5198unicode_index(PyUnicodeObject *self, PyObject *args) 5199{ 5200 int result; 5201 PyUnicodeObject *substring; 5202 int start = 0; 5203 int end = INT_MAX; 5204 5205 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 5206 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5207 return NULL; 5208 5209 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5210 (PyObject *)substring); 5211 if (substring == NULL) 5212 return NULL; 5213 5214 result = findstring(self, substring, start, end, 1); 5215 5216 Py_DECREF(substring); 5217 if (result < 0) { 5218 PyErr_SetString(PyExc_ValueError, "substring not found"); 5219 return NULL; 5220 } 5221 return PyInt_FromLong(result); 5222} 5223 5224PyDoc_STRVAR(islower__doc__, 5225"S.islower() -> bool\n\ 5226\n\ 5227Return True if all cased characters in S are lowercase and there is\n\ 5228at least one cased character in S, False otherwise."); 5229 5230static PyObject* 5231unicode_islower(PyUnicodeObject *self) 5232{ 5233 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5234 register const Py_UNICODE *e; 5235 int cased; 5236 5237 /* Shortcut for single character strings */ 5238 if (PyUnicode_GET_SIZE(self) == 1) 5239 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 5240 5241 /* Special case for empty strings */ 5242 if (PyString_GET_SIZE(self) == 0) 5243 return PyBool_FromLong(0); 5244 5245 e = p + PyUnicode_GET_SIZE(self); 5246 cased = 0; 5247 for (; p < e; p++) { 5248 register const Py_UNICODE ch = *p; 5249 5250 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 5251 return PyBool_FromLong(0); 5252 else if (!cased && Py_UNICODE_ISLOWER(ch)) 5253 cased = 1; 5254 } 5255 return PyBool_FromLong(cased); 5256} 5257 5258PyDoc_STRVAR(isupper__doc__, 5259"S.isupper() -> bool\n\ 5260\n\ 5261Return True if all cased characters in S are uppercase and there is\n\ 5262at least one cased character in S, False otherwise."); 5263 5264static PyObject* 5265unicode_isupper(PyUnicodeObject *self) 5266{ 5267 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5268 register const Py_UNICODE *e; 5269 int cased; 5270 5271 /* Shortcut for single character strings */ 5272 if (PyUnicode_GET_SIZE(self) == 1) 5273 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 5274 5275 /* Special case for empty strings */ 5276 if (PyString_GET_SIZE(self) == 0) 5277 return PyBool_FromLong(0); 5278 5279 e = p + PyUnicode_GET_SIZE(self); 5280 cased = 0; 5281 for (; p < e; p++) { 5282 register const Py_UNICODE ch = *p; 5283 5284 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 5285 return PyBool_FromLong(0); 5286 else if (!cased && Py_UNICODE_ISUPPER(ch)) 5287 cased = 1; 5288 } 5289 return PyBool_FromLong(cased); 5290} 5291 5292PyDoc_STRVAR(istitle__doc__, 5293"S.istitle() -> bool\n\ 5294\n\ 5295Return True if S is a titlecased string and there is at least one\n\ 5296character in S, i.e. upper- and titlecase characters may only\n\ 5297follow uncased characters and lowercase characters only cased ones.\n\ 5298Return False otherwise."); 5299 5300static PyObject* 5301unicode_istitle(PyUnicodeObject *self) 5302{ 5303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5304 register const Py_UNICODE *e; 5305 int cased, previous_is_cased; 5306 5307 /* Shortcut for single character strings */ 5308 if (PyUnicode_GET_SIZE(self) == 1) 5309 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 5310 (Py_UNICODE_ISUPPER(*p) != 0)); 5311 5312 /* Special case for empty strings */ 5313 if (PyString_GET_SIZE(self) == 0) 5314 return PyBool_FromLong(0); 5315 5316 e = p + PyUnicode_GET_SIZE(self); 5317 cased = 0; 5318 previous_is_cased = 0; 5319 for (; p < e; p++) { 5320 register const Py_UNICODE ch = *p; 5321 5322 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 5323 if (previous_is_cased) 5324 return PyBool_FromLong(0); 5325 previous_is_cased = 1; 5326 cased = 1; 5327 } 5328 else if (Py_UNICODE_ISLOWER(ch)) { 5329 if (!previous_is_cased) 5330 return PyBool_FromLong(0); 5331 previous_is_cased = 1; 5332 cased = 1; 5333 } 5334 else 5335 previous_is_cased = 0; 5336 } 5337 return PyBool_FromLong(cased); 5338} 5339 5340PyDoc_STRVAR(isspace__doc__, 5341"S.isspace() -> bool\n\ 5342\n\ 5343Return True if all characters in S are whitespace\n\ 5344and there is at least one character in S, False otherwise."); 5345 5346static PyObject* 5347unicode_isspace(PyUnicodeObject *self) 5348{ 5349 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5350 register const Py_UNICODE *e; 5351 5352 /* Shortcut for single character strings */ 5353 if (PyUnicode_GET_SIZE(self) == 1 && 5354 Py_UNICODE_ISSPACE(*p)) 5355 return PyBool_FromLong(1); 5356 5357 /* Special case for empty strings */ 5358 if (PyString_GET_SIZE(self) == 0) 5359 return PyBool_FromLong(0); 5360 5361 e = p + PyUnicode_GET_SIZE(self); 5362 for (; p < e; p++) { 5363 if (!Py_UNICODE_ISSPACE(*p)) 5364 return PyBool_FromLong(0); 5365 } 5366 return PyBool_FromLong(1); 5367} 5368 5369PyDoc_STRVAR(isalpha__doc__, 5370"S.isalpha() -> bool\n\ 5371\n\ 5372Return True if all characters in S are alphabetic\n\ 5373and there is at least one character in S, False otherwise."); 5374 5375static PyObject* 5376unicode_isalpha(PyUnicodeObject *self) 5377{ 5378 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5379 register const Py_UNICODE *e; 5380 5381 /* Shortcut for single character strings */ 5382 if (PyUnicode_GET_SIZE(self) == 1 && 5383 Py_UNICODE_ISALPHA(*p)) 5384 return PyBool_FromLong(1); 5385 5386 /* Special case for empty strings */ 5387 if (PyString_GET_SIZE(self) == 0) 5388 return PyBool_FromLong(0); 5389 5390 e = p + PyUnicode_GET_SIZE(self); 5391 for (; p < e; p++) { 5392 if (!Py_UNICODE_ISALPHA(*p)) 5393 return PyBool_FromLong(0); 5394 } 5395 return PyBool_FromLong(1); 5396} 5397 5398PyDoc_STRVAR(isalnum__doc__, 5399"S.isalnum() -> bool\n\ 5400\n\ 5401Return True if all characters in S are alphanumeric\n\ 5402and there is at least one character in S, False otherwise."); 5403 5404static PyObject* 5405unicode_isalnum(PyUnicodeObject *self) 5406{ 5407 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5408 register const Py_UNICODE *e; 5409 5410 /* Shortcut for single character strings */ 5411 if (PyUnicode_GET_SIZE(self) == 1 && 5412 Py_UNICODE_ISALNUM(*p)) 5413 return PyBool_FromLong(1); 5414 5415 /* Special case for empty strings */ 5416 if (PyString_GET_SIZE(self) == 0) 5417 return PyBool_FromLong(0); 5418 5419 e = p + PyUnicode_GET_SIZE(self); 5420 for (; p < e; p++) { 5421 if (!Py_UNICODE_ISALNUM(*p)) 5422 return PyBool_FromLong(0); 5423 } 5424 return PyBool_FromLong(1); 5425} 5426 5427PyDoc_STRVAR(isdecimal__doc__, 5428"S.isdecimal() -> bool\n\ 5429\n\ 5430Return True if there are only decimal characters in S,\n\ 5431False otherwise."); 5432 5433static PyObject* 5434unicode_isdecimal(PyUnicodeObject *self) 5435{ 5436 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5437 register const Py_UNICODE *e; 5438 5439 /* Shortcut for single character strings */ 5440 if (PyUnicode_GET_SIZE(self) == 1 && 5441 Py_UNICODE_ISDECIMAL(*p)) 5442 return PyBool_FromLong(1); 5443 5444 /* Special case for empty strings */ 5445 if (PyString_GET_SIZE(self) == 0) 5446 return PyBool_FromLong(0); 5447 5448 e = p + PyUnicode_GET_SIZE(self); 5449 for (; p < e; p++) { 5450 if (!Py_UNICODE_ISDECIMAL(*p)) 5451 return PyBool_FromLong(0); 5452 } 5453 return PyBool_FromLong(1); 5454} 5455 5456PyDoc_STRVAR(isdigit__doc__, 5457"S.isdigit() -> bool\n\ 5458\n\ 5459Return True if all characters in S are digits\n\ 5460and there is at least one character in S, False otherwise."); 5461 5462static PyObject* 5463unicode_isdigit(PyUnicodeObject *self) 5464{ 5465 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5466 register const Py_UNICODE *e; 5467 5468 /* Shortcut for single character strings */ 5469 if (PyUnicode_GET_SIZE(self) == 1 && 5470 Py_UNICODE_ISDIGIT(*p)) 5471 return PyBool_FromLong(1); 5472 5473 /* Special case for empty strings */ 5474 if (PyString_GET_SIZE(self) == 0) 5475 return PyBool_FromLong(0); 5476 5477 e = p + PyUnicode_GET_SIZE(self); 5478 for (; p < e; p++) { 5479 if (!Py_UNICODE_ISDIGIT(*p)) 5480 return PyBool_FromLong(0); 5481 } 5482 return PyBool_FromLong(1); 5483} 5484 5485PyDoc_STRVAR(isnumeric__doc__, 5486"S.isnumeric() -> bool\n\ 5487\n\ 5488Return True if there are only numeric characters in S,\n\ 5489False otherwise."); 5490 5491static PyObject* 5492unicode_isnumeric(PyUnicodeObject *self) 5493{ 5494 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5495 register const Py_UNICODE *e; 5496 5497 /* Shortcut for single character strings */ 5498 if (PyUnicode_GET_SIZE(self) == 1 && 5499 Py_UNICODE_ISNUMERIC(*p)) 5500 return PyBool_FromLong(1); 5501 5502 /* Special case for empty strings */ 5503 if (PyString_GET_SIZE(self) == 0) 5504 return PyBool_FromLong(0); 5505 5506 e = p + PyUnicode_GET_SIZE(self); 5507 for (; p < e; p++) { 5508 if (!Py_UNICODE_ISNUMERIC(*p)) 5509 return PyBool_FromLong(0); 5510 } 5511 return PyBool_FromLong(1); 5512} 5513 5514PyDoc_STRVAR(join__doc__, 5515"S.join(sequence) -> unicode\n\ 5516\n\ 5517Return a string which is the concatenation of the strings in the\n\ 5518sequence. The separator between elements is S."); 5519 5520static PyObject* 5521unicode_join(PyObject *self, PyObject *data) 5522{ 5523 return PyUnicode_Join(self, data); 5524} 5525 5526static int 5527unicode_length(PyUnicodeObject *self) 5528{ 5529 return self->length; 5530} 5531 5532PyDoc_STRVAR(ljust__doc__, 5533"S.ljust(width[, fillchar]) -> int\n\ 5534\n\ 5535Return S left justified in a Unicode string of length width. Padding is\n\ 5536done using the specified fill character (default is a space)."); 5537 5538static PyObject * 5539unicode_ljust(PyUnicodeObject *self, PyObject *args) 5540{ 5541 int width; 5542 Py_UNICODE fillchar = ' '; 5543 5544 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar)) 5545 return NULL; 5546 5547 if (self->length >= width && PyUnicode_CheckExact(self)) { 5548 Py_INCREF(self); 5549 return (PyObject*) self; 5550 } 5551 5552 return (PyObject*) pad(self, 0, width - self->length, fillchar); 5553} 5554 5555PyDoc_STRVAR(lower__doc__, 5556"S.lower() -> unicode\n\ 5557\n\ 5558Return a copy of the string S converted to lowercase."); 5559 5560static PyObject* 5561unicode_lower(PyUnicodeObject *self) 5562{ 5563 return fixup(self, fixlower); 5564} 5565 5566#define LEFTSTRIP 0 5567#define RIGHTSTRIP 1 5568#define BOTHSTRIP 2 5569 5570/* Arrays indexed by above */ 5571static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 5572 5573#define STRIPNAME(i) (stripformat[i]+3) 5574 5575static const Py_UNICODE * 5576unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) 5577{ 5578 size_t i; 5579 for (i = 0; i < n; ++i) 5580 if (s[i] == c) 5581 return s+i; 5582 return NULL; 5583} 5584 5585/* externally visible for str.strip(unicode) */ 5586PyObject * 5587_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 5588{ 5589 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5590 int len = PyUnicode_GET_SIZE(self); 5591 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 5592 int seplen = PyUnicode_GET_SIZE(sepobj); 5593 int i, j; 5594 5595 i = 0; 5596 if (striptype != RIGHTSTRIP) { 5597 while (i < len && unicode_memchr(sep, s[i], seplen)) { 5598 i++; 5599 } 5600 } 5601 5602 j = len; 5603 if (striptype != LEFTSTRIP) { 5604 do { 5605 j--; 5606 } while (j >= i && unicode_memchr(sep, s[j], seplen)); 5607 j++; 5608 } 5609 5610 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5611 Py_INCREF(self); 5612 return (PyObject*)self; 5613 } 5614 else 5615 return PyUnicode_FromUnicode(s+i, j-i); 5616} 5617 5618 5619static PyObject * 5620do_strip(PyUnicodeObject *self, int striptype) 5621{ 5622 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5623 int len = PyUnicode_GET_SIZE(self), i, j; 5624 5625 i = 0; 5626 if (striptype != RIGHTSTRIP) { 5627 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 5628 i++; 5629 } 5630 } 5631 5632 j = len; 5633 if (striptype != LEFTSTRIP) { 5634 do { 5635 j--; 5636 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 5637 j++; 5638 } 5639 5640 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5641 Py_INCREF(self); 5642 return (PyObject*)self; 5643 } 5644 else 5645 return PyUnicode_FromUnicode(s+i, j-i); 5646} 5647 5648 5649static PyObject * 5650do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 5651{ 5652 PyObject *sep = NULL; 5653 5654 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 5655 return NULL; 5656 5657 if (sep != NULL && sep != Py_None) { 5658 if (PyUnicode_Check(sep)) 5659 return _PyUnicode_XStrip(self, striptype, sep); 5660 else if (PyString_Check(sep)) { 5661 PyObject *res; 5662 sep = PyUnicode_FromObject(sep); 5663 if (sep==NULL) 5664 return NULL; 5665 res = _PyUnicode_XStrip(self, striptype, sep); 5666 Py_DECREF(sep); 5667 return res; 5668 } 5669 else { 5670 PyErr_Format(PyExc_TypeError, 5671 "%s arg must be None, unicode or str", 5672 STRIPNAME(striptype)); 5673 return NULL; 5674 } 5675 } 5676 5677 return do_strip(self, striptype); 5678} 5679 5680 5681PyDoc_STRVAR(strip__doc__, 5682"S.strip([chars]) -> unicode\n\ 5683\n\ 5684Return a copy of the string S with leading and trailing\n\ 5685whitespace removed.\n\ 5686If chars is given and not None, remove characters in chars instead.\n\ 5687If chars is a str, it will be converted to unicode before stripping"); 5688 5689static PyObject * 5690unicode_strip(PyUnicodeObject *self, PyObject *args) 5691{ 5692 if (PyTuple_GET_SIZE(args) == 0) 5693 return do_strip(self, BOTHSTRIP); /* Common case */ 5694 else 5695 return do_argstrip(self, BOTHSTRIP, args); 5696} 5697 5698 5699PyDoc_STRVAR(lstrip__doc__, 5700"S.lstrip([chars]) -> unicode\n\ 5701\n\ 5702Return a copy of the string S with leading whitespace removed.\n\ 5703If chars is given and not None, remove characters in chars instead.\n\ 5704If chars is a str, it will be converted to unicode before stripping"); 5705 5706static PyObject * 5707unicode_lstrip(PyUnicodeObject *self, PyObject *args) 5708{ 5709 if (PyTuple_GET_SIZE(args) == 0) 5710 return do_strip(self, LEFTSTRIP); /* Common case */ 5711 else 5712 return do_argstrip(self, LEFTSTRIP, args); 5713} 5714 5715 5716PyDoc_STRVAR(rstrip__doc__, 5717"S.rstrip([chars]) -> unicode\n\ 5718\n\ 5719Return a copy of the string S with trailing whitespace removed.\n\ 5720If chars is given and not None, remove characters in chars instead.\n\ 5721If chars is a str, it will be converted to unicode before stripping"); 5722 5723static PyObject * 5724unicode_rstrip(PyUnicodeObject *self, PyObject *args) 5725{ 5726 if (PyTuple_GET_SIZE(args) == 0) 5727 return do_strip(self, RIGHTSTRIP); /* Common case */ 5728 else 5729 return do_argstrip(self, RIGHTSTRIP, args); 5730} 5731 5732 5733static PyObject* 5734unicode_repeat(PyUnicodeObject *str, int len) 5735{ 5736 PyUnicodeObject *u; 5737 Py_UNICODE *p; 5738 int nchars; 5739 size_t nbytes; 5740 5741 if (len < 0) 5742 len = 0; 5743 5744 if (len == 1 && PyUnicode_CheckExact(str)) { 5745 /* no repeat, return original string */ 5746 Py_INCREF(str); 5747 return (PyObject*) str; 5748 } 5749 5750 /* ensure # of chars needed doesn't overflow int and # of bytes 5751 * needed doesn't overflow size_t 5752 */ 5753 nchars = len * str->length; 5754 if (len && nchars / len != str->length) { 5755 PyErr_SetString(PyExc_OverflowError, 5756 "repeated string is too long"); 5757 return NULL; 5758 } 5759 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 5760 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 5761 PyErr_SetString(PyExc_OverflowError, 5762 "repeated string is too long"); 5763 return NULL; 5764 } 5765 u = _PyUnicode_New(nchars); 5766 if (!u) 5767 return NULL; 5768 5769 p = u->str; 5770 5771 while (len-- > 0) { 5772 Py_UNICODE_COPY(p, str->str, str->length); 5773 p += str->length; 5774 } 5775 5776 return (PyObject*) u; 5777} 5778 5779PyObject *PyUnicode_Replace(PyObject *obj, 5780 PyObject *subobj, 5781 PyObject *replobj, 5782 int maxcount) 5783{ 5784 PyObject *self; 5785 PyObject *str1; 5786 PyObject *str2; 5787 PyObject *result; 5788 5789 self = PyUnicode_FromObject(obj); 5790 if (self == NULL) 5791 return NULL; 5792 str1 = PyUnicode_FromObject(subobj); 5793 if (str1 == NULL) { 5794 Py_DECREF(self); 5795 return NULL; 5796 } 5797 str2 = PyUnicode_FromObject(replobj); 5798 if (str2 == NULL) { 5799 Py_DECREF(self); 5800 Py_DECREF(str1); 5801 return NULL; 5802 } 5803 result = replace((PyUnicodeObject *)self, 5804 (PyUnicodeObject *)str1, 5805 (PyUnicodeObject *)str2, 5806 maxcount); 5807 Py_DECREF(self); 5808 Py_DECREF(str1); 5809 Py_DECREF(str2); 5810 return result; 5811} 5812 5813PyDoc_STRVAR(replace__doc__, 5814"S.replace (old, new[, maxsplit]) -> unicode\n\ 5815\n\ 5816Return a copy of S with all occurrences of substring\n\ 5817old replaced by new. If the optional argument maxsplit is\n\ 5818given, only the first maxsplit occurrences are replaced."); 5819 5820static PyObject* 5821unicode_replace(PyUnicodeObject *self, PyObject *args) 5822{ 5823 PyUnicodeObject *str1; 5824 PyUnicodeObject *str2; 5825 int maxcount = -1; 5826 PyObject *result; 5827 5828 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 5829 return NULL; 5830 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 5831 if (str1 == NULL) 5832 return NULL; 5833 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 5834 if (str2 == NULL) { 5835 Py_DECREF(str1); 5836 return NULL; 5837 } 5838 5839 result = replace(self, str1, str2, maxcount); 5840 5841 Py_DECREF(str1); 5842 Py_DECREF(str2); 5843 return result; 5844} 5845 5846static 5847PyObject *unicode_repr(PyObject *unicode) 5848{ 5849 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 5850 PyUnicode_GET_SIZE(unicode), 5851 1); 5852} 5853 5854PyDoc_STRVAR(rfind__doc__, 5855"S.rfind(sub [,start [,end]]) -> int\n\ 5856\n\ 5857Return the highest index in S where substring sub is found,\n\ 5858such that sub is contained within s[start,end]. Optional\n\ 5859arguments start and end are interpreted as in slice notation.\n\ 5860\n\ 5861Return -1 on failure."); 5862 5863static PyObject * 5864unicode_rfind(PyUnicodeObject *self, PyObject *args) 5865{ 5866 PyUnicodeObject *substring; 5867 int start = 0; 5868 int end = INT_MAX; 5869 PyObject *result; 5870 5871 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 5872 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5873 return NULL; 5874 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5875 (PyObject *)substring); 5876 if (substring == NULL) 5877 return NULL; 5878 5879 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 5880 5881 Py_DECREF(substring); 5882 return result; 5883} 5884 5885PyDoc_STRVAR(rindex__doc__, 5886"S.rindex(sub [,start [,end]]) -> int\n\ 5887\n\ 5888Like S.rfind() but raise ValueError when the substring is not found."); 5889 5890static PyObject * 5891unicode_rindex(PyUnicodeObject *self, PyObject *args) 5892{ 5893 int result; 5894 PyUnicodeObject *substring; 5895 int start = 0; 5896 int end = INT_MAX; 5897 5898 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 5899 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5900 return NULL; 5901 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5902 (PyObject *)substring); 5903 if (substring == NULL) 5904 return NULL; 5905 5906 result = findstring(self, substring, start, end, -1); 5907 5908 Py_DECREF(substring); 5909 if (result < 0) { 5910 PyErr_SetString(PyExc_ValueError, "substring not found"); 5911 return NULL; 5912 } 5913 return PyInt_FromLong(result); 5914} 5915 5916PyDoc_STRVAR(rjust__doc__, 5917"S.rjust(width[, fillchar]) -> unicode\n\ 5918\n\ 5919Return S right justified in a Unicode string of length width. Padding is\n\ 5920done using the specified fill character (default is a space)."); 5921 5922static PyObject * 5923unicode_rjust(PyUnicodeObject *self, PyObject *args) 5924{ 5925 int width; 5926 Py_UNICODE fillchar = ' '; 5927 5928 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar)) 5929 return NULL; 5930 5931 if (self->length >= width && PyUnicode_CheckExact(self)) { 5932 Py_INCREF(self); 5933 return (PyObject*) self; 5934 } 5935 5936 return (PyObject*) pad(self, width - self->length, 0, fillchar); 5937} 5938 5939static PyObject* 5940unicode_slice(PyUnicodeObject *self, int start, int end) 5941{ 5942 /* standard clamping */ 5943 if (start < 0) 5944 start = 0; 5945 if (end < 0) 5946 end = 0; 5947 if (end > self->length) 5948 end = self->length; 5949 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 5950 /* full slice, return original string */ 5951 Py_INCREF(self); 5952 return (PyObject*) self; 5953 } 5954 if (start > end) 5955 start = end; 5956 /* copy slice */ 5957 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 5958 end - start); 5959} 5960 5961PyObject *PyUnicode_Split(PyObject *s, 5962 PyObject *sep, 5963 int maxsplit) 5964{ 5965 PyObject *result; 5966 5967 s = PyUnicode_FromObject(s); 5968 if (s == NULL) 5969 return NULL; 5970 if (sep != NULL) { 5971 sep = PyUnicode_FromObject(sep); 5972 if (sep == NULL) { 5973 Py_DECREF(s); 5974 return NULL; 5975 } 5976 } 5977 5978 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 5979 5980 Py_DECREF(s); 5981 Py_XDECREF(sep); 5982 return result; 5983} 5984 5985PyDoc_STRVAR(split__doc__, 5986"S.split([sep [,maxsplit]]) -> list of strings\n\ 5987\n\ 5988Return a list of the words in S, using sep as the\n\ 5989delimiter string. If maxsplit is given, at most maxsplit\n\ 5990splits are done. If sep is not specified or None, 5991any whitespace string is a separator."); 5992 5993static PyObject* 5994unicode_split(PyUnicodeObject *self, PyObject *args) 5995{ 5996 PyObject *substring = Py_None; 5997 int maxcount = -1; 5998 5999 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 6000 return NULL; 6001 6002 if (substring == Py_None) 6003 return split(self, NULL, maxcount); 6004 else if (PyUnicode_Check(substring)) 6005 return split(self, (PyUnicodeObject *)substring, maxcount); 6006 else 6007 return PyUnicode_Split((PyObject *)self, substring, maxcount); 6008} 6009 6010PyObject *PyUnicode_RSplit(PyObject *s, 6011 PyObject *sep, 6012 int maxsplit) 6013{ 6014 PyObject *result; 6015 6016 s = PyUnicode_FromObject(s); 6017 if (s == NULL) 6018 return NULL; 6019 if (sep != NULL) { 6020 sep = PyUnicode_FromObject(sep); 6021 if (sep == NULL) { 6022 Py_DECREF(s); 6023 return NULL; 6024 } 6025 } 6026 6027 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 6028 6029 Py_DECREF(s); 6030 Py_XDECREF(sep); 6031 return result; 6032} 6033 6034PyDoc_STRVAR(rsplit__doc__, 6035"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 6036\n\ 6037Return a list of the words in S, using sep as the\n\ 6038delimiter string, starting at the end of the string and\n\ 6039working to the front. If maxsplit is given, at most maxsplit\n\ 6040splits are done. If sep is not specified, any whitespace string\n\ 6041is a separator."); 6042 6043static PyObject* 6044unicode_rsplit(PyUnicodeObject *self, PyObject *args) 6045{ 6046 PyObject *substring = Py_None; 6047 int maxcount = -1; 6048 6049 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount)) 6050 return NULL; 6051 6052 if (substring == Py_None) 6053 return rsplit(self, NULL, maxcount); 6054 else if (PyUnicode_Check(substring)) 6055 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 6056 else 6057 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 6058} 6059 6060PyDoc_STRVAR(splitlines__doc__, 6061"S.splitlines([keepends]]) -> list of strings\n\ 6062\n\ 6063Return a list of the lines in S, breaking at line boundaries.\n\ 6064Line breaks are not included in the resulting list unless keepends\n\ 6065is given and true."); 6066 6067static PyObject* 6068unicode_splitlines(PyUnicodeObject *self, PyObject *args) 6069{ 6070 int keepends = 0; 6071 6072 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 6073 return NULL; 6074 6075 return PyUnicode_Splitlines((PyObject *)self, keepends); 6076} 6077 6078static 6079PyObject *unicode_str(PyUnicodeObject *self) 6080{ 6081 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 6082} 6083 6084PyDoc_STRVAR(swapcase__doc__, 6085"S.swapcase() -> unicode\n\ 6086\n\ 6087Return a copy of S with uppercase characters converted to lowercase\n\ 6088and vice versa."); 6089 6090static PyObject* 6091unicode_swapcase(PyUnicodeObject *self) 6092{ 6093 return fixup(self, fixswapcase); 6094} 6095 6096PyDoc_STRVAR(translate__doc__, 6097"S.translate(table) -> unicode\n\ 6098\n\ 6099Return a copy of the string S, where all characters have been mapped\n\ 6100through the given translation table, which must be a mapping of\n\ 6101Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 6102Unmapped characters are left untouched. Characters mapped to None\n\ 6103are deleted."); 6104 6105static PyObject* 6106unicode_translate(PyUnicodeObject *self, PyObject *table) 6107{ 6108 return PyUnicode_TranslateCharmap(self->str, 6109 self->length, 6110 table, 6111 "ignore"); 6112} 6113 6114PyDoc_STRVAR(upper__doc__, 6115"S.upper() -> unicode\n\ 6116\n\ 6117Return a copy of S converted to uppercase."); 6118 6119static PyObject* 6120unicode_upper(PyUnicodeObject *self) 6121{ 6122 return fixup(self, fixupper); 6123} 6124 6125PyDoc_STRVAR(zfill__doc__, 6126"S.zfill(width) -> unicode\n\ 6127\n\ 6128Pad a numeric string x with zeros on the left, to fill a field\n\ 6129of the specified width. The string x is never truncated."); 6130 6131static PyObject * 6132unicode_zfill(PyUnicodeObject *self, PyObject *args) 6133{ 6134 int fill; 6135 PyUnicodeObject *u; 6136 6137 int width; 6138 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 6139 return NULL; 6140 6141 if (self->length >= width) { 6142 if (PyUnicode_CheckExact(self)) { 6143 Py_INCREF(self); 6144 return (PyObject*) self; 6145 } 6146 else 6147 return PyUnicode_FromUnicode( 6148 PyUnicode_AS_UNICODE(self), 6149 PyUnicode_GET_SIZE(self) 6150 ); 6151 } 6152 6153 fill = width - self->length; 6154 6155 u = pad(self, fill, 0, '0'); 6156 6157 if (u == NULL) 6158 return NULL; 6159 6160 if (u->str[fill] == '+' || u->str[fill] == '-') { 6161 /* move sign to beginning of string */ 6162 u->str[0] = u->str[fill]; 6163 u->str[fill] = '0'; 6164 } 6165 6166 return (PyObject*) u; 6167} 6168 6169#if 0 6170static PyObject* 6171unicode_freelistsize(PyUnicodeObject *self) 6172{ 6173 return PyInt_FromLong(unicode_freelist_size); 6174} 6175#endif 6176 6177PyDoc_STRVAR(startswith__doc__, 6178"S.startswith(prefix[, start[, end]]) -> bool\n\ 6179\n\ 6180Return True if S starts with the specified prefix, False otherwise.\n\ 6181With optional start, test S beginning at that position.\n\ 6182With optional end, stop comparing S at that position."); 6183 6184static PyObject * 6185unicode_startswith(PyUnicodeObject *self, 6186 PyObject *args) 6187{ 6188 PyUnicodeObject *substring; 6189 int start = 0; 6190 int end = INT_MAX; 6191 PyObject *result; 6192 6193 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 6194 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6195 return NULL; 6196 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6197 (PyObject *)substring); 6198 if (substring == NULL) 6199 return NULL; 6200 6201 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1)); 6202 6203 Py_DECREF(substring); 6204 return result; 6205} 6206 6207 6208PyDoc_STRVAR(endswith__doc__, 6209"S.endswith(suffix[, start[, end]]) -> bool\n\ 6210\n\ 6211Return True if S ends with the specified suffix, False otherwise.\n\ 6212With optional start, test S beginning at that position.\n\ 6213With optional end, stop comparing S at that position."); 6214 6215static PyObject * 6216unicode_endswith(PyUnicodeObject *self, 6217 PyObject *args) 6218{ 6219 PyUnicodeObject *substring; 6220 int start = 0; 6221 int end = INT_MAX; 6222 PyObject *result; 6223 6224 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 6225 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6226 return NULL; 6227 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6228 (PyObject *)substring); 6229 if (substring == NULL) 6230 return NULL; 6231 6232 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1)); 6233 6234 Py_DECREF(substring); 6235 return result; 6236} 6237 6238 6239 6240static PyObject * 6241unicode_getnewargs(PyUnicodeObject *v) 6242{ 6243 return Py_BuildValue("(u#)", v->str, v->length); 6244} 6245 6246 6247static PyMethodDef unicode_methods[] = { 6248 6249 /* Order is according to common usage: often used methods should 6250 appear first, since lookup is done sequentially. */ 6251 6252 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 6253 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 6254 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 6255 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 6256 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 6257 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 6258 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 6259 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 6260 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 6261 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 6262 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 6263 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 6264 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 6265 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 6266 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 6267 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, 6268/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 6269 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 6270 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 6271 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 6272 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 6273 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 6274 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 6275 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 6276 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 6277 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 6278 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 6279 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 6280 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 6281 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 6282 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 6283 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 6284 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 6285 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 6286 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 6287 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 6288 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 6289 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 6290#if 0 6291 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 6292#endif 6293 6294#if 0 6295 /* This one is just used for debugging the implementation. */ 6296 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 6297#endif 6298 6299 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 6300 {NULL, NULL} 6301}; 6302 6303static PyObject * 6304unicode_mod(PyObject *v, PyObject *w) 6305{ 6306 if (!PyUnicode_Check(v)) { 6307 Py_INCREF(Py_NotImplemented); 6308 return Py_NotImplemented; 6309 } 6310 return PyUnicode_Format(v, w); 6311} 6312 6313static PyNumberMethods unicode_as_number = { 6314 0, /*nb_add*/ 6315 0, /*nb_subtract*/ 6316 0, /*nb_multiply*/ 6317 0, /*nb_divide*/ 6318 unicode_mod, /*nb_remainder*/ 6319}; 6320 6321static PySequenceMethods unicode_as_sequence = { 6322 (inquiry) unicode_length, /* sq_length */ 6323 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 6324 (intargfunc) unicode_repeat, /* sq_repeat */ 6325 (intargfunc) unicode_getitem, /* sq_item */ 6326 (intintargfunc) unicode_slice, /* sq_slice */ 6327 0, /* sq_ass_item */ 6328 0, /* sq_ass_slice */ 6329 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 6330}; 6331 6332static PyObject* 6333unicode_subscript(PyUnicodeObject* self, PyObject* item) 6334{ 6335 if (PyInt_Check(item)) { 6336 long i = PyInt_AS_LONG(item); 6337 if (i < 0) 6338 i += PyString_GET_SIZE(self); 6339 return unicode_getitem(self, i); 6340 } else if (PyLong_Check(item)) { 6341 long i = PyLong_AsLong(item); 6342 if (i == -1 && PyErr_Occurred()) 6343 return NULL; 6344 if (i < 0) 6345 i += PyString_GET_SIZE(self); 6346 return unicode_getitem(self, i); 6347 } else if (PySlice_Check(item)) { 6348 int start, stop, step, slicelength, cur, i; 6349 Py_UNICODE* source_buf; 6350 Py_UNICODE* result_buf; 6351 PyObject* result; 6352 6353 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self), 6354 &start, &stop, &step, &slicelength) < 0) { 6355 return NULL; 6356 } 6357 6358 if (slicelength <= 0) { 6359 return PyUnicode_FromUnicode(NULL, 0); 6360 } else { 6361 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 6362 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE)); 6363 6364 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 6365 result_buf[i] = source_buf[cur]; 6366 } 6367 6368 result = PyUnicode_FromUnicode(result_buf, slicelength); 6369 PyMem_FREE(result_buf); 6370 return result; 6371 } 6372 } else { 6373 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 6374 return NULL; 6375 } 6376} 6377 6378static PyMappingMethods unicode_as_mapping = { 6379 (inquiry)unicode_length, /* mp_length */ 6380 (binaryfunc)unicode_subscript, /* mp_subscript */ 6381 (objobjargproc)0, /* mp_ass_subscript */ 6382}; 6383 6384static int 6385unicode_buffer_getreadbuf(PyUnicodeObject *self, 6386 int index, 6387 const void **ptr) 6388{ 6389 if (index != 0) { 6390 PyErr_SetString(PyExc_SystemError, 6391 "accessing non-existent unicode segment"); 6392 return -1; 6393 } 6394 *ptr = (void *) self->str; 6395 return PyUnicode_GET_DATA_SIZE(self); 6396} 6397 6398static int 6399unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 6400 const void **ptr) 6401{ 6402 PyErr_SetString(PyExc_TypeError, 6403 "cannot use unicode as modifiable buffer"); 6404 return -1; 6405} 6406 6407static int 6408unicode_buffer_getsegcount(PyUnicodeObject *self, 6409 int *lenp) 6410{ 6411 if (lenp) 6412 *lenp = PyUnicode_GET_DATA_SIZE(self); 6413 return 1; 6414} 6415 6416static int 6417unicode_buffer_getcharbuf(PyUnicodeObject *self, 6418 int index, 6419 const void **ptr) 6420{ 6421 PyObject *str; 6422 6423 if (index != 0) { 6424 PyErr_SetString(PyExc_SystemError, 6425 "accessing non-existent unicode segment"); 6426 return -1; 6427 } 6428 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 6429 if (str == NULL) 6430 return -1; 6431 *ptr = (void *) PyString_AS_STRING(str); 6432 return PyString_GET_SIZE(str); 6433} 6434 6435/* Helpers for PyUnicode_Format() */ 6436 6437static PyObject * 6438getnextarg(PyObject *args, int arglen, int *p_argidx) 6439{ 6440 int argidx = *p_argidx; 6441 if (argidx < arglen) { 6442 (*p_argidx)++; 6443 if (arglen < 0) 6444 return args; 6445 else 6446 return PyTuple_GetItem(args, argidx); 6447 } 6448 PyErr_SetString(PyExc_TypeError, 6449 "not enough arguments for format string"); 6450 return NULL; 6451} 6452 6453#define F_LJUST (1<<0) 6454#define F_SIGN (1<<1) 6455#define F_BLANK (1<<2) 6456#define F_ALT (1<<3) 6457#define F_ZERO (1<<4) 6458 6459static 6460int usprintf(register Py_UNICODE *buffer, char *format, ...) 6461{ 6462 register int i; 6463 int len; 6464 va_list va; 6465 char *charbuffer; 6466 va_start(va, format); 6467 6468 /* First, format the string as char array, then expand to Py_UNICODE 6469 array. */ 6470 charbuffer = (char *)buffer; 6471 len = vsprintf(charbuffer, format, va); 6472 for (i = len - 1; i >= 0; i--) 6473 buffer[i] = (Py_UNICODE) charbuffer[i]; 6474 6475 va_end(va); 6476 return len; 6477} 6478 6479/* XXX To save some code duplication, formatfloat/long/int could have been 6480 shared with stringobject.c, converting from 8-bit to Unicode after the 6481 formatting is done. */ 6482 6483static int 6484formatfloat(Py_UNICODE *buf, 6485 size_t buflen, 6486 int flags, 6487 int prec, 6488 int type, 6489 PyObject *v) 6490{ 6491 /* fmt = '%#.' + `prec` + `type` 6492 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 6493 char fmt[20]; 6494 double x; 6495 6496 x = PyFloat_AsDouble(v); 6497 if (x == -1.0 && PyErr_Occurred()) 6498 return -1; 6499 if (prec < 0) 6500 prec = 6; 6501 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 6502 type = 'g'; 6503 /* Worst case length calc to ensure no buffer overrun: 6504 6505 'g' formats: 6506 fmt = %#.<prec>g 6507 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 6508 for any double rep.) 6509 len = 1 + prec + 1 + 2 + 5 = 9 + prec 6510 6511 'f' formats: 6512 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 6513 len = 1 + 50 + 1 + prec = 52 + prec 6514 6515 If prec=0 the effective precision is 1 (the leading digit is 6516 always given), therefore increase the length by one. 6517 6518 */ 6519 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) || 6520 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 6521 PyErr_SetString(PyExc_OverflowError, 6522 "formatted float is too long (precision too large?)"); 6523 return -1; 6524 } 6525 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 6526 (flags&F_ALT) ? "#" : "", 6527 prec, type); 6528 return usprintf(buf, fmt, x); 6529} 6530 6531static PyObject* 6532formatlong(PyObject *val, int flags, int prec, int type) 6533{ 6534 char *buf; 6535 int i, len; 6536 PyObject *str; /* temporary string object. */ 6537 PyUnicodeObject *result; 6538 6539 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 6540 if (!str) 6541 return NULL; 6542 result = _PyUnicode_New(len); 6543 for (i = 0; i < len; i++) 6544 result->str[i] = buf[i]; 6545 result->str[len] = 0; 6546 Py_DECREF(str); 6547 return (PyObject*)result; 6548} 6549 6550static int 6551formatint(Py_UNICODE *buf, 6552 size_t buflen, 6553 int flags, 6554 int prec, 6555 int type, 6556 PyObject *v) 6557{ 6558 /* fmt = '%#.' + `prec` + 'l' + `type` 6559 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 6560 * + 1 + 1 6561 * = 24 6562 */ 6563 char fmt[64]; /* plenty big enough! */ 6564 char *sign; 6565 long x; 6566 6567 x = PyInt_AsLong(v); 6568 if (x == -1 && PyErr_Occurred()) 6569 return -1; 6570 if (x < 0 && type == 'u') { 6571 type = 'd'; 6572 } 6573 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 6574 sign = "-"; 6575 else 6576 sign = ""; 6577 if (prec < 0) 6578 prec = 1; 6579 6580 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 6581 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 6582 */ 6583 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 6584 PyErr_SetString(PyExc_OverflowError, 6585 "formatted integer is too long (precision too large?)"); 6586 return -1; 6587 } 6588 6589 if ((flags & F_ALT) && 6590 (type == 'x' || type == 'X')) { 6591 /* When converting under %#x or %#X, there are a number 6592 * of issues that cause pain: 6593 * - when 0 is being converted, the C standard leaves off 6594 * the '0x' or '0X', which is inconsistent with other 6595 * %#x/%#X conversions and inconsistent with Python's 6596 * hex() function 6597 * - there are platforms that violate the standard and 6598 * convert 0 with the '0x' or '0X' 6599 * (Metrowerks, Compaq Tru64) 6600 * - there are platforms that give '0x' when converting 6601 * under %#X, but convert 0 in accordance with the 6602 * standard (OS/2 EMX) 6603 * 6604 * We can achieve the desired consistency by inserting our 6605 * own '0x' or '0X' prefix, and substituting %x/%X in place 6606 * of %#x/%#X. 6607 * 6608 * Note that this is the same approach as used in 6609 * formatint() in stringobject.c 6610 */ 6611 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 6612 sign, type, prec, type); 6613 } 6614 else { 6615 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 6616 sign, (flags&F_ALT) ? "#" : "", 6617 prec, type); 6618 } 6619 if (sign[0]) 6620 return usprintf(buf, fmt, -x); 6621 else 6622 return usprintf(buf, fmt, x); 6623} 6624 6625static int 6626formatchar(Py_UNICODE *buf, 6627 size_t buflen, 6628 PyObject *v) 6629{ 6630 /* presume that the buffer is at least 2 characters long */ 6631 if (PyUnicode_Check(v)) { 6632 if (PyUnicode_GET_SIZE(v) != 1) 6633 goto onError; 6634 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 6635 } 6636 6637 else if (PyString_Check(v)) { 6638 if (PyString_GET_SIZE(v) != 1) 6639 goto onError; 6640 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 6641 } 6642 6643 else { 6644 /* Integer input truncated to a character */ 6645 long x; 6646 x = PyInt_AsLong(v); 6647 if (x == -1 && PyErr_Occurred()) 6648 goto onError; 6649#ifdef Py_UNICODE_WIDE 6650 if (x < 0 || x > 0x10ffff) { 6651 PyErr_SetString(PyExc_OverflowError, 6652 "%c arg not in range(0x110000) " 6653 "(wide Python build)"); 6654 return -1; 6655 } 6656#else 6657 if (x < 0 || x > 0xffff) { 6658 PyErr_SetString(PyExc_OverflowError, 6659 "%c arg not in range(0x10000) " 6660 "(narrow Python build)"); 6661 return -1; 6662 } 6663#endif 6664 buf[0] = (Py_UNICODE) x; 6665 } 6666 buf[1] = '\0'; 6667 return 1; 6668 6669 onError: 6670 PyErr_SetString(PyExc_TypeError, 6671 "%c requires int or char"); 6672 return -1; 6673} 6674 6675/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 6676 6677 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 6678 chars are formatted. XXX This is a magic number. Each formatting 6679 routine does bounds checking to ensure no overflow, but a better 6680 solution may be to malloc a buffer of appropriate size for each 6681 format. For now, the current solution is sufficient. 6682*/ 6683#define FORMATBUFLEN (size_t)120 6684 6685PyObject *PyUnicode_Format(PyObject *format, 6686 PyObject *args) 6687{ 6688 Py_UNICODE *fmt, *res; 6689 int fmtcnt, rescnt, reslen, arglen, argidx; 6690 int args_owned = 0; 6691 PyUnicodeObject *result = NULL; 6692 PyObject *dict = NULL; 6693 PyObject *uformat; 6694 6695 if (format == NULL || args == NULL) { 6696 PyErr_BadInternalCall(); 6697 return NULL; 6698 } 6699 uformat = PyUnicode_FromObject(format); 6700 if (uformat == NULL) 6701 return NULL; 6702 fmt = PyUnicode_AS_UNICODE(uformat); 6703 fmtcnt = PyUnicode_GET_SIZE(uformat); 6704 6705 reslen = rescnt = fmtcnt + 100; 6706 result = _PyUnicode_New(reslen); 6707 if (result == NULL) 6708 goto onError; 6709 res = PyUnicode_AS_UNICODE(result); 6710 6711 if (PyTuple_Check(args)) { 6712 arglen = PyTuple_Size(args); 6713 argidx = 0; 6714 } 6715 else { 6716 arglen = -1; 6717 argidx = -2; 6718 } 6719 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 6720 !PyObject_TypeCheck(args, &PyBaseString_Type)) 6721 dict = args; 6722 6723 while (--fmtcnt >= 0) { 6724 if (*fmt != '%') { 6725 if (--rescnt < 0) { 6726 rescnt = fmtcnt + 100; 6727 reslen += rescnt; 6728 if (_PyUnicode_Resize(&result, reslen) < 0) 6729 return NULL; 6730 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 6731 --rescnt; 6732 } 6733 *res++ = *fmt++; 6734 } 6735 else { 6736 /* Got a format specifier */ 6737 int flags = 0; 6738 int width = -1; 6739 int prec = -1; 6740 Py_UNICODE c = '\0'; 6741 Py_UNICODE fill; 6742 PyObject *v = NULL; 6743 PyObject *temp = NULL; 6744 Py_UNICODE *pbuf; 6745 Py_UNICODE sign; 6746 int len; 6747 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 6748 6749 fmt++; 6750 if (*fmt == '(') { 6751 Py_UNICODE *keystart; 6752 int keylen; 6753 PyObject *key; 6754 int pcount = 1; 6755 6756 if (dict == NULL) { 6757 PyErr_SetString(PyExc_TypeError, 6758 "format requires a mapping"); 6759 goto onError; 6760 } 6761 ++fmt; 6762 --fmtcnt; 6763 keystart = fmt; 6764 /* Skip over balanced parentheses */ 6765 while (pcount > 0 && --fmtcnt >= 0) { 6766 if (*fmt == ')') 6767 --pcount; 6768 else if (*fmt == '(') 6769 ++pcount; 6770 fmt++; 6771 } 6772 keylen = fmt - keystart - 1; 6773 if (fmtcnt < 0 || pcount > 0) { 6774 PyErr_SetString(PyExc_ValueError, 6775 "incomplete format key"); 6776 goto onError; 6777 } 6778#if 0 6779 /* keys are converted to strings using UTF-8 and 6780 then looked up since Python uses strings to hold 6781 variables names etc. in its namespaces and we 6782 wouldn't want to break common idioms. */ 6783 key = PyUnicode_EncodeUTF8(keystart, 6784 keylen, 6785 NULL); 6786#else 6787 key = PyUnicode_FromUnicode(keystart, keylen); 6788#endif 6789 if (key == NULL) 6790 goto onError; 6791 if (args_owned) { 6792 Py_DECREF(args); 6793 args_owned = 0; 6794 } 6795 args = PyObject_GetItem(dict, key); 6796 Py_DECREF(key); 6797 if (args == NULL) { 6798 goto onError; 6799 } 6800 args_owned = 1; 6801 arglen = -1; 6802 argidx = -2; 6803 } 6804 while (--fmtcnt >= 0) { 6805 switch (c = *fmt++) { 6806 case '-': flags |= F_LJUST; continue; 6807 case '+': flags |= F_SIGN; continue; 6808 case ' ': flags |= F_BLANK; continue; 6809 case '#': flags |= F_ALT; continue; 6810 case '0': flags |= F_ZERO; continue; 6811 } 6812 break; 6813 } 6814 if (c == '*') { 6815 v = getnextarg(args, arglen, &argidx); 6816 if (v == NULL) 6817 goto onError; 6818 if (!PyInt_Check(v)) { 6819 PyErr_SetString(PyExc_TypeError, 6820 "* wants int"); 6821 goto onError; 6822 } 6823 width = PyInt_AsLong(v); 6824 if (width < 0) { 6825 flags |= F_LJUST; 6826 width = -width; 6827 } 6828 if (--fmtcnt >= 0) 6829 c = *fmt++; 6830 } 6831 else if (c >= '0' && c <= '9') { 6832 width = c - '0'; 6833 while (--fmtcnt >= 0) { 6834 c = *fmt++; 6835 if (c < '0' || c > '9') 6836 break; 6837 if ((width*10) / 10 != width) { 6838 PyErr_SetString(PyExc_ValueError, 6839 "width too big"); 6840 goto onError; 6841 } 6842 width = width*10 + (c - '0'); 6843 } 6844 } 6845 if (c == '.') { 6846 prec = 0; 6847 if (--fmtcnt >= 0) 6848 c = *fmt++; 6849 if (c == '*') { 6850 v = getnextarg(args, arglen, &argidx); 6851 if (v == NULL) 6852 goto onError; 6853 if (!PyInt_Check(v)) { 6854 PyErr_SetString(PyExc_TypeError, 6855 "* wants int"); 6856 goto onError; 6857 } 6858 prec = PyInt_AsLong(v); 6859 if (prec < 0) 6860 prec = 0; 6861 if (--fmtcnt >= 0) 6862 c = *fmt++; 6863 } 6864 else if (c >= '0' && c <= '9') { 6865 prec = c - '0'; 6866 while (--fmtcnt >= 0) { 6867 c = Py_CHARMASK(*fmt++); 6868 if (c < '0' || c > '9') 6869 break; 6870 if ((prec*10) / 10 != prec) { 6871 PyErr_SetString(PyExc_ValueError, 6872 "prec too big"); 6873 goto onError; 6874 } 6875 prec = prec*10 + (c - '0'); 6876 } 6877 } 6878 } /* prec */ 6879 if (fmtcnt >= 0) { 6880 if (c == 'h' || c == 'l' || c == 'L') { 6881 if (--fmtcnt >= 0) 6882 c = *fmt++; 6883 } 6884 } 6885 if (fmtcnt < 0) { 6886 PyErr_SetString(PyExc_ValueError, 6887 "incomplete format"); 6888 goto onError; 6889 } 6890 if (c != '%') { 6891 v = getnextarg(args, arglen, &argidx); 6892 if (v == NULL) 6893 goto onError; 6894 } 6895 sign = 0; 6896 fill = ' '; 6897 switch (c) { 6898 6899 case '%': 6900 pbuf = formatbuf; 6901 /* presume that buffer length is at least 1 */ 6902 pbuf[0] = '%'; 6903 len = 1; 6904 break; 6905 6906 case 's': 6907 case 'r': 6908 if (PyUnicode_Check(v) && c == 's') { 6909 temp = v; 6910 Py_INCREF(temp); 6911 } 6912 else { 6913 PyObject *unicode; 6914 if (c == 's') 6915 temp = PyObject_Unicode(v); 6916 else 6917 temp = PyObject_Repr(v); 6918 if (temp == NULL) 6919 goto onError; 6920 if (PyUnicode_Check(temp)) 6921 /* nothing to do */; 6922 else if (PyString_Check(temp)) { 6923 /* convert to string to Unicode */ 6924 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 6925 PyString_GET_SIZE(temp), 6926 NULL, 6927 "strict"); 6928 Py_DECREF(temp); 6929 temp = unicode; 6930 if (temp == NULL) 6931 goto onError; 6932 } 6933 else { 6934 Py_DECREF(temp); 6935 PyErr_SetString(PyExc_TypeError, 6936 "%s argument has non-string str()"); 6937 goto onError; 6938 } 6939 } 6940 pbuf = PyUnicode_AS_UNICODE(temp); 6941 len = PyUnicode_GET_SIZE(temp); 6942 if (prec >= 0 && len > prec) 6943 len = prec; 6944 break; 6945 6946 case 'i': 6947 case 'd': 6948 case 'u': 6949 case 'o': 6950 case 'x': 6951 case 'X': 6952 if (c == 'i') 6953 c = 'd'; 6954 if (PyLong_Check(v)) { 6955 temp = formatlong(v, flags, prec, c); 6956 if (!temp) 6957 goto onError; 6958 pbuf = PyUnicode_AS_UNICODE(temp); 6959 len = PyUnicode_GET_SIZE(temp); 6960 sign = 1; 6961 } 6962 else { 6963 pbuf = formatbuf; 6964 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6965 flags, prec, c, v); 6966 if (len < 0) 6967 goto onError; 6968 sign = 1; 6969 } 6970 if (flags & F_ZERO) 6971 fill = '0'; 6972 break; 6973 6974 case 'e': 6975 case 'E': 6976 case 'f': 6977 case 'F': 6978 case 'g': 6979 case 'G': 6980 if (c == 'F') 6981 c = 'f'; 6982 pbuf = formatbuf; 6983 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6984 flags, prec, c, v); 6985 if (len < 0) 6986 goto onError; 6987 sign = 1; 6988 if (flags & F_ZERO) 6989 fill = '0'; 6990 break; 6991 6992 case 'c': 6993 pbuf = formatbuf; 6994 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 6995 if (len < 0) 6996 goto onError; 6997 break; 6998 6999 default: 7000 PyErr_Format(PyExc_ValueError, 7001 "unsupported format character '%c' (0x%x) " 7002 "at index %i", 7003 (31<=c && c<=126) ? (char)c : '?', 7004 (int)c, 7005 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat))); 7006 goto onError; 7007 } 7008 if (sign) { 7009 if (*pbuf == '-' || *pbuf == '+') { 7010 sign = *pbuf++; 7011 len--; 7012 } 7013 else if (flags & F_SIGN) 7014 sign = '+'; 7015 else if (flags & F_BLANK) 7016 sign = ' '; 7017 else 7018 sign = 0; 7019 } 7020 if (width < len) 7021 width = len; 7022 if (rescnt - (sign != 0) < width) { 7023 reslen -= rescnt; 7024 rescnt = width + fmtcnt + 100; 7025 reslen += rescnt; 7026 if (reslen < 0) { 7027 Py_DECREF(result); 7028 return PyErr_NoMemory(); 7029 } 7030 if (_PyUnicode_Resize(&result, reslen) < 0) 7031 return NULL; 7032 res = PyUnicode_AS_UNICODE(result) 7033 + reslen - rescnt; 7034 } 7035 if (sign) { 7036 if (fill != ' ') 7037 *res++ = sign; 7038 rescnt--; 7039 if (width > len) 7040 width--; 7041 } 7042 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 7043 assert(pbuf[0] == '0'); 7044 assert(pbuf[1] == c); 7045 if (fill != ' ') { 7046 *res++ = *pbuf++; 7047 *res++ = *pbuf++; 7048 } 7049 rescnt -= 2; 7050 width -= 2; 7051 if (width < 0) 7052 width = 0; 7053 len -= 2; 7054 } 7055 if (width > len && !(flags & F_LJUST)) { 7056 do { 7057 --rescnt; 7058 *res++ = fill; 7059 } while (--width > len); 7060 } 7061 if (fill == ' ') { 7062 if (sign) 7063 *res++ = sign; 7064 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 7065 assert(pbuf[0] == '0'); 7066 assert(pbuf[1] == c); 7067 *res++ = *pbuf++; 7068 *res++ = *pbuf++; 7069 } 7070 } 7071 Py_UNICODE_COPY(res, pbuf, len); 7072 res += len; 7073 rescnt -= len; 7074 while (--width >= len) { 7075 --rescnt; 7076 *res++ = ' '; 7077 } 7078 if (dict && (argidx < arglen) && c != '%') { 7079 PyErr_SetString(PyExc_TypeError, 7080 "not all arguments converted during string formatting"); 7081 goto onError; 7082 } 7083 Py_XDECREF(temp); 7084 } /* '%' */ 7085 } /* until end */ 7086 if (argidx < arglen && !dict) { 7087 PyErr_SetString(PyExc_TypeError, 7088 "not all arguments converted during string formatting"); 7089 goto onError; 7090 } 7091 7092 if (args_owned) { 7093 Py_DECREF(args); 7094 } 7095 Py_DECREF(uformat); 7096 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) 7097 goto onError; 7098 return (PyObject *)result; 7099 7100 onError: 7101 Py_XDECREF(result); 7102 Py_DECREF(uformat); 7103 if (args_owned) { 7104 Py_DECREF(args); 7105 } 7106 return NULL; 7107} 7108 7109static PyBufferProcs unicode_as_buffer = { 7110 (getreadbufferproc) unicode_buffer_getreadbuf, 7111 (getwritebufferproc) unicode_buffer_getwritebuf, 7112 (getsegcountproc) unicode_buffer_getsegcount, 7113 (getcharbufferproc) unicode_buffer_getcharbuf, 7114}; 7115 7116static PyObject * 7117unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 7118 7119static PyObject * 7120unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 7121{ 7122 PyObject *x = NULL; 7123 static char *kwlist[] = {"string", "encoding", "errors", 0}; 7124 char *encoding = NULL; 7125 char *errors = NULL; 7126 7127 if (type != &PyUnicode_Type) 7128 return unicode_subtype_new(type, args, kwds); 7129 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 7130 kwlist, &x, &encoding, &errors)) 7131 return NULL; 7132 if (x == NULL) 7133 return (PyObject *)_PyUnicode_New(0); 7134 if (encoding == NULL && errors == NULL) 7135 return PyObject_Unicode(x); 7136 else 7137 return PyUnicode_FromEncodedObject(x, encoding, errors); 7138} 7139 7140static PyObject * 7141unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 7142{ 7143 PyUnicodeObject *tmp, *pnew; 7144 int n; 7145 7146 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 7147 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 7148 if (tmp == NULL) 7149 return NULL; 7150 assert(PyUnicode_Check(tmp)); 7151 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 7152 if (pnew == NULL) { 7153 Py_DECREF(tmp); 7154 return NULL; 7155 } 7156 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 7157 if (pnew->str == NULL) { 7158 _Py_ForgetReference((PyObject *)pnew); 7159 PyObject_Del(pnew); 7160 Py_DECREF(tmp); 7161 return PyErr_NoMemory(); 7162 } 7163 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 7164 pnew->length = n; 7165 pnew->hash = tmp->hash; 7166 Py_DECREF(tmp); 7167 return (PyObject *)pnew; 7168} 7169 7170PyDoc_STRVAR(unicode_doc, 7171"unicode(string [, encoding[, errors]]) -> object\n\ 7172\n\ 7173Create a new Unicode object from the given encoded string.\n\ 7174encoding defaults to the current default string encoding.\n\ 7175errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 7176 7177PyTypeObject PyUnicode_Type = { 7178 PyObject_HEAD_INIT(&PyType_Type) 7179 0, /* ob_size */ 7180 "unicode", /* tp_name */ 7181 sizeof(PyUnicodeObject), /* tp_size */ 7182 0, /* tp_itemsize */ 7183 /* Slots */ 7184 (destructor)unicode_dealloc, /* tp_dealloc */ 7185 0, /* tp_print */ 7186 0, /* tp_getattr */ 7187 0, /* tp_setattr */ 7188 (cmpfunc) unicode_compare, /* tp_compare */ 7189 (reprfunc) unicode_repr, /* tp_repr */ 7190 &unicode_as_number, /* tp_as_number */ 7191 &unicode_as_sequence, /* tp_as_sequence */ 7192 &unicode_as_mapping, /* tp_as_mapping */ 7193 (hashfunc) unicode_hash, /* tp_hash*/ 7194 0, /* tp_call*/ 7195 (reprfunc) unicode_str, /* tp_str */ 7196 PyObject_GenericGetAttr, /* tp_getattro */ 7197 0, /* tp_setattro */ 7198 &unicode_as_buffer, /* tp_as_buffer */ 7199 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | 7200 Py_TPFLAGS_BASETYPE, /* tp_flags */ 7201 unicode_doc, /* tp_doc */ 7202 0, /* tp_traverse */ 7203 0, /* tp_clear */ 7204 0, /* tp_richcompare */ 7205 0, /* tp_weaklistoffset */ 7206 0, /* tp_iter */ 7207 0, /* tp_iternext */ 7208 unicode_methods, /* tp_methods */ 7209 0, /* tp_members */ 7210 0, /* tp_getset */ 7211 &PyBaseString_Type, /* tp_base */ 7212 0, /* tp_dict */ 7213 0, /* tp_descr_get */ 7214 0, /* tp_descr_set */ 7215 0, /* tp_dictoffset */ 7216 0, /* tp_init */ 7217 0, /* tp_alloc */ 7218 unicode_new, /* tp_new */ 7219 PyObject_Del, /* tp_free */ 7220}; 7221 7222/* Initialize the Unicode implementation */ 7223 7224void _PyUnicode_Init(void) 7225{ 7226 int i; 7227 7228 /* Init the implementation */ 7229 unicode_freelist = NULL; 7230 unicode_freelist_size = 0; 7231 unicode_empty = _PyUnicode_New(0); 7232 strcpy(unicode_default_encoding, "ascii"); 7233 for (i = 0; i < 256; i++) 7234 unicode_latin1[i] = NULL; 7235 if (PyType_Ready(&PyUnicode_Type) < 0) 7236 Py_FatalError("Can't initialize 'unicode'"); 7237} 7238 7239/* Finalize the Unicode implementation */ 7240 7241void 7242_PyUnicode_Fini(void) 7243{ 7244 PyUnicodeObject *u; 7245 int i; 7246 7247 Py_XDECREF(unicode_empty); 7248 unicode_empty = NULL; 7249 7250 for (i = 0; i < 256; i++) { 7251 if (unicode_latin1[i]) { 7252 Py_DECREF(unicode_latin1[i]); 7253 unicode_latin1[i] = NULL; 7254 } 7255 } 7256 7257 for (u = unicode_freelist; u != NULL;) { 7258 PyUnicodeObject *v = u; 7259 u = *(PyUnicodeObject **)u; 7260 if (v->str) 7261 PyMem_DEL(v->str); 7262 Py_XDECREF(v->defenc); 7263 PyObject_Del(v); 7264 } 7265 unicode_freelist = NULL; 7266 unicode_freelist_size = 0; 7267} 7268 7269/* 7270Local variables: 7271c-basic-offset: 4 7272indent-tabs-mode: nil 7273End: 7274*/ 7275