unicodeobject.c revision 8879a33613b33b32bda146a4da1a71d712a684d2
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
--------------------------------------------------------------------

*/

#include "Python.h"

#include "unicodeobject.h"
#include "ucnhash.h"

#ifdef MS_WIN32
#include <windows.h>
#endif

/* Limit for the Unicode object free list: _PyUnicode_Free() only puts
   an object on the list while unicode_freelist_size is below this
   value. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit (objects whose length is >= the limit have their buffer
   released when they are put on the list). This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects.  The list is singly linked: the
   first pointer-sized word of every free object holds the address of
   the next free object (see _PyUnicode_New() / _PyUnicode_Free()). */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are filled in lazily by
   PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.
102 103*/ 104static char unicode_default_encoding[100]; 105 106/* --- Unicode Object ----------------------------------------------------- */ 107 108static 109int unicode_resize(register PyUnicodeObject *unicode, 110 int length) 111{ 112 void *oldstr; 113 114 /* Shortcut if there's nothing much to do. */ 115 if (unicode->length == length) 116 goto reset; 117 118 /* Resizing shared object (unicode_empty or single character 119 objects) in-place is not allowed. Use PyUnicode_Resize() 120 instead ! */ 121 if (unicode == unicode_empty || 122 (unicode->length == 1 && 123 unicode->str[0] < 256 && 124 unicode_latin1[unicode->str[0]] == unicode)) { 125 PyErr_SetString(PyExc_SystemError, 126 "can't resize shared unicode objects"); 127 return -1; 128 } 129 130 /* We allocate one more byte to make sure the string is 131 Ux0000 terminated -- XXX is this needed ? */ 132 oldstr = unicode->str; 133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 134 if (!unicode->str) { 135 unicode->str = oldstr; 136 PyErr_NoMemory(); 137 return -1; 138 } 139 unicode->str[length] = 0; 140 unicode->length = length; 141 142 reset: 143 /* Reset the object caches */ 144 if (unicode->defenc) { 145 Py_DECREF(unicode->defenc); 146 unicode->defenc = NULL; 147 } 148 unicode->hash = -1; 149 150 return 0; 151} 152 153/* We allocate one more byte to make sure the string is 154 Ux0000 terminated -- XXX is this needed ? 155 156 XXX This allocator could further be enhanced by assuring that the 157 free list never reduces its size below 1. 
158 159*/ 160 161static 162PyUnicodeObject *_PyUnicode_New(int length) 163{ 164 register PyUnicodeObject *unicode; 165 166 /* Optimization for empty strings */ 167 if (length == 0 && unicode_empty != NULL) { 168 Py_INCREF(unicode_empty); 169 return unicode_empty; 170 } 171 172 /* Unicode freelist & memory allocation */ 173 if (unicode_freelist) { 174 unicode = unicode_freelist; 175 unicode_freelist = *(PyUnicodeObject **)unicode; 176 unicode_freelist_size--; 177 if (unicode->str) { 178 /* Keep-Alive optimization: we only upsize the buffer, 179 never downsize it. */ 180 if ((unicode->length < length) && 181 unicode_resize(unicode, length)) { 182 PyMem_DEL(unicode->str); 183 goto onError; 184 } 185 } 186 else { 187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 188 } 189 PyObject_INIT(unicode, &PyUnicode_Type); 190 } 191 else { 192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type); 193 if (unicode == NULL) 194 return NULL; 195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 196 } 197 198 if (!unicode->str) { 199 PyErr_NoMemory(); 200 goto onError; 201 } 202 unicode->str[length] = 0; 203 unicode->length = length; 204 unicode->hash = -1; 205 unicode->defenc = NULL; 206 return unicode; 207 208 onError: 209 _Py_ForgetReference((PyObject *)unicode); 210 PyObject_DEL(unicode); 211 return NULL; 212} 213 214static 215void _PyUnicode_Free(register PyUnicodeObject *unicode) 216{ 217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 218 /* Keep-Alive optimization */ 219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 220 PyMem_DEL(unicode->str); 221 unicode->str = NULL; 222 unicode->length = 0; 223 } 224 if (unicode->defenc) { 225 Py_DECREF(unicode->defenc); 226 unicode->defenc = NULL; 227 } 228 /* Add to free list */ 229 *(PyUnicodeObject **)unicode = unicode_freelist; 230 unicode_freelist = unicode; 231 unicode_freelist_size++; 232 } 233 else { 234 PyMem_DEL(unicode->str); 235 Py_XDECREF(unicode->defenc); 236 PyObject_DEL(unicode); 237 } 238} 239 240int 
PyUnicode_Resize(PyObject **unicode, 241 int length) 242{ 243 register PyUnicodeObject *v; 244 245 /* Argument checks */ 246 if (unicode == NULL) { 247 PyErr_BadInternalCall(); 248 return -1; 249 } 250 v = (PyUnicodeObject *)*unicode; 251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 252 PyErr_BadInternalCall(); 253 return -1; 254 } 255 256 /* Resizing unicode_empty and single character objects is not 257 possible since these are being shared. We simply return a fresh 258 copy with the same Unicode content. */ 259 if (v->length != length && 260 (v == unicode_empty || v->length == 1)) { 261 PyUnicodeObject *w = _PyUnicode_New(length); 262 if (w == NULL) 263 return -1; 264 Py_UNICODE_COPY(w->str, v->str, 265 length < v->length ? length : v->length); 266 *unicode = (PyObject *)w; 267 return 0; 268 } 269 270 /* Note that we don't have to modify *unicode for unshared Unicode 271 objects, since we can modify them in-place. */ 272 return unicode_resize(v, length); 273} 274 275/* Internal API for use in unicodeobject.c only ! */ 276#define _PyUnicode_Resize(unicodevar, length) \ 277 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 278 279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 280 int size) 281{ 282 PyUnicodeObject *unicode; 283 284 /* If the Unicode data is known at construction time, we can apply 285 some optimizations which share commonly used objects. 
*/ 286 if (u != NULL) { 287 288 /* Optimization for empty strings */ 289 if (size == 0 && unicode_empty != NULL) { 290 Py_INCREF(unicode_empty); 291 return (PyObject *)unicode_empty; 292 } 293 294 /* Single character Unicode objects in the Latin-1 range are 295 shared when using this constructor */ 296 if (size == 1 && *u < 256) { 297 unicode = unicode_latin1[*u]; 298 if (!unicode) { 299 unicode = _PyUnicode_New(1); 300 if (!unicode) 301 return NULL; 302 unicode->str[0] = *u; 303 unicode_latin1[*u] = unicode; 304 } 305 Py_INCREF(unicode); 306 return (PyObject *)unicode; 307 } 308 } 309 310 unicode = _PyUnicode_New(size); 311 if (!unicode) 312 return NULL; 313 314 /* Copy the Unicode data into the new object */ 315 if (u != NULL) 316 Py_UNICODE_COPY(unicode->str, u, size); 317 318 return (PyObject *)unicode; 319} 320 321#ifdef HAVE_WCHAR_H 322 323PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 324 int size) 325{ 326 PyUnicodeObject *unicode; 327 328 if (w == NULL) { 329 PyErr_BadInternalCall(); 330 return NULL; 331 } 332 333 unicode = _PyUnicode_New(size); 334 if (!unicode) 335 return NULL; 336 337 /* Copy the wchar_t data into the new object */ 338#ifdef HAVE_USABLE_WCHAR_T 339 memcpy(unicode->str, w, size * sizeof(wchar_t)); 340#else 341 { 342 register Py_UNICODE *u; 343 register int i; 344 u = PyUnicode_AS_UNICODE(unicode); 345 for (i = size; i >= 0; i--) 346 *u++ = *w++; 347 } 348#endif 349 350 return (PyObject *)unicode; 351} 352 353int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 354 register wchar_t *w, 355 int size) 356{ 357 if (unicode == NULL) { 358 PyErr_BadInternalCall(); 359 return -1; 360 } 361 if (size > PyUnicode_GET_SIZE(unicode)) 362 size = PyUnicode_GET_SIZE(unicode); 363#ifdef HAVE_USABLE_WCHAR_T 364 memcpy(w, unicode->str, size * sizeof(wchar_t)); 365#else 366 { 367 register Py_UNICODE *u; 368 register int i; 369 u = PyUnicode_AS_UNICODE(unicode); 370 for (i = size; i >= 0; i--) 371 *w++ = *u++; 372 } 373#endif 374 375 return size; 
376} 377 378#endif 379 380PyObject *PyUnicode_FromObject(register PyObject *obj) 381{ 382 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 383} 384 385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 386 const char *encoding, 387 const char *errors) 388{ 389 const char *s; 390 int len; 391 int owned = 0; 392 PyObject *v; 393 394 if (obj == NULL) { 395 PyErr_BadInternalCall(); 396 return NULL; 397 } 398 399 /* Coerce object */ 400 if (PyInstance_Check(obj)) { 401 PyObject *func; 402 func = PyObject_GetAttrString(obj, "__str__"); 403 if (func == NULL) { 404 PyErr_SetString(PyExc_TypeError, 405 "coercing to Unicode: instance doesn't define __str__"); 406 return NULL; 407 } 408 obj = PyEval_CallObject(func, NULL); 409 Py_DECREF(func); 410 if (obj == NULL) 411 return NULL; 412 owned = 1; 413 } 414 if (PyUnicode_Check(obj)) { 415 Py_INCREF(obj); 416 v = obj; 417 if (encoding) { 418 PyErr_SetString(PyExc_TypeError, 419 "decoding Unicode is not supported"); 420 return NULL; 421 } 422 goto done; 423 } 424 else if (PyString_Check(obj)) { 425 s = PyString_AS_STRING(obj); 426 len = PyString_GET_SIZE(obj); 427 } 428 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 429 /* Overwrite the error message with something more useful in 430 case of a TypeError. 
*/ 431 if (PyErr_ExceptionMatches(PyExc_TypeError)) 432 PyErr_Format(PyExc_TypeError, 433 "coercing to Unicode: need string or buffer, " 434 "%.80s found", 435 obj->ob_type->tp_name); 436 goto onError; 437 } 438 439 /* Convert to Unicode */ 440 if (len == 0) { 441 Py_INCREF(unicode_empty); 442 v = (PyObject *)unicode_empty; 443 } 444 else 445 v = PyUnicode_Decode(s, len, encoding, errors); 446 447 done: 448 if (owned) { 449 Py_DECREF(obj); 450 } 451 return v; 452 453 onError: 454 if (owned) { 455 Py_DECREF(obj); 456 } 457 return NULL; 458} 459 460PyObject *PyUnicode_Decode(const char *s, 461 int size, 462 const char *encoding, 463 const char *errors) 464{ 465 PyObject *buffer = NULL, *unicode; 466 467 if (encoding == NULL) 468 encoding = PyUnicode_GetDefaultEncoding(); 469 470 /* Shortcuts for common default encodings */ 471 if (strcmp(encoding, "utf-8") == 0) 472 return PyUnicode_DecodeUTF8(s, size, errors); 473 else if (strcmp(encoding, "latin-1") == 0) 474 return PyUnicode_DecodeLatin1(s, size, errors); 475 else if (strcmp(encoding, "ascii") == 0) 476 return PyUnicode_DecodeASCII(s, size, errors); 477 478 /* Decode via the codec registry */ 479 buffer = PyBuffer_FromMemory((void *)s, size); 480 if (buffer == NULL) 481 goto onError; 482 unicode = PyCodec_Decode(buffer, encoding, errors); 483 if (unicode == NULL) 484 goto onError; 485 if (!PyUnicode_Check(unicode)) { 486 PyErr_Format(PyExc_TypeError, 487 "decoder did not return an unicode object (type=%.400s)", 488 unicode->ob_type->tp_name); 489 Py_DECREF(unicode); 490 goto onError; 491 } 492 Py_DECREF(buffer); 493 return unicode; 494 495 onError: 496 Py_XDECREF(buffer); 497 return NULL; 498} 499 500PyObject *PyUnicode_Encode(const Py_UNICODE *s, 501 int size, 502 const char *encoding, 503 const char *errors) 504{ 505 PyObject *v, *unicode; 506 507 unicode = PyUnicode_FromUnicode(s, size); 508 if (unicode == NULL) 509 return NULL; 510 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 511 
Py_DECREF(unicode); 512 return v; 513} 514 515PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 516 const char *encoding, 517 const char *errors) 518{ 519 PyObject *v; 520 521 if (!PyUnicode_Check(unicode)) { 522 PyErr_BadArgument(); 523 goto onError; 524 } 525 526 if (encoding == NULL) 527 encoding = PyUnicode_GetDefaultEncoding(); 528 529 /* Shortcuts for common default encodings */ 530 if (errors == NULL) { 531 if (strcmp(encoding, "utf-8") == 0) 532 return PyUnicode_AsUTF8String(unicode); 533 else if (strcmp(encoding, "latin-1") == 0) 534 return PyUnicode_AsLatin1String(unicode); 535 else if (strcmp(encoding, "ascii") == 0) 536 return PyUnicode_AsASCIIString(unicode); 537 } 538 539 /* Encode via the codec registry */ 540 v = PyCodec_Encode(unicode, encoding, errors); 541 if (v == NULL) 542 goto onError; 543 /* XXX Should we really enforce this ? */ 544 if (!PyString_Check(v)) { 545 PyErr_Format(PyExc_TypeError, 546 "encoder did not return a string object (type=%.400s)", 547 v->ob_type->tp_name); 548 Py_DECREF(v); 549 goto onError; 550 } 551 return v; 552 553 onError: 554 return NULL; 555} 556 557/* Return a Python string holding the default encoded value of the 558 Unicode object. 559 560 The resulting string is cached in the Unicode object for subsequent 561 usage by this function. The cached version is needed to implement 562 the character buffer interface and will live (at least) as long as 563 the Unicode object itself. 564 565 The refcount of the string is *not* incremented. 566 567 *** Exported for internal use by the interpreter only !!! 
*** 568 569*/ 570 571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 572 const char *errors) 573{ 574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 575 576 if (v) 577 return v; 578 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 579 if (v && errors == NULL) 580 ((PyUnicodeObject *)unicode)->defenc = v; 581 return v; 582} 583 584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 585{ 586 if (!PyUnicode_Check(unicode)) { 587 PyErr_BadArgument(); 588 goto onError; 589 } 590 return PyUnicode_AS_UNICODE(unicode); 591 592 onError: 593 return NULL; 594} 595 596int PyUnicode_GetSize(PyObject *unicode) 597{ 598 if (!PyUnicode_Check(unicode)) { 599 PyErr_BadArgument(); 600 goto onError; 601 } 602 return PyUnicode_GET_SIZE(unicode); 603 604 onError: 605 return -1; 606} 607 608const char *PyUnicode_GetDefaultEncoding(void) 609{ 610 return unicode_default_encoding; 611} 612 613int PyUnicode_SetDefaultEncoding(const char *encoding) 614{ 615 PyObject *v; 616 617 /* Make sure the encoding is valid. As side effect, this also 618 loads the encoding into the codec registry cache. */ 619 v = _PyCodec_Lookup(encoding); 620 if (v == NULL) 621 goto onError; 622 Py_DECREF(v); 623 strncpy(unicode_default_encoding, 624 encoding, 625 sizeof(unicode_default_encoding)); 626 return 0; 627 628 onError: 629 return -1; 630} 631 632/* --- UTF-8 Codec -------------------------------------------------------- */ 633 634static 635char utf8_code_length[256] = { 636 /* Map UTF-8 encoded prefix byte to sequence length. zero means 637 illegal prefix. 
see RFC 2279 for details */ 638 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 639 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 640 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 641 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 642 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 643 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 644 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 645 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 649 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 650 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 651 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 652 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 653 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 654}; 655 656static 657int utf8_decoding_error(const char **source, 658 Py_UNICODE **dest, 659 const char *errors, 660 const char *details) 661{ 662 if ((errors == NULL) || 663 (strcmp(errors,"strict") == 0)) { 664 PyErr_Format(PyExc_UnicodeError, 665 "UTF-8 decoding error: %.400s", 666 details); 667 return -1; 668 } 669 else if (strcmp(errors,"ignore") == 0) { 670 (*source)++; 671 return 0; 672 } 673 else if (strcmp(errors,"replace") == 0) { 674 (*source)++; 675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 676 (*dest)++; 677 return 0; 678 } 679 else { 680 PyErr_Format(PyExc_ValueError, 681 "UTF-8 decoding error; unknown error handling code: %.400s", 682 errors); 683 return -1; 684 } 685} 686 687PyObject *PyUnicode_DecodeUTF8(const char *s, 688 int size, 689 const char *errors) 690{ 691 int n; 692 const char *e; 693 PyUnicodeObject *unicode; 694 Py_UNICODE *p; 695 const char *errmsg = ""; 696 697 /* Note: size will always be longer than the resulting Unicode 698 character count */ 699 unicode = _PyUnicode_New(size); 700 if (!unicode) 701 return NULL; 702 if (size == 0) 703 return (PyObject *)unicode; 704 705 /* Unpack UTF-8 
encoded data */ 706 p = unicode->str; 707 e = s + size; 708 709 while (s < e) { 710 Py_UCS4 ch = (unsigned char)*s; 711 712 if (ch < 0x80) { 713 *p++ = (Py_UNICODE)ch; 714 s++; 715 continue; 716 } 717 718 n = utf8_code_length[ch]; 719 720 if (s + n > e) { 721 errmsg = "unexpected end of data"; 722 goto utf8Error; 723 } 724 725 switch (n) { 726 727 case 0: 728 errmsg = "unexpected code byte"; 729 goto utf8Error; 730 731 case 1: 732 errmsg = "internal error"; 733 goto utf8Error; 734 735 case 2: 736 if ((s[1] & 0xc0) != 0x80) { 737 errmsg = "invalid data"; 738 goto utf8Error; 739 } 740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 741 if (ch < 0x80) { 742 errmsg = "illegal encoding"; 743 goto utf8Error; 744 } 745 else 746 *p++ = (Py_UNICODE)ch; 747 break; 748 749 case 3: 750 if ((s[1] & 0xc0) != 0x80 || 751 (s[2] & 0xc0) != 0x80) { 752 errmsg = "invalid data"; 753 goto utf8Error; 754 } 755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 757 errmsg = "illegal encoding"; 758 goto utf8Error; 759 } 760 else 761 *p++ = (Py_UNICODE)ch; 762 break; 763 764 case 4: 765 if ((s[1] & 0xc0) != 0x80 || 766 (s[2] & 0xc0) != 0x80 || 767 (s[3] & 0xc0) != 0x80) { 768 errmsg = "invalid data"; 769 goto utf8Error; 770 } 771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 773 /* validate and convert to UTF-16 */ 774 if ((ch < 0x10000) || /* minimum value allowed for 4 775 byte encoding */ 776 (ch > 0x10ffff)) { /* maximum value allowed for 777 UTF-16 */ 778 errmsg = "illegal encoding"; 779 goto utf8Error; 780 } 781 /* compute and append the two surrogates: */ 782 783 /* translate from 10000..10FFFF to 0..FFFF */ 784 ch -= 0x10000; 785 786 /* high surrogate = top 10 bits added to D800 */ 787 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 788 789 /* low surrogate = bottom 10 bits added to DC00 */ 790 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00)); 791 break; 792 793 default: 794 /* 
Other sizes are only needed for UCS-4 */ 795 errmsg = "unsupported Unicode code range"; 796 goto utf8Error; 797 } 798 s += n; 799 continue; 800 801 utf8Error: 802 if (utf8_decoding_error(&s, &p, errors, errmsg)) 803 goto onError; 804 } 805 806 /* Adjust length */ 807 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 808 goto onError; 809 810 return (PyObject *)unicode; 811 812onError: 813 Py_DECREF(unicode); 814 return NULL; 815} 816 817/* Not used anymore, now that the encoder supports UTF-16 818 surrogates. */ 819#if 0 820static 821int utf8_encoding_error(const Py_UNICODE **source, 822 char **dest, 823 const char *errors, 824 const char *details) 825{ 826 if ((errors == NULL) || 827 (strcmp(errors,"strict") == 0)) { 828 PyErr_Format(PyExc_UnicodeError, 829 "UTF-8 encoding error: %.400s", 830 details); 831 return -1; 832 } 833 else if (strcmp(errors,"ignore") == 0) { 834 return 0; 835 } 836 else if (strcmp(errors,"replace") == 0) { 837 **dest = '?'; 838 (*dest)++; 839 return 0; 840 } 841 else { 842 PyErr_Format(PyExc_ValueError, 843 "UTF-8 encoding error; " 844 "unknown error handling code: %.400s", 845 errors); 846 return -1; 847 } 848} 849#endif 850 851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 852 int size, 853 const char *errors) 854{ 855 PyObject *v; 856 char *p; 857 char *q; 858 Py_UCS4 ch2; 859 unsigned int cbAllocated = 3 * size; 860 unsigned int cbWritten = 0; 861 int i = 0; 862 863 v = PyString_FromStringAndSize(NULL, cbAllocated); 864 if (v == NULL) 865 return NULL; 866 if (size == 0) 867 return v; 868 869 p = q = PyString_AS_STRING(v); 870 while (i < size) { 871 Py_UCS4 ch = s[i++]; 872 if (ch < 0x80) { 873 *p++ = (char) ch; 874 cbWritten++; 875 } 876 else if (ch < 0x0800) { 877 *p++ = 0xc0 | (ch >> 6); 878 *p++ = 0x80 | (ch & 0x3f); 879 cbWritten += 2; 880 } 881 else { 882 /* Check for high surrogate */ 883 if (0xD800 <= ch && ch <= 0xDBFF) { 884 if (i != size) { 885 ch2 = s[i]; 886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 887 888 if (cbWritten 
>= (cbAllocated - 4)) { 889 /* Provide enough room for some more 890 surrogates */ 891 cbAllocated += 4*10; 892 if (_PyString_Resize(&v, cbAllocated)) 893 goto onError; 894 } 895 896 /* combine the two values */ 897 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 898 899 *p++ = (char)((ch >> 18) | 0xf0); 900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 901 i++; 902 cbWritten += 4; 903 } 904 } 905 } 906 else { 907 *p++ = (char)(0xe0 | (ch >> 12)); 908 cbWritten += 3; 909 } 910 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 911 *p++ = (char)(0x80 | (ch & 0x3f)); 912 } 913 } 914 *p = '\0'; 915 if (_PyString_Resize(&v, p - q)) 916 goto onError; 917 return v; 918 919 onError: 920 Py_DECREF(v); 921 return NULL; 922} 923 924PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 925{ 926 if (!PyUnicode_Check(unicode)) { 927 PyErr_BadArgument(); 928 return NULL; 929 } 930 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 931 PyUnicode_GET_SIZE(unicode), 932 NULL); 933} 934 935/* --- UTF-16 Codec ------------------------------------------------------- */ 936 937static 938int utf16_decoding_error(const Py_UNICODE **source, 939 Py_UNICODE **dest, 940 const char *errors, 941 const char *details) 942{ 943 if ((errors == NULL) || 944 (strcmp(errors,"strict") == 0)) { 945 PyErr_Format(PyExc_UnicodeError, 946 "UTF-16 decoding error: %.400s", 947 details); 948 return -1; 949 } 950 else if (strcmp(errors,"ignore") == 0) { 951 return 0; 952 } 953 else if (strcmp(errors,"replace") == 0) { 954 if (dest) { 955 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 956 (*dest)++; 957 } 958 return 0; 959 } 960 else { 961 PyErr_Format(PyExc_ValueError, 962 "UTF-16 decoding error; " 963 "unknown error handling code: %.400s", 964 errors); 965 return -1; 966 } 967} 968 969PyObject *PyUnicode_DecodeUTF16(const char *s, 970 int size, 971 const char *errors, 972 int *byteorder) 973{ 974 PyUnicodeObject *unicode; 975 Py_UNICODE *p; 976 const Py_UNICODE *q, *e; 977 int bo = 0; 978 const char *errmsg = ""; 979 
980 /* size should be an even number */ 981 if (size % sizeof(Py_UNICODE) != 0) { 982 if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) 983 return NULL; 984 /* The remaining input chars are ignored if we fall through 985 here... */ 986 } 987 988 /* Note: size will always be longer than the resulting Unicode 989 character count */ 990 unicode = _PyUnicode_New(size); 991 if (!unicode) 992 return NULL; 993 if (size == 0) 994 return (PyObject *)unicode; 995 996 /* Unpack UTF-16 encoded data */ 997 p = unicode->str; 998 q = (Py_UNICODE *)s; 999 e = q + (size / sizeof(Py_UNICODE)); 1000 1001 if (byteorder) 1002 bo = *byteorder; 1003 1004 /* Check for BOM marks (U+FEFF) in the input and adjust current 1005 byte order setting accordingly. In native mode, the leading BOM 1006 mark is skipped, in all other modes, it is copied to the output 1007 stream as-is (giving a ZWNBSP character). */ 1008 if (bo == 0) { 1009#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1010 if (*q == 0xFEFF) { 1011 q++; 1012 bo = -1; 1013 } else if (*q == 0xFFFE) { 1014 q++; 1015 bo = 1; 1016 } 1017#else 1018 if (*q == 0xFEFF) { 1019 q++; 1020 bo = 1; 1021 } else if (*q == 0xFFFE) { 1022 q++; 1023 bo = -1; 1024 } 1025#endif 1026 } 1027 1028 while (q < e) { 1029 register Py_UNICODE ch = *q++; 1030 1031 /* Swap input bytes if needed. (This assumes 1032 sizeof(Py_UNICODE) == 2 !) */ 1033#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1034 if (bo == 1) 1035 ch = (ch >> 8) | (ch << 8); 1036#else 1037 if (bo == -1) 1038 ch = (ch >> 8) | (ch << 8); 1039#endif 1040 if (ch < 0xD800 || ch > 0xDFFF) { 1041 *p++ = ch; 1042 continue; 1043 } 1044 1045 /* UTF-16 code pair: */ 1046 if (q >= e) { 1047 errmsg = "unexpected end of data"; 1048 goto utf16Error; 1049 } 1050 if (0xDC00 <= *q && *q <= 0xDFFF) { 1051 q++; 1052 if (0xD800 <= *q && *q <= 0xDBFF) { 1053 /* This is valid data (a UTF-16 surrogate pair), but 1054 we are not able to store this information since our 1055 Py_UNICODE type only has 16 bits... 
this might 1056 change someday, even though it's unlikely. */ 1057 errmsg = "code pairs are not supported"; 1058 goto utf16Error; 1059 } 1060 else 1061 continue; 1062 } 1063 errmsg = "illegal encoding"; 1064 /* Fall through to report the error */ 1065 1066 utf16Error: 1067 if (utf16_decoding_error(&q, &p, errors, errmsg)) 1068 goto onError; 1069 } 1070 1071 if (byteorder) 1072 *byteorder = bo; 1073 1074 /* Adjust length */ 1075 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1076 goto onError; 1077 1078 return (PyObject *)unicode; 1079 1080onError: 1081 Py_DECREF(unicode); 1082 return NULL; 1083} 1084 1085#undef UTF16_ERROR 1086 1087PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1088 int size, 1089 const char *errors, 1090 int byteorder) 1091{ 1092 PyObject *v; 1093 Py_UNICODE *p; 1094 char *q; 1095 1096 /* We don't create UTF-16 pairs... */ 1097 v = PyString_FromStringAndSize(NULL, 1098 sizeof(Py_UNICODE) * (size + (byteorder == 0))); 1099 if (v == NULL) 1100 return NULL; 1101 1102 q = PyString_AS_STRING(v); 1103 p = (Py_UNICODE *)q; 1104 if (byteorder == 0) 1105 *p++ = 0xFEFF; 1106 if (size == 0) 1107 return v; 1108 if (byteorder == 0 || 1109#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1110 byteorder == -1 1111#else 1112 byteorder == 1 1113#endif 1114 ) 1115 Py_UNICODE_COPY(p, s, size); 1116 else 1117 while (size-- > 0) { 1118 Py_UNICODE ch = *s++; 1119 *p++ = (ch >> 8) | (ch << 8); 1120 } 1121 return v; 1122} 1123 1124PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1125{ 1126 if (!PyUnicode_Check(unicode)) { 1127 PyErr_BadArgument(); 1128 return NULL; 1129 } 1130 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1131 PyUnicode_GET_SIZE(unicode), 1132 NULL, 1133 0); 1134} 1135 1136/* --- Unicode Escape Codec ----------------------------------------------- */ 1137 1138static 1139int unicodeescape_decoding_error(const char **source, 1140 Py_UNICODE *x, 1141 const char *errors, 1142 const char *details) 1143{ 1144 if ((errors == NULL) || 1145 
(strcmp(errors,"strict") == 0)) { 1146 PyErr_Format(PyExc_UnicodeError, 1147 "Unicode-Escape decoding error: %.400s", 1148 details); 1149 return -1; 1150 } 1151 else if (strcmp(errors,"ignore") == 0) { 1152 return 0; 1153 } 1154 else if (strcmp(errors,"replace") == 0) { 1155 *x = Py_UNICODE_REPLACEMENT_CHARACTER; 1156 return 0; 1157 } 1158 else { 1159 PyErr_Format(PyExc_ValueError, 1160 "Unicode-Escape decoding error; " 1161 "unknown error handling code: %.400s", 1162 errors); 1163 return -1; 1164 } 1165} 1166 1167static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1168 1169PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1170 int size, 1171 const char *errors) 1172{ 1173 PyUnicodeObject *v; 1174 Py_UNICODE *p, *buf; 1175 const char *end; 1176 char* message; 1177 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1178 1179 /* Escaped strings will always be longer than the resulting 1180 Unicode string, so we start with size here and then reduce the 1181 length after conversion to the true value. 
*/ 1182 v = _PyUnicode_New(size); 1183 if (v == NULL) 1184 goto onError; 1185 if (size == 0) 1186 return (PyObject *)v; 1187 1188 p = buf = PyUnicode_AS_UNICODE(v); 1189 end = s + size; 1190 1191 while (s < end) { 1192 unsigned char c; 1193 Py_UNICODE x; 1194 int i, digits; 1195 1196 /* Non-escape characters are interpreted as Unicode ordinals */ 1197 if (*s != '\\') { 1198 *p++ = (unsigned char) *s++; 1199 continue; 1200 } 1201 1202 /* \ - Escapes */ 1203 s++; 1204 switch (*s++) { 1205 1206 /* \x escapes */ 1207 case '\n': break; 1208 case '\\': *p++ = '\\'; break; 1209 case '\'': *p++ = '\''; break; 1210 case '\"': *p++ = '\"'; break; 1211 case 'b': *p++ = '\b'; break; 1212 case 'f': *p++ = '\014'; break; /* FF */ 1213 case 't': *p++ = '\t'; break; 1214 case 'n': *p++ = '\n'; break; 1215 case 'r': *p++ = '\r'; break; 1216 case 'v': *p++ = '\013'; break; /* VT */ 1217 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1218 1219 /* \OOO (octal) escapes */ 1220 case '0': case '1': case '2': case '3': 1221 case '4': case '5': case '6': case '7': 1222 x = s[-1] - '0'; 1223 if ('0' <= *s && *s <= '7') { 1224 x = (x<<3) + *s++ - '0'; 1225 if ('0' <= *s && *s <= '7') 1226 x = (x<<3) + *s++ - '0'; 1227 } 1228 *p++ = x; 1229 break; 1230 1231 /* hex escapes */ 1232 /* \xXX */ 1233 case 'x': 1234 digits = 2; 1235 message = "truncated \\xXX escape"; 1236 goto hexescape; 1237 1238 /* \uXXXX */ 1239 case 'u': 1240 digits = 4; 1241 message = "truncated \\uXXXX escape"; 1242 goto hexescape; 1243 1244 /* \UXXXXXXXX */ 1245 case 'U': 1246 digits = 8; 1247 message = "truncated \\UXXXXXXXX escape"; 1248 hexescape: 1249 chr = 0; 1250 for (i = 0; i < digits; i++) { 1251 c = (unsigned char) s[i]; 1252 if (!isxdigit(c)) { 1253 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1254 goto onError; 1255 chr = x; 1256 i++; 1257 break; 1258 } 1259 chr = (chr<<4) & ~0xF; 1260 if (c >= '0' && c <= '9') 1261 chr += c - '0'; 1262 else if (c >= 'a' && c <= 'f') 1263 chr += 10 + c - 
'a'; 1264 else 1265 chr += 10 + c - 'A'; 1266 } 1267 s += i; 1268 store: 1269 /* when we get here, chr is a 32-bit unicode character */ 1270 if (chr <= 0xffff) 1271 /* UCS-2 character */ 1272 *p++ = (Py_UNICODE) chr; 1273 else if (chr <= 0x10ffff) { 1274 /* UCS-4 character. store as two surrogate characters */ 1275 chr -= 0x10000L; 1276 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1277 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00); 1278 } else { 1279 if (unicodeescape_decoding_error( 1280 &s, &x, errors, 1281 "illegal Unicode character") 1282 ) 1283 goto onError; 1284 *p++ = x; /* store replacement character */ 1285 } 1286 break; 1287 1288 /* \N{name} */ 1289 case 'N': 1290 message = "malformed \\N character escape"; 1291 if (ucnhash_CAPI == NULL) { 1292 /* load the unicode data module */ 1293 PyObject *m, *v; 1294 m = PyImport_ImportModule("unicodedata"); 1295 if (m == NULL) 1296 goto ucnhashError; 1297 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1298 Py_DECREF(m); 1299 if (v == NULL) 1300 goto ucnhashError; 1301 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1302 Py_DECREF(v); 1303 if (ucnhash_CAPI == NULL) 1304 goto ucnhashError; 1305 } 1306 if (*s == '{') { 1307 const char *start = s+1; 1308 /* look for the closing brace */ 1309 while (*s != '}' && s < end) 1310 s++; 1311 if (s > start && s < end && *s == '}') { 1312 /* found a name. 
look it up in the unicode database */ 1313 message = "unknown Unicode character name"; 1314 s++; 1315 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1316 goto store; 1317 } 1318 } 1319 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1320 goto onError; 1321 *p++ = x; 1322 break; 1323 1324 default: 1325 *p++ = '\\'; 1326 *p++ = (unsigned char)s[-1]; 1327 break; 1328 } 1329 } 1330 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1331 goto onError; 1332 return (PyObject *)v; 1333 1334ucnhashError: 1335 PyErr_SetString( 1336 PyExc_UnicodeError, 1337 "\\N escapes not supported (can't load unicodedata module)" 1338 ); 1339 return NULL; 1340 1341onError: 1342 Py_XDECREF(v); 1343 return NULL; 1344} 1345 1346/* Return a Unicode-Escape string version of the Unicode object. 1347 1348 If quotes is true, the string is enclosed in u"" or u'' quotes as 1349 appropriate. 1350 1351*/ 1352 1353static const Py_UNICODE *findchar(const Py_UNICODE *s, 1354 int size, 1355 Py_UNICODE ch); 1356 1357static 1358PyObject *unicodeescape_string(const Py_UNICODE *s, 1359 int size, 1360 int quotes) 1361{ 1362 PyObject *repr; 1363 char *p; 1364 char *q; 1365 1366 static const char *hexdigit = "0123456789abcdef"; 1367 1368 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1369 if (repr == NULL) 1370 return NULL; 1371 1372 p = q = PyString_AS_STRING(repr); 1373 1374 if (quotes) { 1375 *p++ = 'u'; 1376 *p++ = (findchar(s, size, '\'') && 1377 !findchar(s, size, '"')) ? 
'"' : '\''; 1378 } 1379 while (size-- > 0) { 1380 Py_UNICODE ch = *s++; 1381 /* Escape quotes */ 1382 if (quotes && (ch == q[1] || ch == '\\')) { 1383 *p++ = '\\'; 1384 *p++ = (char) ch; 1385 } 1386 /* Map 16-bit characters to '\uxxxx' */ 1387 else if (ch >= 256) { 1388 *p++ = '\\'; 1389 *p++ = 'u'; 1390 *p++ = hexdigit[(ch >> 12) & 0xf]; 1391 *p++ = hexdigit[(ch >> 8) & 0xf]; 1392 *p++ = hexdigit[(ch >> 4) & 0xf]; 1393 *p++ = hexdigit[ch & 15]; 1394 } 1395 /* Map special whitespace to '\t', \n', '\r' */ 1396 else if (ch == '\t') { 1397 *p++ = '\\'; 1398 *p++ = 't'; 1399 } 1400 else if (ch == '\n') { 1401 *p++ = '\\'; 1402 *p++ = 'n'; 1403 } 1404 else if (ch == '\r') { 1405 *p++ = '\\'; 1406 *p++ = 'r'; 1407 } 1408 /* Map non-printable US ASCII to '\xhh' */ 1409 else if (ch < ' ' || ch >= 128) { 1410 *p++ = '\\'; 1411 *p++ = 'x'; 1412 *p++ = hexdigit[(ch >> 4) & 0xf]; 1413 *p++ = hexdigit[ch & 15]; 1414 } 1415 /* Copy everything else as-is */ 1416 else 1417 *p++ = (char) ch; 1418 } 1419 if (quotes) 1420 *p++ = q[1]; 1421 1422 *p = '\0'; 1423 if (_PyString_Resize(&repr, p - q)) 1424 goto onError; 1425 1426 return repr; 1427 1428 onError: 1429 Py_DECREF(repr); 1430 return NULL; 1431} 1432 1433PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1434 int size) 1435{ 1436 return unicodeescape_string(s, size, 0); 1437} 1438 1439PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1440{ 1441 if (!PyUnicode_Check(unicode)) { 1442 PyErr_BadArgument(); 1443 return NULL; 1444 } 1445 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1446 PyUnicode_GET_SIZE(unicode)); 1447} 1448 1449/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1450 1451PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1452 int size, 1453 const char *errors) 1454{ 1455 PyUnicodeObject *v; 1456 Py_UNICODE *p, *buf; 1457 const char *end; 1458 const char *bs; 1459 1460 /* Escaped strings will always be longer than the resulting 1461 
Unicode string, so we start with size here and then reduce the 1462 length after conversion to the true value. */ 1463 v = _PyUnicode_New(size); 1464 if (v == NULL) 1465 goto onError; 1466 if (size == 0) 1467 return (PyObject *)v; 1468 p = buf = PyUnicode_AS_UNICODE(v); 1469 end = s + size; 1470 while (s < end) { 1471 unsigned char c; 1472 Py_UNICODE x; 1473 int i; 1474 1475 /* Non-escape characters are interpreted as Unicode ordinals */ 1476 if (*s != '\\') { 1477 *p++ = (unsigned char)*s++; 1478 continue; 1479 } 1480 1481 /* \u-escapes are only interpreted iff the number of leading 1482 backslashes if odd */ 1483 bs = s; 1484 for (;s < end;) { 1485 if (*s != '\\') 1486 break; 1487 *p++ = (unsigned char)*s++; 1488 } 1489 if (((s - bs) & 1) == 0 || 1490 s >= end || 1491 *s != 'u') { 1492 continue; 1493 } 1494 p--; 1495 s++; 1496 1497 /* \uXXXX with 4 hex digits */ 1498 for (x = 0, i = 0; i < 4; i++) { 1499 c = (unsigned char)s[i]; 1500 if (!isxdigit(c)) { 1501 if (unicodeescape_decoding_error(&s, &x, errors, 1502 "truncated \\uXXXX")) 1503 goto onError; 1504 i++; 1505 break; 1506 } 1507 x = (x<<4) & ~0xF; 1508 if (c >= '0' && c <= '9') 1509 x += c - '0'; 1510 else if (c >= 'a' && c <= 'f') 1511 x += 10 + c - 'a'; 1512 else 1513 x += 10 + c - 'A'; 1514 } 1515 s += i; 1516 *p++ = x; 1517 } 1518 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1519 goto onError; 1520 return (PyObject *)v; 1521 1522 onError: 1523 Py_XDECREF(v); 1524 return NULL; 1525} 1526 1527PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1528 int size) 1529{ 1530 PyObject *repr; 1531 char *p; 1532 char *q; 1533 1534 static const char *hexdigit = "0123456789abcdef"; 1535 1536 repr = PyString_FromStringAndSize(NULL, 6 * size); 1537 if (repr == NULL) 1538 return NULL; 1539 if (size == 0) 1540 return repr; 1541 1542 p = q = PyString_AS_STRING(repr); 1543 while (size-- > 0) { 1544 Py_UNICODE ch = *s++; 1545 /* Map 16-bit characters to '\uxxxx' */ 1546 if (ch >= 256) { 1547 *p++ = '\\'; 1548 
*p++ = 'u'; 1549 *p++ = hexdigit[(ch >> 12) & 0xf]; 1550 *p++ = hexdigit[(ch >> 8) & 0xf]; 1551 *p++ = hexdigit[(ch >> 4) & 0xf]; 1552 *p++ = hexdigit[ch & 15]; 1553 } 1554 /* Copy everything else as-is */ 1555 else 1556 *p++ = (char) ch; 1557 } 1558 *p = '\0'; 1559 if (_PyString_Resize(&repr, p - q)) 1560 goto onError; 1561 1562 return repr; 1563 1564 onError: 1565 Py_DECREF(repr); 1566 return NULL; 1567} 1568 1569PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1570{ 1571 if (!PyUnicode_Check(unicode)) { 1572 PyErr_BadArgument(); 1573 return NULL; 1574 } 1575 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1576 PyUnicode_GET_SIZE(unicode)); 1577} 1578 1579/* --- Latin-1 Codec ------------------------------------------------------ */ 1580 1581PyObject *PyUnicode_DecodeLatin1(const char *s, 1582 int size, 1583 const char *errors) 1584{ 1585 PyUnicodeObject *v; 1586 Py_UNICODE *p; 1587 1588 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 1589 if (size == 1 && *(unsigned char*)s < 256) { 1590 Py_UNICODE r = *(unsigned char*)s; 1591 return PyUnicode_FromUnicode(&r, 1); 1592 } 1593 1594 v = _PyUnicode_New(size); 1595 if (v == NULL) 1596 goto onError; 1597 if (size == 0) 1598 return (PyObject *)v; 1599 p = PyUnicode_AS_UNICODE(v); 1600 while (size-- > 0) 1601 *p++ = (unsigned char)*s++; 1602 return (PyObject *)v; 1603 1604 onError: 1605 Py_XDECREF(v); 1606 return NULL; 1607} 1608 1609static 1610int latin1_encoding_error(const Py_UNICODE **source, 1611 char **dest, 1612 const char *errors, 1613 const char *details) 1614{ 1615 if ((errors == NULL) || 1616 (strcmp(errors,"strict") == 0)) { 1617 PyErr_Format(PyExc_UnicodeError, 1618 "Latin-1 encoding error: %.400s", 1619 details); 1620 return -1; 1621 } 1622 else if (strcmp(errors,"ignore") == 0) { 1623 return 0; 1624 } 1625 else if (strcmp(errors,"replace") == 0) { 1626 **dest = '?'; 1627 (*dest)++; 1628 return 0; 1629 } 1630 else { 1631 PyErr_Format(PyExc_ValueError, 1632 "Latin-1 encoding error; " 1633 "unknown error handling code: %.400s", 1634 errors); 1635 return -1; 1636 } 1637} 1638 1639PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 1640 int size, 1641 const char *errors) 1642{ 1643 PyObject *repr; 1644 char *s, *start; 1645 1646 repr = PyString_FromStringAndSize(NULL, size); 1647 if (repr == NULL) 1648 return NULL; 1649 if (size == 0) 1650 return repr; 1651 1652 s = PyString_AS_STRING(repr); 1653 start = s; 1654 while (size-- > 0) { 1655 Py_UNICODE ch = *p++; 1656 if (ch >= 256) { 1657 if (latin1_encoding_error(&p, &s, errors, 1658 "ordinal not in range(256)")) 1659 goto onError; 1660 } 1661 else 1662 *s++ = (char)ch; 1663 } 1664 /* Resize if error handling skipped some characters */ 1665 if (s - start < PyString_GET_SIZE(repr)) 1666 if (_PyString_Resize(&repr, s - start)) 1667 goto onError; 1668 return repr; 1669 1670 onError: 1671 Py_DECREF(repr); 1672 return NULL; 1673} 1674 1675PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 
1676{ 1677 if (!PyUnicode_Check(unicode)) { 1678 PyErr_BadArgument(); 1679 return NULL; 1680 } 1681 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1682 PyUnicode_GET_SIZE(unicode), 1683 NULL); 1684} 1685 1686/* --- 7-bit ASCII Codec -------------------------------------------------- */ 1687 1688static 1689int ascii_decoding_error(const char **source, 1690 Py_UNICODE **dest, 1691 const char *errors, 1692 const char *details) 1693{ 1694 if ((errors == NULL) || 1695 (strcmp(errors,"strict") == 0)) { 1696 PyErr_Format(PyExc_UnicodeError, 1697 "ASCII decoding error: %.400s", 1698 details); 1699 return -1; 1700 } 1701 else if (strcmp(errors,"ignore") == 0) { 1702 return 0; 1703 } 1704 else if (strcmp(errors,"replace") == 0) { 1705 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1706 (*dest)++; 1707 return 0; 1708 } 1709 else { 1710 PyErr_Format(PyExc_ValueError, 1711 "ASCII decoding error; " 1712 "unknown error handling code: %.400s", 1713 errors); 1714 return -1; 1715 } 1716} 1717 1718PyObject *PyUnicode_DecodeASCII(const char *s, 1719 int size, 1720 const char *errors) 1721{ 1722 PyUnicodeObject *v; 1723 Py_UNICODE *p; 1724 1725 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 1726 if (size == 1 && *(unsigned char*)s < 128) { 1727 Py_UNICODE r = *(unsigned char*)s; 1728 return PyUnicode_FromUnicode(&r, 1); 1729 } 1730 1731 v = _PyUnicode_New(size); 1732 if (v == NULL) 1733 goto onError; 1734 if (size == 0) 1735 return (PyObject *)v; 1736 p = PyUnicode_AS_UNICODE(v); 1737 while (size-- > 0) { 1738 register unsigned char c; 1739 1740 c = (unsigned char)*s++; 1741 if (c < 128) 1742 *p++ = c; 1743 else if (ascii_decoding_error(&s, &p, errors, 1744 "ordinal not in range(128)")) 1745 goto onError; 1746 } 1747 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 1748 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1749 goto onError; 1750 return (PyObject *)v; 1751 1752 onError: 1753 Py_XDECREF(v); 1754 return NULL; 1755} 1756 1757static 1758int ascii_encoding_error(const Py_UNICODE **source, 1759 char **dest, 1760 const char *errors, 1761 const char *details) 1762{ 1763 if ((errors == NULL) || 1764 (strcmp(errors,"strict") == 0)) { 1765 PyErr_Format(PyExc_UnicodeError, 1766 "ASCII encoding error: %.400s", 1767 details); 1768 return -1; 1769 } 1770 else if (strcmp(errors,"ignore") == 0) { 1771 return 0; 1772 } 1773 else if (strcmp(errors,"replace") == 0) { 1774 **dest = '?'; 1775 (*dest)++; 1776 return 0; 1777 } 1778 else { 1779 PyErr_Format(PyExc_ValueError, 1780 "ASCII encoding error; " 1781 "unknown error handling code: %.400s", 1782 errors); 1783 return -1; 1784 } 1785} 1786 1787PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 1788 int size, 1789 const char *errors) 1790{ 1791 PyObject *repr; 1792 char *s, *start; 1793 1794 repr = PyString_FromStringAndSize(NULL, size); 1795 if (repr == NULL) 1796 return NULL; 1797 if (size == 0) 1798 return repr; 1799 1800 s = PyString_AS_STRING(repr); 1801 start = s; 1802 while (size-- > 0) { 1803 Py_UNICODE ch = *p++; 1804 if (ch >= 128) { 1805 if (ascii_encoding_error(&p, &s, errors, 1806 "ordinal not in range(128)")) 1807 goto onError; 1808 } 1809 else 1810 *s++ = (char)ch; 1811 
} 1812 /* Resize if error handling skipped some characters */ 1813 if (s - start < PyString_GET_SIZE(repr)) 1814 if (_PyString_Resize(&repr, s - start)) 1815 goto onError; 1816 return repr; 1817 1818 onError: 1819 Py_DECREF(repr); 1820 return NULL; 1821} 1822 1823PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 1824{ 1825 if (!PyUnicode_Check(unicode)) { 1826 PyErr_BadArgument(); 1827 return NULL; 1828 } 1829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1830 PyUnicode_GET_SIZE(unicode), 1831 NULL); 1832} 1833 1834#ifdef MS_WIN32 1835 1836/* --- MBCS codecs for Windows -------------------------------------------- */ 1837 1838PyObject *PyUnicode_DecodeMBCS(const char *s, 1839 int size, 1840 const char *errors) 1841{ 1842 PyUnicodeObject *v; 1843 Py_UNICODE *p; 1844 1845 /* First get the size of the result */ 1846 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 1847 if (size > 0 && usize==0) 1848 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1849 1850 v = _PyUnicode_New(usize); 1851 if (v == NULL) 1852 return NULL; 1853 if (usize == 0) 1854 return (PyObject *)v; 1855 p = PyUnicode_AS_UNICODE(v); 1856 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 1857 Py_DECREF(v); 1858 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1859 } 1860 1861 return (PyObject *)v; 1862} 1863 1864PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 1865 int size, 1866 const char *errors) 1867{ 1868 PyObject *repr; 1869 char *s; 1870 DWORD mbcssize; 1871 1872 /* If there are no characters, bail now! 
*/ 1873 if (size==0) 1874 return PyString_FromString(""); 1875 1876 /* First get the size of the result */ 1877 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 1878 if (mbcssize==0) 1879 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1880 1881 repr = PyString_FromStringAndSize(NULL, mbcssize); 1882 if (repr == NULL) 1883 return NULL; 1884 if (mbcssize == 0) 1885 return repr; 1886 1887 /* Do the conversion */ 1888 s = PyString_AS_STRING(repr); 1889 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 1890 Py_DECREF(repr); 1891 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1892 } 1893 return repr; 1894} 1895 1896#endif /* MS_WIN32 */ 1897 1898/* --- Character Mapping Codec -------------------------------------------- */ 1899 1900static 1901int charmap_decoding_error(const char **source, 1902 Py_UNICODE **dest, 1903 const char *errors, 1904 const char *details) 1905{ 1906 if ((errors == NULL) || 1907 (strcmp(errors,"strict") == 0)) { 1908 PyErr_Format(PyExc_UnicodeError, 1909 "charmap decoding error: %.400s", 1910 details); 1911 return -1; 1912 } 1913 else if (strcmp(errors,"ignore") == 0) { 1914 return 0; 1915 } 1916 else if (strcmp(errors,"replace") == 0) { 1917 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1918 (*dest)++; 1919 return 0; 1920 } 1921 else { 1922 PyErr_Format(PyExc_ValueError, 1923 "charmap decoding error; " 1924 "unknown error handling code: %.400s", 1925 errors); 1926 return -1; 1927 } 1928} 1929 1930PyObject *PyUnicode_DecodeCharmap(const char *s, 1931 int size, 1932 PyObject *mapping, 1933 const char *errors) 1934{ 1935 PyUnicodeObject *v; 1936 Py_UNICODE *p; 1937 int extrachars = 0; 1938 1939 /* Default to Latin-1 */ 1940 if (mapping == NULL) 1941 return PyUnicode_DecodeLatin1(s, size, errors); 1942 1943 v = _PyUnicode_New(size); 1944 if (v == NULL) 1945 goto onError; 1946 if (size == 0) 1947 return (PyObject *)v; 1948 p = PyUnicode_AS_UNICODE(v); 1949 while (size-- > 0) { 1950 unsigned 
char ch = *s++; 1951 PyObject *w, *x; 1952 1953 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 1954 w = PyInt_FromLong((long)ch); 1955 if (w == NULL) 1956 goto onError; 1957 x = PyObject_GetItem(mapping, w); 1958 Py_DECREF(w); 1959 if (x == NULL) { 1960 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 1961 /* No mapping found means: mapping is undefined. */ 1962 PyErr_Clear(); 1963 x = Py_None; 1964 Py_INCREF(x); 1965 } else 1966 goto onError; 1967 } 1968 1969 /* Apply mapping */ 1970 if (PyInt_Check(x)) { 1971 long value = PyInt_AS_LONG(x); 1972 if (value < 0 || value > 65535) { 1973 PyErr_SetString(PyExc_TypeError, 1974 "character mapping must be in range(65536)"); 1975 Py_DECREF(x); 1976 goto onError; 1977 } 1978 *p++ = (Py_UNICODE)value; 1979 } 1980 else if (x == Py_None) { 1981 /* undefined mapping */ 1982 if (charmap_decoding_error(&s, &p, errors, 1983 "character maps to <undefined>")) { 1984 Py_DECREF(x); 1985 goto onError; 1986 } 1987 } 1988 else if (PyUnicode_Check(x)) { 1989 int targetsize = PyUnicode_GET_SIZE(x); 1990 1991 if (targetsize == 1) 1992 /* 1-1 mapping */ 1993 *p++ = *PyUnicode_AS_UNICODE(x); 1994 1995 else if (targetsize > 1) { 1996 /* 1-n mapping */ 1997 if (targetsize > extrachars) { 1998 /* resize first */ 1999 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2000 int needed = (targetsize - extrachars) + \ 2001 (targetsize << 2); 2002 extrachars += needed; 2003 if (_PyUnicode_Resize(&v, 2004 PyUnicode_GET_SIZE(v) + needed)) { 2005 Py_DECREF(x); 2006 goto onError; 2007 } 2008 p = PyUnicode_AS_UNICODE(v) + oldpos; 2009 } 2010 Py_UNICODE_COPY(p, 2011 PyUnicode_AS_UNICODE(x), 2012 targetsize); 2013 p += targetsize; 2014 extrachars -= targetsize; 2015 } 2016 /* 1-0 mapping: skip the character */ 2017 } 2018 else { 2019 /* wrong return value */ 2020 PyErr_SetString(PyExc_TypeError, 2021 "character mapping must return integer, None or unicode"); 2022 Py_DECREF(x); 2023 goto onError; 2024 } 2025 Py_DECREF(x); 2026 } 2027 if (p - 
PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2028 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2029 goto onError; 2030 return (PyObject *)v; 2031 2032 onError: 2033 Py_XDECREF(v); 2034 return NULL; 2035} 2036 2037static 2038int charmap_encoding_error(const Py_UNICODE **source, 2039 char **dest, 2040 const char *errors, 2041 const char *details) 2042{ 2043 if ((errors == NULL) || 2044 (strcmp(errors,"strict") == 0)) { 2045 PyErr_Format(PyExc_UnicodeError, 2046 "charmap encoding error: %.400s", 2047 details); 2048 return -1; 2049 } 2050 else if (strcmp(errors,"ignore") == 0) { 2051 return 0; 2052 } 2053 else if (strcmp(errors,"replace") == 0) { 2054 **dest = '?'; 2055 (*dest)++; 2056 return 0; 2057 } 2058 else { 2059 PyErr_Format(PyExc_ValueError, 2060 "charmap encoding error; " 2061 "unknown error handling code: %.400s", 2062 errors); 2063 return -1; 2064 } 2065} 2066 2067PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2068 int size, 2069 PyObject *mapping, 2070 const char *errors) 2071{ 2072 PyObject *v; 2073 char *s; 2074 int extrachars = 0; 2075 2076 /* Default to Latin-1 */ 2077 if (mapping == NULL) 2078 return PyUnicode_EncodeLatin1(p, size, errors); 2079 2080 v = PyString_FromStringAndSize(NULL, size); 2081 if (v == NULL) 2082 return NULL; 2083 if (size == 0) 2084 return v; 2085 s = PyString_AS_STRING(v); 2086 while (size-- > 0) { 2087 Py_UNICODE ch = *p++; 2088 PyObject *w, *x; 2089 2090 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2091 w = PyInt_FromLong((long)ch); 2092 if (w == NULL) 2093 goto onError; 2094 x = PyObject_GetItem(mapping, w); 2095 Py_DECREF(w); 2096 if (x == NULL) { 2097 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2098 /* No mapping found means: mapping is undefined. 
*/ 2099 PyErr_Clear(); 2100 x = Py_None; 2101 Py_INCREF(x); 2102 } else 2103 goto onError; 2104 } 2105 2106 /* Apply mapping */ 2107 if (PyInt_Check(x)) { 2108 long value = PyInt_AS_LONG(x); 2109 if (value < 0 || value > 255) { 2110 PyErr_SetString(PyExc_TypeError, 2111 "character mapping must be in range(256)"); 2112 Py_DECREF(x); 2113 goto onError; 2114 } 2115 *s++ = (char)value; 2116 } 2117 else if (x == Py_None) { 2118 /* undefined mapping */ 2119 if (charmap_encoding_error(&p, &s, errors, 2120 "character maps to <undefined>")) { 2121 Py_DECREF(x); 2122 goto onError; 2123 } 2124 } 2125 else if (PyString_Check(x)) { 2126 int targetsize = PyString_GET_SIZE(x); 2127 2128 if (targetsize == 1) 2129 /* 1-1 mapping */ 2130 *s++ = *PyString_AS_STRING(x); 2131 2132 else if (targetsize > 1) { 2133 /* 1-n mapping */ 2134 if (targetsize > extrachars) { 2135 /* resize first */ 2136 int oldpos = (int)(s - PyString_AS_STRING(v)); 2137 int needed = (targetsize - extrachars) + \ 2138 (targetsize << 2); 2139 extrachars += needed; 2140 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2141 Py_DECREF(x); 2142 goto onError; 2143 } 2144 s = PyString_AS_STRING(v) + oldpos; 2145 } 2146 memcpy(s, PyString_AS_STRING(x), targetsize); 2147 s += targetsize; 2148 extrachars -= targetsize; 2149 } 2150 /* 1-0 mapping: skip the character */ 2151 } 2152 else { 2153 /* wrong return value */ 2154 PyErr_SetString(PyExc_TypeError, 2155 "character mapping must return integer, None or unicode"); 2156 Py_DECREF(x); 2157 goto onError; 2158 } 2159 Py_DECREF(x); 2160 } 2161 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2162 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2163 goto onError; 2164 return v; 2165 2166 onError: 2167 Py_DECREF(v); 2168 return NULL; 2169} 2170 2171PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2172 PyObject *mapping) 2173{ 2174 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2175 PyErr_BadArgument(); 2176 return NULL; 2177 } 2178 return 
PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2179 PyUnicode_GET_SIZE(unicode), 2180 mapping, 2181 NULL); 2182} 2183 2184static 2185int translate_error(const Py_UNICODE **source, 2186 Py_UNICODE **dest, 2187 const char *errors, 2188 const char *details) 2189{ 2190 if ((errors == NULL) || 2191 (strcmp(errors,"strict") == 0)) { 2192 PyErr_Format(PyExc_UnicodeError, 2193 "translate error: %.400s", 2194 details); 2195 return -1; 2196 } 2197 else if (strcmp(errors,"ignore") == 0) { 2198 return 0; 2199 } 2200 else if (strcmp(errors,"replace") == 0) { 2201 **dest = '?'; 2202 (*dest)++; 2203 return 0; 2204 } 2205 else { 2206 PyErr_Format(PyExc_ValueError, 2207 "translate error; " 2208 "unknown error handling code: %.400s", 2209 errors); 2210 return -1; 2211 } 2212} 2213 2214PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2215 int size, 2216 PyObject *mapping, 2217 const char *errors) 2218{ 2219 PyUnicodeObject *v; 2220 Py_UNICODE *p; 2221 2222 if (mapping == NULL) { 2223 PyErr_BadArgument(); 2224 return NULL; 2225 } 2226 2227 /* Output will never be longer than input */ 2228 v = _PyUnicode_New(size); 2229 if (v == NULL) 2230 goto onError; 2231 if (size == 0) 2232 goto done; 2233 p = PyUnicode_AS_UNICODE(v); 2234 while (size-- > 0) { 2235 Py_UNICODE ch = *s++; 2236 PyObject *w, *x; 2237 2238 /* Get mapping */ 2239 w = PyInt_FromLong(ch); 2240 if (w == NULL) 2241 goto onError; 2242 x = PyObject_GetItem(mapping, w); 2243 Py_DECREF(w); 2244 if (x == NULL) { 2245 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2246 /* No mapping found: default to 1-1 mapping */ 2247 PyErr_Clear(); 2248 *p++ = ch; 2249 continue; 2250 } 2251 goto onError; 2252 } 2253 2254 /* Apply mapping */ 2255 if (PyInt_Check(x)) 2256 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2257 else if (x == Py_None) { 2258 /* undefined mapping */ 2259 if (translate_error(&s, &p, errors, 2260 "character maps to <undefined>")) { 2261 Py_DECREF(x); 2262 goto onError; 2263 } 2264 } 2265 else if 
(PyUnicode_Check(x)) { 2266 if (PyUnicode_GET_SIZE(x) != 1) { 2267 /* 1-n mapping */ 2268 PyErr_SetString(PyExc_NotImplementedError, 2269 "1-n mappings are currently not implemented"); 2270 Py_DECREF(x); 2271 goto onError; 2272 } 2273 *p++ = *PyUnicode_AS_UNICODE(x); 2274 } 2275 else { 2276 /* wrong return value */ 2277 PyErr_SetString(PyExc_TypeError, 2278 "translate mapping must return integer, None or unicode"); 2279 Py_DECREF(x); 2280 goto onError; 2281 } 2282 Py_DECREF(x); 2283 } 2284 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2285 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2286 goto onError; 2287 2288 done: 2289 return (PyObject *)v; 2290 2291 onError: 2292 Py_XDECREF(v); 2293 return NULL; 2294} 2295 2296PyObject *PyUnicode_Translate(PyObject *str, 2297 PyObject *mapping, 2298 const char *errors) 2299{ 2300 PyObject *result; 2301 2302 str = PyUnicode_FromObject(str); 2303 if (str == NULL) 2304 goto onError; 2305 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2306 PyUnicode_GET_SIZE(str), 2307 mapping, 2308 errors); 2309 Py_DECREF(str); 2310 return result; 2311 2312 onError: 2313 Py_XDECREF(str); 2314 return NULL; 2315} 2316 2317/* --- Decimal Encoder ---------------------------------------------------- */ 2318 2319int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2320 int length, 2321 char *output, 2322 const char *errors) 2323{ 2324 Py_UNICODE *p, *end; 2325 2326 if (output == NULL) { 2327 PyErr_BadArgument(); 2328 return -1; 2329 } 2330 2331 p = s; 2332 end = s + length; 2333 while (p < end) { 2334 register Py_UNICODE ch = *p++; 2335 int decimal; 2336 2337 if (Py_UNICODE_ISSPACE(ch)) { 2338 *output++ = ' '; 2339 continue; 2340 } 2341 decimal = Py_UNICODE_TODECIMAL(ch); 2342 if (decimal >= 0) { 2343 *output++ = '0' + decimal; 2344 continue; 2345 } 2346 if (0 < ch && ch < 256) { 2347 *output++ = (char)ch; 2348 continue; 2349 } 2350 /* All other characters are considered invalid */ 2351 if (errors == NULL || 
strcmp(errors, "strict") == 0) { 2352 PyErr_SetString(PyExc_ValueError, 2353 "invalid decimal Unicode string"); 2354 goto onError; 2355 } 2356 else if (strcmp(errors, "ignore") == 0) 2357 continue; 2358 else if (strcmp(errors, "replace") == 0) { 2359 *output++ = '?'; 2360 continue; 2361 } 2362 } 2363 /* 0-terminate the output string */ 2364 *output++ = '\0'; 2365 return 0; 2366 2367 onError: 2368 return -1; 2369} 2370 2371/* --- Helpers ------------------------------------------------------------ */ 2372 2373static 2374int count(PyUnicodeObject *self, 2375 int start, 2376 int end, 2377 PyUnicodeObject *substring) 2378{ 2379 int count = 0; 2380 2381 if (start < 0) 2382 start += self->length; 2383 if (start < 0) 2384 start = 0; 2385 if (end > self->length) 2386 end = self->length; 2387 if (end < 0) 2388 end += self->length; 2389 if (end < 0) 2390 end = 0; 2391 2392 if (substring->length == 0) 2393 return (end - start + 1); 2394 2395 end -= substring->length; 2396 2397 while (start <= end) 2398 if (Py_UNICODE_MATCH(self, start, substring)) { 2399 count++; 2400 start += substring->length; 2401 } else 2402 start++; 2403 2404 return count; 2405} 2406 2407int PyUnicode_Count(PyObject *str, 2408 PyObject *substr, 2409 int start, 2410 int end) 2411{ 2412 int result; 2413 2414 str = PyUnicode_FromObject(str); 2415 if (str == NULL) 2416 return -1; 2417 substr = PyUnicode_FromObject(substr); 2418 if (substr == NULL) { 2419 Py_DECREF(str); 2420 return -1; 2421 } 2422 2423 result = count((PyUnicodeObject *)str, 2424 start, end, 2425 (PyUnicodeObject *)substr); 2426 2427 Py_DECREF(str); 2428 Py_DECREF(substr); 2429 return result; 2430} 2431 2432static 2433int findstring(PyUnicodeObject *self, 2434 PyUnicodeObject *substring, 2435 int start, 2436 int end, 2437 int direction) 2438{ 2439 if (start < 0) 2440 start += self->length; 2441 if (start < 0) 2442 start = 0; 2443 2444 if (substring->length == 0) 2445 return start; 2446 2447 if (end > self->length) 2448 end = self->length; 
2449 if (end < 0) 2450 end += self->length; 2451 if (end < 0) 2452 end = 0; 2453 2454 end -= substring->length; 2455 2456 if (direction < 0) { 2457 for (; end >= start; end--) 2458 if (Py_UNICODE_MATCH(self, end, substring)) 2459 return end; 2460 } else { 2461 for (; start <= end; start++) 2462 if (Py_UNICODE_MATCH(self, start, substring)) 2463 return start; 2464 } 2465 2466 return -1; 2467} 2468 2469int PyUnicode_Find(PyObject *str, 2470 PyObject *substr, 2471 int start, 2472 int end, 2473 int direction) 2474{ 2475 int result; 2476 2477 str = PyUnicode_FromObject(str); 2478 if (str == NULL) 2479 return -1; 2480 substr = PyUnicode_FromObject(substr); 2481 if (substr == NULL) { 2482 Py_DECREF(substr); 2483 return -1; 2484 } 2485 2486 result = findstring((PyUnicodeObject *)str, 2487 (PyUnicodeObject *)substr, 2488 start, end, direction); 2489 Py_DECREF(str); 2490 Py_DECREF(substr); 2491 return result; 2492} 2493 2494static 2495int tailmatch(PyUnicodeObject *self, 2496 PyUnicodeObject *substring, 2497 int start, 2498 int end, 2499 int direction) 2500{ 2501 if (start < 0) 2502 start += self->length; 2503 if (start < 0) 2504 start = 0; 2505 2506 if (substring->length == 0) 2507 return 1; 2508 2509 if (end > self->length) 2510 end = self->length; 2511 if (end < 0) 2512 end += self->length; 2513 if (end < 0) 2514 end = 0; 2515 2516 end -= substring->length; 2517 if (end < start) 2518 return 0; 2519 2520 if (direction > 0) { 2521 if (Py_UNICODE_MATCH(self, end, substring)) 2522 return 1; 2523 } else { 2524 if (Py_UNICODE_MATCH(self, start, substring)) 2525 return 1; 2526 } 2527 2528 return 0; 2529} 2530 2531int PyUnicode_Tailmatch(PyObject *str, 2532 PyObject *substr, 2533 int start, 2534 int end, 2535 int direction) 2536{ 2537 int result; 2538 2539 str = PyUnicode_FromObject(str); 2540 if (str == NULL) 2541 return -1; 2542 substr = PyUnicode_FromObject(substr); 2543 if (substr == NULL) { 2544 Py_DECREF(substr); 2545 return -1; 2546 } 2547 2548 result = 
tailmatch((PyUnicodeObject *)str, 2549 (PyUnicodeObject *)substr, 2550 start, end, direction); 2551 Py_DECREF(str); 2552 Py_DECREF(substr); 2553 return result; 2554} 2555 2556static 2557const Py_UNICODE *findchar(const Py_UNICODE *s, 2558 int size, 2559 Py_UNICODE ch) 2560{ 2561 /* like wcschr, but doesn't stop at NULL characters */ 2562 2563 while (size-- > 0) { 2564 if (*s == ch) 2565 return s; 2566 s++; 2567 } 2568 2569 return NULL; 2570} 2571 2572/* Apply fixfct filter to the Unicode object self and return a 2573 reference to the modified object */ 2574 2575static 2576PyObject *fixup(PyUnicodeObject *self, 2577 int (*fixfct)(PyUnicodeObject *s)) 2578{ 2579 2580 PyUnicodeObject *u; 2581 2582 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 2583 if (u == NULL) 2584 return NULL; 2585 2586 Py_UNICODE_COPY(u->str, self->str, self->length); 2587 2588 if (!fixfct(u)) { 2589 /* fixfct should return TRUE if it modified the buffer. If 2590 FALSE, return a reference to the original buffer instead 2591 (to save space, not time) */ 2592 Py_INCREF(self); 2593 Py_DECREF(u); 2594 return (PyObject*) self; 2595 } 2596 return (PyObject*) u; 2597} 2598 2599static 2600int fixupper(PyUnicodeObject *self) 2601{ 2602 int len = self->length; 2603 Py_UNICODE *s = self->str; 2604 int status = 0; 2605 2606 while (len-- > 0) { 2607 register Py_UNICODE ch; 2608 2609 ch = Py_UNICODE_TOUPPER(*s); 2610 if (ch != *s) { 2611 status = 1; 2612 *s = ch; 2613 } 2614 s++; 2615 } 2616 2617 return status; 2618} 2619 2620static 2621int fixlower(PyUnicodeObject *self) 2622{ 2623 int len = self->length; 2624 Py_UNICODE *s = self->str; 2625 int status = 0; 2626 2627 while (len-- > 0) { 2628 register Py_UNICODE ch; 2629 2630 ch = Py_UNICODE_TOLOWER(*s); 2631 if (ch != *s) { 2632 status = 1; 2633 *s = ch; 2634 } 2635 s++; 2636 } 2637 2638 return status; 2639} 2640 2641static 2642int fixswapcase(PyUnicodeObject *self) 2643{ 2644 int len = self->length; 2645 Py_UNICODE *s = self->str; 2646 int 
status = 0; 2647 2648 while (len-- > 0) { 2649 if (Py_UNICODE_ISUPPER(*s)) { 2650 *s = Py_UNICODE_TOLOWER(*s); 2651 status = 1; 2652 } else if (Py_UNICODE_ISLOWER(*s)) { 2653 *s = Py_UNICODE_TOUPPER(*s); 2654 status = 1; 2655 } 2656 s++; 2657 } 2658 2659 return status; 2660} 2661 2662static 2663int fixcapitalize(PyUnicodeObject *self) 2664{ 2665 int len = self->length; 2666 Py_UNICODE *s = self->str; 2667 int status = 0; 2668 2669 if (len == 0) 2670 return 0; 2671 if (Py_UNICODE_ISLOWER(*s)) { 2672 *s = Py_UNICODE_TOUPPER(*s); 2673 status = 1; 2674 } 2675 s++; 2676 while (--len > 0) { 2677 if (Py_UNICODE_ISUPPER(*s)) { 2678 *s = Py_UNICODE_TOLOWER(*s); 2679 status = 1; 2680 } 2681 s++; 2682 } 2683 return status; 2684} 2685 2686static 2687int fixtitle(PyUnicodeObject *self) 2688{ 2689 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 2690 register Py_UNICODE *e; 2691 int previous_is_cased; 2692 2693 /* Shortcut for single character strings */ 2694 if (PyUnicode_GET_SIZE(self) == 1) { 2695 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 2696 if (*p != ch) { 2697 *p = ch; 2698 return 1; 2699 } 2700 else 2701 return 0; 2702 } 2703 2704 e = p + PyUnicode_GET_SIZE(self); 2705 previous_is_cased = 0; 2706 for (; p < e; p++) { 2707 register const Py_UNICODE ch = *p; 2708 2709 if (previous_is_cased) 2710 *p = Py_UNICODE_TOLOWER(ch); 2711 else 2712 *p = Py_UNICODE_TOTITLE(ch); 2713 2714 if (Py_UNICODE_ISLOWER(ch) || 2715 Py_UNICODE_ISUPPER(ch) || 2716 Py_UNICODE_ISTITLE(ch)) 2717 previous_is_cased = 1; 2718 else 2719 previous_is_cased = 0; 2720 } 2721 return 1; 2722} 2723 2724PyObject *PyUnicode_Join(PyObject *separator, 2725 PyObject *seq) 2726{ 2727 Py_UNICODE *sep; 2728 int seplen; 2729 PyUnicodeObject *res = NULL; 2730 int reslen = 0; 2731 Py_UNICODE *p; 2732 int sz = 100; 2733 int i; 2734 PyObject *it; 2735 2736 it = PyObject_GetIter(seq); 2737 if (it == NULL) 2738 return NULL; 2739 2740 if (separator == NULL) { 2741 Py_UNICODE blank = ' '; 2742 sep = ␣ 2743 seplen = 1; 2744 
} 2745 else { 2746 separator = PyUnicode_FromObject(separator); 2747 if (separator == NULL) 2748 goto onError; 2749 sep = PyUnicode_AS_UNICODE(separator); 2750 seplen = PyUnicode_GET_SIZE(separator); 2751 } 2752 2753 res = _PyUnicode_New(sz); 2754 if (res == NULL) 2755 goto onError; 2756 p = PyUnicode_AS_UNICODE(res); 2757 reslen = 0; 2758 2759 for (i = 0; ; ++i) { 2760 int itemlen; 2761 PyObject *item = PyIter_Next(it); 2762 if (item == NULL) { 2763 if (PyErr_Occurred()) 2764 goto onError; 2765 break; 2766 } 2767 if (!PyUnicode_Check(item)) { 2768 PyObject *v; 2769 v = PyUnicode_FromObject(item); 2770 Py_DECREF(item); 2771 item = v; 2772 if (item == NULL) 2773 goto onError; 2774 } 2775 itemlen = PyUnicode_GET_SIZE(item); 2776 while (reslen + itemlen + seplen >= sz) { 2777 if (_PyUnicode_Resize(&res, sz*2)) 2778 goto onError; 2779 sz *= 2; 2780 p = PyUnicode_AS_UNICODE(res) + reslen; 2781 } 2782 if (i > 0) { 2783 Py_UNICODE_COPY(p, sep, seplen); 2784 p += seplen; 2785 reslen += seplen; 2786 } 2787 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 2788 p += itemlen; 2789 reslen += itemlen; 2790 Py_DECREF(item); 2791 } 2792 if (_PyUnicode_Resize(&res, reslen)) 2793 goto onError; 2794 2795 Py_XDECREF(separator); 2796 Py_DECREF(it); 2797 return (PyObject *)res; 2798 2799 onError: 2800 Py_XDECREF(separator); 2801 Py_XDECREF(res); 2802 Py_DECREF(it); 2803 return NULL; 2804} 2805 2806static 2807PyUnicodeObject *pad(PyUnicodeObject *self, 2808 int left, 2809 int right, 2810 Py_UNICODE fill) 2811{ 2812 PyUnicodeObject *u; 2813 2814 if (left < 0) 2815 left = 0; 2816 if (right < 0) 2817 right = 0; 2818 2819 if (left == 0 && right == 0) { 2820 Py_INCREF(self); 2821 return self; 2822 } 2823 2824 u = _PyUnicode_New(left + self->length + right); 2825 if (u) { 2826 if (left) 2827 Py_UNICODE_FILL(u->str, fill, left); 2828 Py_UNICODE_COPY(u->str + left, self->str, self->length); 2829 if (right) 2830 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 2831 } 2832 2833 
return u; 2834} 2835 2836#define SPLIT_APPEND(data, left, right) \ 2837 str = PyUnicode_FromUnicode(data + left, right - left); \ 2838 if (!str) \ 2839 goto onError; \ 2840 if (PyList_Append(list, str)) { \ 2841 Py_DECREF(str); \ 2842 goto onError; \ 2843 } \ 2844 else \ 2845 Py_DECREF(str); 2846 2847static 2848PyObject *split_whitespace(PyUnicodeObject *self, 2849 PyObject *list, 2850 int maxcount) 2851{ 2852 register int i; 2853 register int j; 2854 int len = self->length; 2855 PyObject *str; 2856 2857 for (i = j = 0; i < len; ) { 2858 /* find a token */ 2859 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2860 i++; 2861 j = i; 2862 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 2863 i++; 2864 if (j < i) { 2865 if (maxcount-- <= 0) 2866 break; 2867 SPLIT_APPEND(self->str, j, i); 2868 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2869 i++; 2870 j = i; 2871 } 2872 } 2873 if (j < len) { 2874 SPLIT_APPEND(self->str, j, len); 2875 } 2876 return list; 2877 2878 onError: 2879 Py_DECREF(list); 2880 return NULL; 2881} 2882 2883PyObject *PyUnicode_Splitlines(PyObject *string, 2884 int keepends) 2885{ 2886 register int i; 2887 register int j; 2888 int len; 2889 PyObject *list; 2890 PyObject *str; 2891 Py_UNICODE *data; 2892 2893 string = PyUnicode_FromObject(string); 2894 if (string == NULL) 2895 return NULL; 2896 data = PyUnicode_AS_UNICODE(string); 2897 len = PyUnicode_GET_SIZE(string); 2898 2899 list = PyList_New(0); 2900 if (!list) 2901 goto onError; 2902 2903 for (i = j = 0; i < len; ) { 2904 int eol; 2905 2906 /* Find a line and append it */ 2907 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 2908 i++; 2909 2910 /* Skip the line break reading CRLF as one line break */ 2911 eol = i; 2912 if (i < len) { 2913 if (data[i] == '\r' && i + 1 < len && 2914 data[i+1] == '\n') 2915 i += 2; 2916 else 2917 i++; 2918 if (keepends) 2919 eol = i; 2920 } 2921 SPLIT_APPEND(data, j, eol); 2922 j = i; 2923 } 2924 if (j < len) { 2925 SPLIT_APPEND(data, j, len); 2926 
} 2927 2928 Py_DECREF(string); 2929 return list; 2930 2931 onError: 2932 Py_DECREF(list); 2933 Py_DECREF(string); 2934 return NULL; 2935} 2936 2937static 2938PyObject *split_char(PyUnicodeObject *self, 2939 PyObject *list, 2940 Py_UNICODE ch, 2941 int maxcount) 2942{ 2943 register int i; 2944 register int j; 2945 int len = self->length; 2946 PyObject *str; 2947 2948 for (i = j = 0; i < len; ) { 2949 if (self->str[i] == ch) { 2950 if (maxcount-- <= 0) 2951 break; 2952 SPLIT_APPEND(self->str, j, i); 2953 i = j = i + 1; 2954 } else 2955 i++; 2956 } 2957 if (j <= len) { 2958 SPLIT_APPEND(self->str, j, len); 2959 } 2960 return list; 2961 2962 onError: 2963 Py_DECREF(list); 2964 return NULL; 2965} 2966 2967static 2968PyObject *split_substring(PyUnicodeObject *self, 2969 PyObject *list, 2970 PyUnicodeObject *substring, 2971 int maxcount) 2972{ 2973 register int i; 2974 register int j; 2975 int len = self->length; 2976 int sublen = substring->length; 2977 PyObject *str; 2978 2979 for (i = j = 0; i <= len - sublen; ) { 2980 if (Py_UNICODE_MATCH(self, i, substring)) { 2981 if (maxcount-- <= 0) 2982 break; 2983 SPLIT_APPEND(self->str, j, i); 2984 i = j = i + sublen; 2985 } else 2986 i++; 2987 } 2988 if (j <= len) { 2989 SPLIT_APPEND(self->str, j, len); 2990 } 2991 return list; 2992 2993 onError: 2994 Py_DECREF(list); 2995 return NULL; 2996} 2997 2998#undef SPLIT_APPEND 2999 3000static 3001PyObject *split(PyUnicodeObject *self, 3002 PyUnicodeObject *substring, 3003 int maxcount) 3004{ 3005 PyObject *list; 3006 3007 if (maxcount < 0) 3008 maxcount = INT_MAX; 3009 3010 list = PyList_New(0); 3011 if (!list) 3012 return NULL; 3013 3014 if (substring == NULL) 3015 return split_whitespace(self,list,maxcount); 3016 3017 else if (substring->length == 1) 3018 return split_char(self,list,substring->str[0],maxcount); 3019 3020 else if (substring->length == 0) { 3021 Py_DECREF(list); 3022 PyErr_SetString(PyExc_ValueError, "empty separator"); 3023 return NULL; 3024 } 3025 else 3026 return 
split_substring(self,list,substring,maxcount); 3027} 3028 3029static 3030PyObject *strip(PyUnicodeObject *self, 3031 int left, 3032 int right) 3033{ 3034 Py_UNICODE *p = self->str; 3035 int start = 0; 3036 int end = self->length; 3037 3038 if (left) 3039 while (start < end && Py_UNICODE_ISSPACE(p[start])) 3040 start++; 3041 3042 if (right) 3043 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 3044 end--; 3045 3046 if (start == 0 && end == self->length) { 3047 /* couldn't strip anything off, return original string */ 3048 Py_INCREF(self); 3049 return (PyObject*) self; 3050 } 3051 3052 return (PyObject*) PyUnicode_FromUnicode( 3053 self->str + start, 3054 end - start 3055 ); 3056} 3057 3058static 3059PyObject *replace(PyUnicodeObject *self, 3060 PyUnicodeObject *str1, 3061 PyUnicodeObject *str2, 3062 int maxcount) 3063{ 3064 PyUnicodeObject *u; 3065 3066 if (maxcount < 0) 3067 maxcount = INT_MAX; 3068 3069 if (str1->length == 1 && str2->length == 1) { 3070 int i; 3071 3072 /* replace characters */ 3073 if (!findchar(self->str, self->length, str1->str[0])) { 3074 /* nothing to replace, return original string */ 3075 Py_INCREF(self); 3076 u = self; 3077 } else { 3078 Py_UNICODE u1 = str1->str[0]; 3079 Py_UNICODE u2 = str2->str[0]; 3080 3081 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3082 NULL, 3083 self->length 3084 ); 3085 if (u != NULL) { 3086 Py_UNICODE_COPY(u->str, self->str, 3087 self->length); 3088 for (i = 0; i < u->length; i++) 3089 if (u->str[i] == u1) { 3090 if (--maxcount < 0) 3091 break; 3092 u->str[i] = u2; 3093 } 3094 } 3095 } 3096 3097 } else { 3098 int n, i; 3099 Py_UNICODE *p; 3100 3101 /* replace strings */ 3102 n = count(self, 0, self->length, str1); 3103 if (n > maxcount) 3104 n = maxcount; 3105 if (n == 0) { 3106 /* nothing to replace, return original string */ 3107 Py_INCREF(self); 3108 u = self; 3109 } else { 3110 u = _PyUnicode_New( 3111 self->length + n * (str2->length - str1->length)); 3112 if (u) { 3113 i = 0; 3114 p = u->str; 3115 
while (i <= self->length - str1->length) 3116 if (Py_UNICODE_MATCH(self, i, str1)) { 3117 /* replace string segment */ 3118 Py_UNICODE_COPY(p, str2->str, str2->length); 3119 p += str2->length; 3120 i += str1->length; 3121 if (--n <= 0) { 3122 /* copy remaining part */ 3123 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3124 break; 3125 } 3126 } else 3127 *p++ = self->str[i++]; 3128 } 3129 } 3130 } 3131 3132 return (PyObject *) u; 3133} 3134 3135/* --- Unicode Object Methods --------------------------------------------- */ 3136 3137static char title__doc__[] = 3138"S.title() -> unicode\n\ 3139\n\ 3140Return a titlecased version of S, i.e. words start with title case\n\ 3141characters, all remaining cased characters have lower case."; 3142 3143static PyObject* 3144unicode_title(PyUnicodeObject *self, PyObject *args) 3145{ 3146 if (!PyArg_NoArgs(args)) 3147 return NULL; 3148 return fixup(self, fixtitle); 3149} 3150 3151static char capitalize__doc__[] = 3152"S.capitalize() -> unicode\n\ 3153\n\ 3154Return a capitalized version of S, i.e. 
make the first character\n\ 3155have upper case."; 3156 3157static PyObject* 3158unicode_capitalize(PyUnicodeObject *self, PyObject *args) 3159{ 3160 if (!PyArg_NoArgs(args)) 3161 return NULL; 3162 return fixup(self, fixcapitalize); 3163} 3164 3165#if 0 3166static char capwords__doc__[] = 3167"S.capwords() -> unicode\n\ 3168\n\ 3169Apply .capitalize() to all words in S and return the result with\n\ 3170normalized whitespace (all whitespace strings are replaced by ' ')."; 3171 3172static PyObject* 3173unicode_capwords(PyUnicodeObject *self, PyObject *args) 3174{ 3175 PyObject *list; 3176 PyObject *item; 3177 int i; 3178 3179 if (!PyArg_NoArgs(args)) 3180 return NULL; 3181 3182 /* Split into words */ 3183 list = split(self, NULL, -1); 3184 if (!list) 3185 return NULL; 3186 3187 /* Capitalize each word */ 3188 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3189 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3190 fixcapitalize); 3191 if (item == NULL) 3192 goto onError; 3193 Py_DECREF(PyList_GET_ITEM(list, i)); 3194 PyList_SET_ITEM(list, i, item); 3195 } 3196 3197 /* Join the words to form a new string */ 3198 item = PyUnicode_Join(NULL, list); 3199 3200onError: 3201 Py_DECREF(list); 3202 return (PyObject *)item; 3203} 3204#endif 3205 3206static char center__doc__[] = 3207"S.center(width) -> unicode\n\ 3208\n\ 3209Return S centered in a Unicode string of length width. Padding is done\n\ 3210using spaces."; 3211 3212static PyObject * 3213unicode_center(PyUnicodeObject *self, PyObject *args) 3214{ 3215 int marg, left; 3216 int width; 3217 3218 if (!PyArg_ParseTuple(args, "i:center", &width)) 3219 return NULL; 3220 3221 if (self->length >= width) { 3222 Py_INCREF(self); 3223 return (PyObject*) self; 3224 } 3225 3226 marg = width - self->length; 3227 left = marg / 2 + (marg & width & 1); 3228 3229 return (PyObject*) pad(self, left, marg - left, ' '); 3230} 3231 3232#if 0 3233 3234/* This code should go into some future Unicode collation support 3235 module. 
The basic comparison should compare ordinals on a naive 3236 basis (this is what Java does and thus JPython too). */ 3237 3238/* speedy UTF-16 code point order comparison */ 3239/* gleaned from: */ 3240/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3241 3242static short utf16Fixup[32] = 3243{ 3244 0, 0, 0, 0, 0, 0, 0, 0, 3245 0, 0, 0, 0, 0, 0, 0, 0, 3246 0, 0, 0, 0, 0, 0, 0, 0, 3247 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3248}; 3249 3250static int 3251unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3252{ 3253 int len1, len2; 3254 3255 Py_UNICODE *s1 = str1->str; 3256 Py_UNICODE *s2 = str2->str; 3257 3258 len1 = str1->length; 3259 len2 = str2->length; 3260 3261 while (len1 > 0 && len2 > 0) { 3262 Py_UNICODE c1, c2; 3263 long diff; 3264 3265 c1 = *s1++; 3266 c2 = *s2++; 3267 if (c1 > (1<<11) * 26) 3268 c1 += utf16Fixup[c1>>11]; 3269 if (c2 > (1<<11) * 26) 3270 c2 += utf16Fixup[c2>>11]; 3271 3272 /* now c1 and c2 are in UTF-32-compatible order */ 3273 diff = (long)c1 - (long)c2; 3274 if (diff) 3275 return (diff < 0) ? -1 : (diff != 0); 3276 len1--; len2--; 3277 } 3278 3279 return (len1 < len2) ? -1 : (len1 != len2); 3280} 3281 3282#else 3283 3284static int 3285unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3286{ 3287 register int len1, len2; 3288 3289 Py_UNICODE *s1 = str1->str; 3290 Py_UNICODE *s2 = str2->str; 3291 3292 len1 = str1->length; 3293 len2 = str2->length; 3294 3295 while (len1 > 0 && len2 > 0) { 3296 register long diff; 3297 3298 diff = (long)*s1++ - (long)*s2++; 3299 if (diff) 3300 return (diff < 0) ? -1 : (diff != 0); 3301 len1--; len2--; 3302 } 3303 3304 return (len1 < len2) ? 
-1 : (len1 != len2); 3305} 3306 3307#endif 3308 3309int PyUnicode_Compare(PyObject *left, 3310 PyObject *right) 3311{ 3312 PyUnicodeObject *u = NULL, *v = NULL; 3313 int result; 3314 3315 /* Coerce the two arguments */ 3316 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3317 if (u == NULL) 3318 goto onError; 3319 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3320 if (v == NULL) 3321 goto onError; 3322 3323 /* Shortcut for empty or interned objects */ 3324 if (v == u) { 3325 Py_DECREF(u); 3326 Py_DECREF(v); 3327 return 0; 3328 } 3329 3330 result = unicode_compare(u, v); 3331 3332 Py_DECREF(u); 3333 Py_DECREF(v); 3334 return result; 3335 3336onError: 3337 Py_XDECREF(u); 3338 Py_XDECREF(v); 3339 return -1; 3340} 3341 3342int PyUnicode_Contains(PyObject *container, 3343 PyObject *element) 3344{ 3345 PyUnicodeObject *u = NULL, *v = NULL; 3346 int result; 3347 register const Py_UNICODE *p, *e; 3348 register Py_UNICODE ch; 3349 3350 /* Coerce the two arguments */ 3351 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3352 if (v == NULL) { 3353 PyErr_SetString(PyExc_TypeError, 3354 "'in <string>' requires character as left operand"); 3355 goto onError; 3356 } 3357 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3358 if (u == NULL) { 3359 Py_DECREF(v); 3360 goto onError; 3361 } 3362 3363 /* Check v in u */ 3364 if (PyUnicode_GET_SIZE(v) != 1) { 3365 PyErr_SetString(PyExc_TypeError, 3366 "'in <string>' requires character as left operand"); 3367 goto onError; 3368 } 3369 ch = *PyUnicode_AS_UNICODE(v); 3370 p = PyUnicode_AS_UNICODE(u); 3371 e = p + PyUnicode_GET_SIZE(u); 3372 result = 0; 3373 while (p < e) { 3374 if (*p++ == ch) { 3375 result = 1; 3376 break; 3377 } 3378 } 3379 3380 Py_DECREF(u); 3381 Py_DECREF(v); 3382 return result; 3383 3384onError: 3385 Py_XDECREF(u); 3386 Py_XDECREF(v); 3387 return -1; 3388} 3389 3390/* Concat to string or Unicode object giving a new Unicode object. 
*/ 3391 3392PyObject *PyUnicode_Concat(PyObject *left, 3393 PyObject *right) 3394{ 3395 PyUnicodeObject *u = NULL, *v = NULL, *w; 3396 3397 /* Coerce the two arguments */ 3398 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3399 if (u == NULL) 3400 goto onError; 3401 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3402 if (v == NULL) 3403 goto onError; 3404 3405 /* Shortcuts */ 3406 if (v == unicode_empty) { 3407 Py_DECREF(v); 3408 return (PyObject *)u; 3409 } 3410 if (u == unicode_empty) { 3411 Py_DECREF(u); 3412 return (PyObject *)v; 3413 } 3414 3415 /* Concat the two Unicode strings */ 3416 w = _PyUnicode_New(u->length + v->length); 3417 if (w == NULL) 3418 goto onError; 3419 Py_UNICODE_COPY(w->str, u->str, u->length); 3420 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3421 3422 Py_DECREF(u); 3423 Py_DECREF(v); 3424 return (PyObject *)w; 3425 3426onError: 3427 Py_XDECREF(u); 3428 Py_XDECREF(v); 3429 return NULL; 3430} 3431 3432static char count__doc__[] = 3433"S.count(sub[, start[, end]]) -> int\n\ 3434\n\ 3435Return the number of occurrences of substring sub in Unicode string\n\ 3436S[start:end]. 
Optional arguments start and end are\n\ 3437interpreted as in slice notation."; 3438 3439static PyObject * 3440unicode_count(PyUnicodeObject *self, PyObject *args) 3441{ 3442 PyUnicodeObject *substring; 3443 int start = 0; 3444 int end = INT_MAX; 3445 PyObject *result; 3446 3447 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3448 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3449 return NULL; 3450 3451 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3452 (PyObject *)substring); 3453 if (substring == NULL) 3454 return NULL; 3455 3456 if (start < 0) 3457 start += self->length; 3458 if (start < 0) 3459 start = 0; 3460 if (end > self->length) 3461 end = self->length; 3462 if (end < 0) 3463 end += self->length; 3464 if (end < 0) 3465 end = 0; 3466 3467 result = PyInt_FromLong((long) count(self, start, end, substring)); 3468 3469 Py_DECREF(substring); 3470 return result; 3471} 3472 3473static char encode__doc__[] = 3474"S.encode([encoding[,errors]]) -> string\n\ 3475\n\ 3476Return an encoded string version of S. Default encoding is the current\n\ 3477default string encoding. errors may be given to set a different error\n\ 3478handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3479a ValueError. 
Other possible values are 'ignore' and 'replace'."; 3480 3481static PyObject * 3482unicode_encode(PyUnicodeObject *self, PyObject *args) 3483{ 3484 char *encoding = NULL; 3485 char *errors = NULL; 3486 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3487 return NULL; 3488 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3489} 3490 3491static char expandtabs__doc__[] = 3492"S.expandtabs([tabsize]) -> unicode\n\ 3493\n\ 3494Return a copy of S where all tab characters are expanded using spaces.\n\ 3495If tabsize is not given, a tab size of 8 characters is assumed."; 3496 3497static PyObject* 3498unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3499{ 3500 Py_UNICODE *e; 3501 Py_UNICODE *p; 3502 Py_UNICODE *q; 3503 int i, j; 3504 PyUnicodeObject *u; 3505 int tabsize = 8; 3506 3507 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3508 return NULL; 3509 3510 /* First pass: determine size of output string */ 3511 i = j = 0; 3512 e = self->str + self->length; 3513 for (p = self->str; p < e; p++) 3514 if (*p == '\t') { 3515 if (tabsize > 0) 3516 j += tabsize - (j % tabsize); 3517 } 3518 else { 3519 j++; 3520 if (*p == '\n' || *p == '\r') { 3521 i += j; 3522 j = 0; 3523 } 3524 } 3525 3526 /* Second pass: create output string and fill it */ 3527 u = _PyUnicode_New(i + j); 3528 if (!u) 3529 return NULL; 3530 3531 j = 0; 3532 q = u->str; 3533 3534 for (p = self->str; p < e; p++) 3535 if (*p == '\t') { 3536 if (tabsize > 0) { 3537 i = tabsize - (j % tabsize); 3538 j += i; 3539 while (i--) 3540 *q++ = ' '; 3541 } 3542 } 3543 else { 3544 j++; 3545 *q++ = *p; 3546 if (*p == '\n' || *p == '\r') 3547 j = 0; 3548 } 3549 3550 return (PyObject*) u; 3551} 3552 3553static char find__doc__[] = 3554"S.find(sub [,start [,end]]) -> int\n\ 3555\n\ 3556Return the lowest index in S where substring sub is found,\n\ 3557such that sub is contained within s[start,end]. 
Optional\n\ 3558arguments start and end are interpreted as in slice notation.\n\ 3559\n\ 3560Return -1 on failure."; 3561 3562static PyObject * 3563unicode_find(PyUnicodeObject *self, PyObject *args) 3564{ 3565 PyUnicodeObject *substring; 3566 int start = 0; 3567 int end = INT_MAX; 3568 PyObject *result; 3569 3570 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 3571 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3572 return NULL; 3573 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3574 (PyObject *)substring); 3575 if (substring == NULL) 3576 return NULL; 3577 3578 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 3579 3580 Py_DECREF(substring); 3581 return result; 3582} 3583 3584static PyObject * 3585unicode_getitem(PyUnicodeObject *self, int index) 3586{ 3587 if (index < 0 || index >= self->length) { 3588 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3589 return NULL; 3590 } 3591 3592 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 3593} 3594 3595static long 3596unicode_hash(PyUnicodeObject *self) 3597{ 3598 /* Since Unicode objects compare equal to their ASCII string 3599 counterparts, they should use the individual character values 3600 as basis for their hash value. This is needed to assure that 3601 strings and Unicode objects behave in the same way as 3602 dictionary keys. 
*/ 3603 3604 register int len; 3605 register Py_UNICODE *p; 3606 register long x; 3607 3608 if (self->hash != -1) 3609 return self->hash; 3610 len = PyUnicode_GET_SIZE(self); 3611 p = PyUnicode_AS_UNICODE(self); 3612 x = *p << 7; 3613 while (--len >= 0) 3614 x = (1000003*x) ^ *p++; 3615 x ^= PyUnicode_GET_SIZE(self); 3616 if (x == -1) 3617 x = -2; 3618 self->hash = x; 3619 return x; 3620} 3621 3622static char index__doc__[] = 3623"S.index(sub [,start [,end]]) -> int\n\ 3624\n\ 3625Like S.find() but raise ValueError when the substring is not found."; 3626 3627static PyObject * 3628unicode_index(PyUnicodeObject *self, PyObject *args) 3629{ 3630 int result; 3631 PyUnicodeObject *substring; 3632 int start = 0; 3633 int end = INT_MAX; 3634 3635 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 3636 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3637 return NULL; 3638 3639 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3640 (PyObject *)substring); 3641 if (substring == NULL) 3642 return NULL; 3643 3644 result = findstring(self, substring, start, end, 1); 3645 3646 Py_DECREF(substring); 3647 if (result < 0) { 3648 PyErr_SetString(PyExc_ValueError, "substring not found"); 3649 return NULL; 3650 } 3651 return PyInt_FromLong(result); 3652} 3653 3654static char islower__doc__[] = 3655"S.islower() -> int\n\ 3656\n\ 3657Return 1 if all cased characters in S are lowercase and there is\n\ 3658at least one cased character in S, 0 otherwise."; 3659 3660static PyObject* 3661unicode_islower(PyUnicodeObject *self, PyObject *args) 3662{ 3663 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3664 register const Py_UNICODE *e; 3665 int cased; 3666 3667 if (!PyArg_NoArgs(args)) 3668 return NULL; 3669 3670 /* Shortcut for single character strings */ 3671 if (PyUnicode_GET_SIZE(self) == 1) 3672 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 3673 3674 /* Special case for empty strings */ 3675 if (PyString_GET_SIZE(self) == 0) 3676 return PyInt_FromLong(0); 
3677 3678 e = p + PyUnicode_GET_SIZE(self); 3679 cased = 0; 3680 for (; p < e; p++) { 3681 register const Py_UNICODE ch = *p; 3682 3683 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 3684 return PyInt_FromLong(0); 3685 else if (!cased && Py_UNICODE_ISLOWER(ch)) 3686 cased = 1; 3687 } 3688 return PyInt_FromLong(cased); 3689} 3690 3691static char isupper__doc__[] = 3692"S.isupper() -> int\n\ 3693\n\ 3694Return 1 if all cased characters in S are uppercase and there is\n\ 3695at least one cased character in S, 0 otherwise."; 3696 3697static PyObject* 3698unicode_isupper(PyUnicodeObject *self, PyObject *args) 3699{ 3700 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3701 register const Py_UNICODE *e; 3702 int cased; 3703 3704 if (!PyArg_NoArgs(args)) 3705 return NULL; 3706 3707 /* Shortcut for single character strings */ 3708 if (PyUnicode_GET_SIZE(self) == 1) 3709 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 3710 3711 /* Special case for empty strings */ 3712 if (PyString_GET_SIZE(self) == 0) 3713 return PyInt_FromLong(0); 3714 3715 e = p + PyUnicode_GET_SIZE(self); 3716 cased = 0; 3717 for (; p < e; p++) { 3718 register const Py_UNICODE ch = *p; 3719 3720 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 3721 return PyInt_FromLong(0); 3722 else if (!cased && Py_UNICODE_ISUPPER(ch)) 3723 cased = 1; 3724 } 3725 return PyInt_FromLong(cased); 3726} 3727 3728static char istitle__doc__[] = 3729"S.istitle() -> int\n\ 3730\n\ 3731Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 3732may only follow uncased characters and lowercase characters only cased\n\ 3733ones. 
Return 0 otherwise."; 3734 3735static PyObject* 3736unicode_istitle(PyUnicodeObject *self, PyObject *args) 3737{ 3738 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3739 register const Py_UNICODE *e; 3740 int cased, previous_is_cased; 3741 3742 if (!PyArg_NoArgs(args)) 3743 return NULL; 3744 3745 /* Shortcut for single character strings */ 3746 if (PyUnicode_GET_SIZE(self) == 1) 3747 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 3748 (Py_UNICODE_ISUPPER(*p) != 0)); 3749 3750 /* Special case for empty strings */ 3751 if (PyString_GET_SIZE(self) == 0) 3752 return PyInt_FromLong(0); 3753 3754 e = p + PyUnicode_GET_SIZE(self); 3755 cased = 0; 3756 previous_is_cased = 0; 3757 for (; p < e; p++) { 3758 register const Py_UNICODE ch = *p; 3759 3760 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 3761 if (previous_is_cased) 3762 return PyInt_FromLong(0); 3763 previous_is_cased = 1; 3764 cased = 1; 3765 } 3766 else if (Py_UNICODE_ISLOWER(ch)) { 3767 if (!previous_is_cased) 3768 return PyInt_FromLong(0); 3769 previous_is_cased = 1; 3770 cased = 1; 3771 } 3772 else 3773 previous_is_cased = 0; 3774 } 3775 return PyInt_FromLong(cased); 3776} 3777 3778static char isspace__doc__[] = 3779"S.isspace() -> int\n\ 3780\n\ 3781Return 1 if there are only whitespace characters in S,\n\ 37820 otherwise."; 3783 3784static PyObject* 3785unicode_isspace(PyUnicodeObject *self, PyObject *args) 3786{ 3787 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3788 register const Py_UNICODE *e; 3789 3790 if (!PyArg_NoArgs(args)) 3791 return NULL; 3792 3793 /* Shortcut for single character strings */ 3794 if (PyUnicode_GET_SIZE(self) == 1 && 3795 Py_UNICODE_ISSPACE(*p)) 3796 return PyInt_FromLong(1); 3797 3798 /* Special case for empty strings */ 3799 if (PyString_GET_SIZE(self) == 0) 3800 return PyInt_FromLong(0); 3801 3802 e = p + PyUnicode_GET_SIZE(self); 3803 for (; p < e; p++) { 3804 if (!Py_UNICODE_ISSPACE(*p)) 3805 return PyInt_FromLong(0); 3806 } 3807 
return PyInt_FromLong(1); 3808} 3809 3810static char isalpha__doc__[] = 3811"S.isalpha() -> int\n\ 3812\n\ 3813Return 1 if all characters in S are alphabetic\n\ 3814and there is at least one character in S, 0 otherwise."; 3815 3816static PyObject* 3817unicode_isalpha(PyUnicodeObject *self, PyObject *args) 3818{ 3819 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3820 register const Py_UNICODE *e; 3821 3822 if (!PyArg_NoArgs(args)) 3823 return NULL; 3824 3825 /* Shortcut for single character strings */ 3826 if (PyUnicode_GET_SIZE(self) == 1 && 3827 Py_UNICODE_ISALPHA(*p)) 3828 return PyInt_FromLong(1); 3829 3830 /* Special case for empty strings */ 3831 if (PyString_GET_SIZE(self) == 0) 3832 return PyInt_FromLong(0); 3833 3834 e = p + PyUnicode_GET_SIZE(self); 3835 for (; p < e; p++) { 3836 if (!Py_UNICODE_ISALPHA(*p)) 3837 return PyInt_FromLong(0); 3838 } 3839 return PyInt_FromLong(1); 3840} 3841 3842static char isalnum__doc__[] = 3843"S.isalnum() -> int\n\ 3844\n\ 3845Return 1 if all characters in S are alphanumeric\n\ 3846and there is at least one character in S, 0 otherwise."; 3847 3848static PyObject* 3849unicode_isalnum(PyUnicodeObject *self, PyObject *args) 3850{ 3851 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3852 register const Py_UNICODE *e; 3853 3854 if (!PyArg_NoArgs(args)) 3855 return NULL; 3856 3857 /* Shortcut for single character strings */ 3858 if (PyUnicode_GET_SIZE(self) == 1 && 3859 Py_UNICODE_ISALNUM(*p)) 3860 return PyInt_FromLong(1); 3861 3862 /* Special case for empty strings */ 3863 if (PyString_GET_SIZE(self) == 0) 3864 return PyInt_FromLong(0); 3865 3866 e = p + PyUnicode_GET_SIZE(self); 3867 for (; p < e; p++) { 3868 if (!Py_UNICODE_ISALNUM(*p)) 3869 return PyInt_FromLong(0); 3870 } 3871 return PyInt_FromLong(1); 3872} 3873 3874static char isdecimal__doc__[] = 3875"S.isdecimal() -> int\n\ 3876\n\ 3877Return 1 if there are only decimal characters in S,\n\ 38780 otherwise."; 3879 3880static PyObject* 
3881unicode_isdecimal(PyUnicodeObject *self, PyObject *args) 3882{ 3883 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3884 register const Py_UNICODE *e; 3885 3886 if (!PyArg_NoArgs(args)) 3887 return NULL; 3888 3889 /* Shortcut for single character strings */ 3890 if (PyUnicode_GET_SIZE(self) == 1 && 3891 Py_UNICODE_ISDECIMAL(*p)) 3892 return PyInt_FromLong(1); 3893 3894 /* Special case for empty strings */ 3895 if (PyString_GET_SIZE(self) == 0) 3896 return PyInt_FromLong(0); 3897 3898 e = p + PyUnicode_GET_SIZE(self); 3899 for (; p < e; p++) { 3900 if (!Py_UNICODE_ISDECIMAL(*p)) 3901 return PyInt_FromLong(0); 3902 } 3903 return PyInt_FromLong(1); 3904} 3905 3906static char isdigit__doc__[] = 3907"S.isdigit() -> int\n\ 3908\n\ 3909Return 1 if there are only digit characters in S,\n\ 39100 otherwise."; 3911 3912static PyObject* 3913unicode_isdigit(PyUnicodeObject *self, PyObject *args) 3914{ 3915 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3916 register const Py_UNICODE *e; 3917 3918 if (!PyArg_NoArgs(args)) 3919 return NULL; 3920 3921 /* Shortcut for single character strings */ 3922 if (PyUnicode_GET_SIZE(self) == 1 && 3923 Py_UNICODE_ISDIGIT(*p)) 3924 return PyInt_FromLong(1); 3925 3926 /* Special case for empty strings */ 3927 if (PyString_GET_SIZE(self) == 0) 3928 return PyInt_FromLong(0); 3929 3930 e = p + PyUnicode_GET_SIZE(self); 3931 for (; p < e; p++) { 3932 if (!Py_UNICODE_ISDIGIT(*p)) 3933 return PyInt_FromLong(0); 3934 } 3935 return PyInt_FromLong(1); 3936} 3937 3938static char isnumeric__doc__[] = 3939"S.isnumeric() -> int\n\ 3940\n\ 3941Return 1 if there are only numeric characters in S,\n\ 39420 otherwise."; 3943 3944static PyObject* 3945unicode_isnumeric(PyUnicodeObject *self, PyObject *args) 3946{ 3947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3948 register const Py_UNICODE *e; 3949 3950 if (!PyArg_NoArgs(args)) 3951 return NULL; 3952 3953 /* Shortcut for single character strings */ 3954 if 
(PyUnicode_GET_SIZE(self) == 1 && 3955 Py_UNICODE_ISNUMERIC(*p)) 3956 return PyInt_FromLong(1); 3957 3958 /* Special case for empty strings */ 3959 if (PyString_GET_SIZE(self) == 0) 3960 return PyInt_FromLong(0); 3961 3962 e = p + PyUnicode_GET_SIZE(self); 3963 for (; p < e; p++) { 3964 if (!Py_UNICODE_ISNUMERIC(*p)) 3965 return PyInt_FromLong(0); 3966 } 3967 return PyInt_FromLong(1); 3968} 3969 3970static char join__doc__[] = 3971"S.join(sequence) -> unicode\n\ 3972\n\ 3973Return a string which is the concatenation of the strings in the\n\ 3974sequence. The separator between elements is S."; 3975 3976static PyObject* 3977unicode_join(PyUnicodeObject *self, PyObject *args) 3978{ 3979 PyObject *data; 3980 if (!PyArg_ParseTuple(args, "O:join", &data)) 3981 return NULL; 3982 3983 return PyUnicode_Join((PyObject *)self, data); 3984} 3985 3986static int 3987unicode_length(PyUnicodeObject *self) 3988{ 3989 return self->length; 3990} 3991 3992static char ljust__doc__[] = 3993"S.ljust(width) -> unicode\n\ 3994\n\ 3995Return S left justified in a Unicode string of length width. 
Padding is\n\ 3996done using spaces."; 3997 3998static PyObject * 3999unicode_ljust(PyUnicodeObject *self, PyObject *args) 4000{ 4001 int width; 4002 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 4003 return NULL; 4004 4005 if (self->length >= width) { 4006 Py_INCREF(self); 4007 return (PyObject*) self; 4008 } 4009 4010 return (PyObject*) pad(self, 0, width - self->length, ' '); 4011} 4012 4013static char lower__doc__[] = 4014"S.lower() -> unicode\n\ 4015\n\ 4016Return a copy of the string S converted to lowercase."; 4017 4018static PyObject* 4019unicode_lower(PyUnicodeObject *self, PyObject *args) 4020{ 4021 if (!PyArg_NoArgs(args)) 4022 return NULL; 4023 return fixup(self, fixlower); 4024} 4025 4026static char lstrip__doc__[] = 4027"S.lstrip() -> unicode\n\ 4028\n\ 4029Return a copy of the string S with leading whitespace removed."; 4030 4031static PyObject * 4032unicode_lstrip(PyUnicodeObject *self, PyObject *args) 4033{ 4034 if (!PyArg_NoArgs(args)) 4035 return NULL; 4036 return strip(self, 1, 0); 4037} 4038 4039static PyObject* 4040unicode_repeat(PyUnicodeObject *str, int len) 4041{ 4042 PyUnicodeObject *u; 4043 Py_UNICODE *p; 4044 int nchars; 4045 size_t nbytes; 4046 4047 if (len < 0) 4048 len = 0; 4049 4050 if (len == 1) { 4051 /* no repeat, return original string */ 4052 Py_INCREF(str); 4053 return (PyObject*) str; 4054 } 4055 4056 /* ensure # of chars needed doesn't overflow int and # of bytes 4057 * needed doesn't overflow size_t 4058 */ 4059 nchars = len * str->length; 4060 if (len && nchars / len != str->length) { 4061 PyErr_SetString(PyExc_OverflowError, 4062 "repeated string is too long"); 4063 return NULL; 4064 } 4065 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4066 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4067 PyErr_SetString(PyExc_OverflowError, 4068 "repeated string is too long"); 4069 return NULL; 4070 } 4071 u = _PyUnicode_New(nchars); 4072 if (!u) 4073 return NULL; 4074 4075 p = u->str; 4076 4077 while (len-- > 0) { 4078 
Py_UNICODE_COPY(p, str->str, str->length); 4079 p += str->length; 4080 } 4081 4082 return (PyObject*) u; 4083} 4084 4085PyObject *PyUnicode_Replace(PyObject *obj, 4086 PyObject *subobj, 4087 PyObject *replobj, 4088 int maxcount) 4089{ 4090 PyObject *self; 4091 PyObject *str1; 4092 PyObject *str2; 4093 PyObject *result; 4094 4095 self = PyUnicode_FromObject(obj); 4096 if (self == NULL) 4097 return NULL; 4098 str1 = PyUnicode_FromObject(subobj); 4099 if (str1 == NULL) { 4100 Py_DECREF(self); 4101 return NULL; 4102 } 4103 str2 = PyUnicode_FromObject(replobj); 4104 if (str2 == NULL) { 4105 Py_DECREF(self); 4106 Py_DECREF(str1); 4107 return NULL; 4108 } 4109 result = replace((PyUnicodeObject *)self, 4110 (PyUnicodeObject *)str1, 4111 (PyUnicodeObject *)str2, 4112 maxcount); 4113 Py_DECREF(self); 4114 Py_DECREF(str1); 4115 Py_DECREF(str2); 4116 return result; 4117} 4118 4119static char replace__doc__[] = 4120"S.replace (old, new[, maxsplit]) -> unicode\n\ 4121\n\ 4122Return a copy of S with all occurrences of substring\n\ 4123old replaced by new. 
If the optional argument maxsplit is\n\ 4124given, only the first maxsplit occurrences are replaced."; 4125 4126static PyObject* 4127unicode_replace(PyUnicodeObject *self, PyObject *args) 4128{ 4129 PyUnicodeObject *str1; 4130 PyUnicodeObject *str2; 4131 int maxcount = -1; 4132 PyObject *result; 4133 4134 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4135 return NULL; 4136 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4137 if (str1 == NULL) 4138 return NULL; 4139 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4140 if (str2 == NULL) 4141 return NULL; 4142 4143 result = replace(self, str1, str2, maxcount); 4144 4145 Py_DECREF(str1); 4146 Py_DECREF(str2); 4147 return result; 4148} 4149 4150static 4151PyObject *unicode_repr(PyObject *unicode) 4152{ 4153 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4154 PyUnicode_GET_SIZE(unicode), 4155 1); 4156} 4157 4158static char rfind__doc__[] = 4159"S.rfind(sub [,start [,end]]) -> int\n\ 4160\n\ 4161Return the highest index in S where substring sub is found,\n\ 4162such that sub is contained within s[start,end]. 
Optional\n\ 4163arguments start and end are interpreted as in slice notation.\n\ 4164\n\ 4165Return -1 on failure."; 4166 4167static PyObject * 4168unicode_rfind(PyUnicodeObject *self, PyObject *args) 4169{ 4170 PyUnicodeObject *substring; 4171 int start = 0; 4172 int end = INT_MAX; 4173 PyObject *result; 4174 4175 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4176 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4177 return NULL; 4178 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4179 (PyObject *)substring); 4180 if (substring == NULL) 4181 return NULL; 4182 4183 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4184 4185 Py_DECREF(substring); 4186 return result; 4187} 4188 4189static char rindex__doc__[] = 4190"S.rindex(sub [,start [,end]]) -> int\n\ 4191\n\ 4192Like S.rfind() but raise ValueError when the substring is not found."; 4193 4194static PyObject * 4195unicode_rindex(PyUnicodeObject *self, PyObject *args) 4196{ 4197 int result; 4198 PyUnicodeObject *substring; 4199 int start = 0; 4200 int end = INT_MAX; 4201 4202 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4203 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4204 return NULL; 4205 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4206 (PyObject *)substring); 4207 if (substring == NULL) 4208 return NULL; 4209 4210 result = findstring(self, substring, start, end, -1); 4211 4212 Py_DECREF(substring); 4213 if (result < 0) { 4214 PyErr_SetString(PyExc_ValueError, "substring not found"); 4215 return NULL; 4216 } 4217 return PyInt_FromLong(result); 4218} 4219 4220static char rjust__doc__[] = 4221"S.rjust(width) -> unicode\n\ 4222\n\ 4223Return S right justified in a Unicode string of length width. 
Padding is\n\ 4224done using spaces."; 4225 4226static PyObject * 4227unicode_rjust(PyUnicodeObject *self, PyObject *args) 4228{ 4229 int width; 4230 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4231 return NULL; 4232 4233 if (self->length >= width) { 4234 Py_INCREF(self); 4235 return (PyObject*) self; 4236 } 4237 4238 return (PyObject*) pad(self, width - self->length, 0, ' '); 4239} 4240 4241static char rstrip__doc__[] = 4242"S.rstrip() -> unicode\n\ 4243\n\ 4244Return a copy of the string S with trailing whitespace removed."; 4245 4246static PyObject * 4247unicode_rstrip(PyUnicodeObject *self, PyObject *args) 4248{ 4249 if (!PyArg_NoArgs(args)) 4250 return NULL; 4251 return strip(self, 0, 1); 4252} 4253 4254static PyObject* 4255unicode_slice(PyUnicodeObject *self, int start, int end) 4256{ 4257 /* standard clamping */ 4258 if (start < 0) 4259 start = 0; 4260 if (end < 0) 4261 end = 0; 4262 if (end > self->length) 4263 end = self->length; 4264 if (start == 0 && end == self->length) { 4265 /* full slice, return original string */ 4266 Py_INCREF(self); 4267 return (PyObject*) self; 4268 } 4269 if (start > end) 4270 start = end; 4271 /* copy slice */ 4272 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4273 end - start); 4274} 4275 4276PyObject *PyUnicode_Split(PyObject *s, 4277 PyObject *sep, 4278 int maxsplit) 4279{ 4280 PyObject *result; 4281 4282 s = PyUnicode_FromObject(s); 4283 if (s == NULL) 4284 return NULL; 4285 if (sep != NULL) { 4286 sep = PyUnicode_FromObject(sep); 4287 if (sep == NULL) { 4288 Py_DECREF(s); 4289 return NULL; 4290 } 4291 } 4292 4293 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4294 4295 Py_DECREF(s); 4296 Py_XDECREF(sep); 4297 return result; 4298} 4299 4300static char split__doc__[] = 4301"S.split([sep [,maxsplit]]) -> list of strings\n\ 4302\n\ 4303Return a list of the words in S, using sep as the\n\ 4304delimiter string. If maxsplit is given, at most maxsplit\n\ 4305splits are done. 
If sep is not specified, any whitespace string\n\ 4306is a separator."; 4307 4308static PyObject* 4309unicode_split(PyUnicodeObject *self, PyObject *args) 4310{ 4311 PyObject *substring = Py_None; 4312 int maxcount = -1; 4313 4314 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4315 return NULL; 4316 4317 if (substring == Py_None) 4318 return split(self, NULL, maxcount); 4319 else if (PyUnicode_Check(substring)) 4320 return split(self, (PyUnicodeObject *)substring, maxcount); 4321 else 4322 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4323} 4324 4325static char splitlines__doc__[] = 4326"S.splitlines([keepends]]) -> list of strings\n\ 4327\n\ 4328Return a list of the lines in S, breaking at line boundaries.\n\ 4329Line breaks are not included in the resulting list unless keepends\n\ 4330is given and true."; 4331 4332static PyObject* 4333unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4334{ 4335 int keepends = 0; 4336 4337 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4338 return NULL; 4339 4340 return PyUnicode_Splitlines((PyObject *)self, keepends); 4341} 4342 4343static 4344PyObject *unicode_str(PyUnicodeObject *self) 4345{ 4346 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4347} 4348 4349static char strip__doc__[] = 4350"S.strip() -> unicode\n\ 4351\n\ 4352Return a copy of S with leading and trailing whitespace removed."; 4353 4354static PyObject * 4355unicode_strip(PyUnicodeObject *self, PyObject *args) 4356{ 4357 if (!PyArg_NoArgs(args)) 4358 return NULL; 4359 return strip(self, 1, 1); 4360} 4361 4362static char swapcase__doc__[] = 4363"S.swapcase() -> unicode\n\ 4364\n\ 4365Return a copy of S with uppercase characters converted to lowercase\n\ 4366and vice versa."; 4367 4368static PyObject* 4369unicode_swapcase(PyUnicodeObject *self, PyObject *args) 4370{ 4371 if (!PyArg_NoArgs(args)) 4372 return NULL; 4373 return fixup(self, fixswapcase); 4374} 4375 4376static char translate__doc__[] = 
4377"S.translate(table) -> unicode\n\ 4378\n\ 4379Return a copy of the string S, where all characters have been mapped\n\ 4380through the given translation table, which must be a mapping of\n\ 4381Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4382are left untouched. Characters mapped to None are deleted."; 4383 4384static PyObject* 4385unicode_translate(PyUnicodeObject *self, PyObject *args) 4386{ 4387 PyObject *table; 4388 4389 if (!PyArg_ParseTuple(args, "O:translate", &table)) 4390 return NULL; 4391 return PyUnicode_TranslateCharmap(self->str, 4392 self->length, 4393 table, 4394 "ignore"); 4395} 4396 4397static char upper__doc__[] = 4398"S.upper() -> unicode\n\ 4399\n\ 4400Return a copy of S converted to uppercase."; 4401 4402static PyObject* 4403unicode_upper(PyUnicodeObject *self, PyObject *args) 4404{ 4405 if (!PyArg_NoArgs(args)) 4406 return NULL; 4407 return fixup(self, fixupper); 4408} 4409 4410#if 0 4411static char zfill__doc__[] = 4412"S.zfill(width) -> unicode\n\ 4413\n\ 4414Pad a numeric string x with zeros on the left, to fill a field\n\ 4415of the specified width. 
The string x is never truncated."; 4416 4417static PyObject * 4418unicode_zfill(PyUnicodeObject *self, PyObject *args) 4419{ 4420 int fill; 4421 PyUnicodeObject *u; 4422 4423 int width; 4424 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4425 return NULL; 4426 4427 if (self->length >= width) { 4428 Py_INCREF(self); 4429 return (PyObject*) self; 4430 } 4431 4432 fill = width - self->length; 4433 4434 u = pad(self, fill, 0, '0'); 4435 4436 if (u->str[fill] == '+' || u->str[fill] == '-') { 4437 /* move sign to beginning of string */ 4438 u->str[0] = u->str[fill]; 4439 u->str[fill] = '0'; 4440 } 4441 4442 return (PyObject*) u; 4443} 4444#endif 4445 4446#if 0 4447static PyObject* 4448unicode_freelistsize(PyUnicodeObject *self, PyObject *args) 4449{ 4450 if (!PyArg_NoArgs(args)) 4451 return NULL; 4452 return PyInt_FromLong(unicode_freelist_size); 4453} 4454#endif 4455 4456static char startswith__doc__[] = 4457"S.startswith(prefix[, start[, end]]) -> int\n\ 4458\n\ 4459Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4460optional start, test S beginning at that position. With optional end, stop\n\ 4461comparing S at that position."; 4462 4463static PyObject * 4464unicode_startswith(PyUnicodeObject *self, 4465 PyObject *args) 4466{ 4467 PyUnicodeObject *substring; 4468 int start = 0; 4469 int end = INT_MAX; 4470 PyObject *result; 4471 4472 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4473 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4474 return NULL; 4475 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4476 (PyObject *)substring); 4477 if (substring == NULL) 4478 return NULL; 4479 4480 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4481 4482 Py_DECREF(substring); 4483 return result; 4484} 4485 4486 4487static char endswith__doc__[] = 4488"S.endswith(suffix[, start[, end]]) -> int\n\ 4489\n\ 4490Return 1 if S ends with the specified suffix, otherwise return 0. 
With\n\ 4491optional start, test S beginning at that position. With optional end, stop\n\ 4492comparing S at that position."; 4493 4494static PyObject * 4495unicode_endswith(PyUnicodeObject *self, 4496 PyObject *args) 4497{ 4498 PyUnicodeObject *substring; 4499 int start = 0; 4500 int end = INT_MAX; 4501 PyObject *result; 4502 4503 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4504 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4505 return NULL; 4506 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4507 (PyObject *)substring); 4508 if (substring == NULL) 4509 return NULL; 4510 4511 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1)); 4512 4513 Py_DECREF(substring); 4514 return result; 4515} 4516 4517 4518static PyMethodDef unicode_methods[] = { 4519 4520 /* Order is according to common usage: often used methods should 4521 appear first, since lookup is done sequentially. */ 4522 4523 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__}, 4524 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__}, 4525 {"split", (PyCFunction) unicode_split, 1, split__doc__}, 4526 {"join", (PyCFunction) unicode_join, 1, join__doc__}, 4527 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__}, 4528 {"title", (PyCFunction) unicode_title, 0, title__doc__}, 4529 {"center", (PyCFunction) unicode_center, 1, center__doc__}, 4530 {"count", (PyCFunction) unicode_count, 1, count__doc__}, 4531 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__}, 4532 {"find", (PyCFunction) unicode_find, 1, find__doc__}, 4533 {"index", (PyCFunction) unicode_index, 1, index__doc__}, 4534 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__}, 4535 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__}, 4536 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__}, 4537/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */ 4538 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__}, 4539 {"rindex", 
(PyCFunction) unicode_rindex, 1, rindex__doc__}, 4540 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__}, 4541 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__}, 4542 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__}, 4543 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__}, 4544 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__}, 4545 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__}, 4546 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__}, 4547 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__}, 4548 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__}, 4549 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__}, 4550 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__}, 4551 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__}, 4552 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__}, 4553 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__}, 4554 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__}, 4555 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__}, 4556 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__}, 4557 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__}, 4558#if 0 4559 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__}, 4560 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__}, 4561#endif 4562 4563#if 0 4564 /* This one is just used for debugging the implementation. 
*/ 4565 {"freelistsize", (PyCFunction) unicode_freelistsize, 0}, 4566#endif 4567 4568 {NULL, NULL} 4569}; 4570 4571static PyObject * 4572unicode_getattr(PyUnicodeObject *self, char *name) 4573{ 4574 return Py_FindMethod(unicode_methods, (PyObject*) self, name); 4575} 4576 4577static PySequenceMethods unicode_as_sequence = { 4578 (inquiry) unicode_length, /* sq_length */ 4579 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 4580 (intargfunc) unicode_repeat, /* sq_repeat */ 4581 (intargfunc) unicode_getitem, /* sq_item */ 4582 (intintargfunc) unicode_slice, /* sq_slice */ 4583 0, /* sq_ass_item */ 4584 0, /* sq_ass_slice */ 4585 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 4586}; 4587 4588static int 4589unicode_buffer_getreadbuf(PyUnicodeObject *self, 4590 int index, 4591 const void **ptr) 4592{ 4593 if (index != 0) { 4594 PyErr_SetString(PyExc_SystemError, 4595 "accessing non-existent unicode segment"); 4596 return -1; 4597 } 4598 *ptr = (void *) self->str; 4599 return PyUnicode_GET_DATA_SIZE(self); 4600} 4601 4602static int 4603unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 4604 const void **ptr) 4605{ 4606 PyErr_SetString(PyExc_TypeError, 4607 "cannot use unicode as modifyable buffer"); 4608 return -1; 4609} 4610 4611static int 4612unicode_buffer_getsegcount(PyUnicodeObject *self, 4613 int *lenp) 4614{ 4615 if (lenp) 4616 *lenp = PyUnicode_GET_DATA_SIZE(self); 4617 return 1; 4618} 4619 4620static int 4621unicode_buffer_getcharbuf(PyUnicodeObject *self, 4622 int index, 4623 const void **ptr) 4624{ 4625 PyObject *str; 4626 4627 if (index != 0) { 4628 PyErr_SetString(PyExc_SystemError, 4629 "accessing non-existent unicode segment"); 4630 return -1; 4631 } 4632 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 4633 if (str == NULL) 4634 return -1; 4635 *ptr = (void *) PyString_AS_STRING(str); 4636 return PyString_GET_SIZE(str); 4637} 4638 4639/* Helpers for PyUnicode_Format() */ 4640 4641static PyObject * 4642getnextarg(PyObject *args, 
int arglen, int *p_argidx) 4643{ 4644 int argidx = *p_argidx; 4645 if (argidx < arglen) { 4646 (*p_argidx)++; 4647 if (arglen < 0) 4648 return args; 4649 else 4650 return PyTuple_GetItem(args, argidx); 4651 } 4652 PyErr_SetString(PyExc_TypeError, 4653 "not enough arguments for format string"); 4654 return NULL; 4655} 4656 4657#define F_LJUST (1<<0) 4658#define F_SIGN (1<<1) 4659#define F_BLANK (1<<2) 4660#define F_ALT (1<<3) 4661#define F_ZERO (1<<4) 4662 4663static 4664int usprintf(register Py_UNICODE *buffer, char *format, ...) 4665{ 4666 register int i; 4667 int len; 4668 va_list va; 4669 char *charbuffer; 4670 va_start(va, format); 4671 4672 /* First, format the string as char array, then expand to Py_UNICODE 4673 array. */ 4674 charbuffer = (char *)buffer; 4675 len = vsprintf(charbuffer, format, va); 4676 for (i = len - 1; i >= 0; i--) 4677 buffer[i] = (Py_UNICODE) charbuffer[i]; 4678 4679 va_end(va); 4680 return len; 4681} 4682 4683static int 4684formatfloat(Py_UNICODE *buf, 4685 size_t buflen, 4686 int flags, 4687 int prec, 4688 int type, 4689 PyObject *v) 4690{ 4691 /* fmt = '%#.' + `prec` + `type` 4692 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 4693 char fmt[20]; 4694 double x; 4695 4696 x = PyFloat_AsDouble(v); 4697 if (x == -1.0 && PyErr_Occurred()) 4698 return -1; 4699 if (prec < 0) 4700 prec = 6; 4701 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 4702 type = 'g'; 4703 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type); 4704 /* worst case length calc to ensure no buffer overrun: 4705 fmt = %#.<prec>g 4706 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 4707 for any double rep.) 4708 len = 1 + prec + 1 + 2 + 5 = 9 + prec 4709 If prec=0 the effective precision is 1 (the leading digit is 4710 always given), therefore increase by one to 10+prec. 
*/ 4711 if (buflen <= (size_t)10 + (size_t)prec) { 4712 PyErr_SetString(PyExc_OverflowError, 4713 "formatted float is too long (precision too long?)"); 4714 return -1; 4715 } 4716 return usprintf(buf, fmt, x); 4717} 4718 4719static PyObject* 4720formatlong(PyObject *val, int flags, int prec, int type) 4721{ 4722 char *buf; 4723 int i, len; 4724 PyObject *str; /* temporary string object. */ 4725 PyUnicodeObject *result; 4726 4727 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 4728 if (!str) 4729 return NULL; 4730 result = _PyUnicode_New(len); 4731 for (i = 0; i < len; i++) 4732 result->str[i] = buf[i]; 4733 result->str[len] = 0; 4734 Py_DECREF(str); 4735 return (PyObject*)result; 4736} 4737 4738static int 4739formatint(Py_UNICODE *buf, 4740 size_t buflen, 4741 int flags, 4742 int prec, 4743 int type, 4744 PyObject *v) 4745{ 4746 /* fmt = '%#.' + `prec` + 'l' + `type` 4747 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 4748 + 1 + 1 = 24*/ 4749 char fmt[64]; /* plenty big enough! */ 4750 long x; 4751 int use_native_c_format = 1; 4752 4753 x = PyInt_AsLong(v); 4754 if (x == -1 && PyErr_Occurred()) 4755 return -1; 4756 if (prec < 0) 4757 prec = 1; 4758 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 4759 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */ 4760 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) { 4761 PyErr_SetString(PyExc_OverflowError, 4762 "formatted integer is too long (precision too long?)"); 4763 return -1; 4764 } 4765 /* When converting 0 under %#x or %#X, C leaves off the base marker, 4766 * but we want it (for consistency with other %#x conversions, and 4767 * for consistency with Python's hex() function). 4768 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks & 4769 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway. 4770 * So add it only if the platform doesn't already. 
4771 */ 4772 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) { 4773 /* Only way to know what the platform does is to try it. */ 4774 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0); 4775 if (fmt[1] != (char)type) { 4776 /* Supply our own leading 0x/0X -- needed under std C */ 4777 use_native_c_format = 0; 4778 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type); 4779 } 4780 } 4781 if (use_native_c_format) 4782 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type); 4783 return usprintf(buf, fmt, x); 4784} 4785 4786static int 4787formatchar(Py_UNICODE *buf, 4788 size_t buflen, 4789 PyObject *v) 4790{ 4791 /* presume that the buffer is at least 2 characters long */ 4792 if (PyUnicode_Check(v)) { 4793 if (PyUnicode_GET_SIZE(v) != 1) 4794 goto onError; 4795 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 4796 } 4797 4798 else if (PyString_Check(v)) { 4799 if (PyString_GET_SIZE(v) != 1) 4800 goto onError; 4801 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 4802 } 4803 4804 else { 4805 /* Integer input truncated to a character */ 4806 long x; 4807 x = PyInt_AsLong(v); 4808 if (x == -1 && PyErr_Occurred()) 4809 goto onError; 4810 buf[0] = (char) x; 4811 } 4812 buf[1] = '\0'; 4813 return 1; 4814 4815 onError: 4816 PyErr_SetString(PyExc_TypeError, 4817 "%c requires int or char"); 4818 return -1; 4819} 4820 4821/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 4822 4823 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 4824 chars are formatted. XXX This is a magic number. Each formatting 4825 routine does bounds checking to ensure no overflow, but a better 4826 solution may be to malloc a buffer of appropriate size for each 4827 format. For now, the current solution is sufficient. 
4828*/ 4829#define FORMATBUFLEN (size_t)120 4830 4831PyObject *PyUnicode_Format(PyObject *format, 4832 PyObject *args) 4833{ 4834 Py_UNICODE *fmt, *res; 4835 int fmtcnt, rescnt, reslen, arglen, argidx; 4836 int args_owned = 0; 4837 PyUnicodeObject *result = NULL; 4838 PyObject *dict = NULL; 4839 PyObject *uformat; 4840 4841 if (format == NULL || args == NULL) { 4842 PyErr_BadInternalCall(); 4843 return NULL; 4844 } 4845 uformat = PyUnicode_FromObject(format); 4846 if (uformat == NULL) 4847 return NULL; 4848 fmt = PyUnicode_AS_UNICODE(uformat); 4849 fmtcnt = PyUnicode_GET_SIZE(uformat); 4850 4851 reslen = rescnt = fmtcnt + 100; 4852 result = _PyUnicode_New(reslen); 4853 if (result == NULL) 4854 goto onError; 4855 res = PyUnicode_AS_UNICODE(result); 4856 4857 if (PyTuple_Check(args)) { 4858 arglen = PyTuple_Size(args); 4859 argidx = 0; 4860 } 4861 else { 4862 arglen = -1; 4863 argidx = -2; 4864 } 4865 if (args->ob_type->tp_as_mapping) 4866 dict = args; 4867 4868 while (--fmtcnt >= 0) { 4869 if (*fmt != '%') { 4870 if (--rescnt < 0) { 4871 rescnt = fmtcnt + 100; 4872 reslen += rescnt; 4873 if (_PyUnicode_Resize(&result, reslen) < 0) 4874 return NULL; 4875 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 4876 --rescnt; 4877 } 4878 *res++ = *fmt++; 4879 } 4880 else { 4881 /* Got a format specifier */ 4882 int flags = 0; 4883 int width = -1; 4884 int prec = -1; 4885 Py_UNICODE c = '\0'; 4886 Py_UNICODE fill; 4887 PyObject *v = NULL; 4888 PyObject *temp = NULL; 4889 Py_UNICODE *pbuf; 4890 Py_UNICODE sign; 4891 int len; 4892 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 4893 4894 fmt++; 4895 if (*fmt == '(') { 4896 Py_UNICODE *keystart; 4897 int keylen; 4898 PyObject *key; 4899 int pcount = 1; 4900 4901 if (dict == NULL) { 4902 PyErr_SetString(PyExc_TypeError, 4903 "format requires a mapping"); 4904 goto onError; 4905 } 4906 ++fmt; 4907 --fmtcnt; 4908 keystart = fmt; 4909 /* Skip over balanced parentheses */ 4910 while (pcount > 0 && --fmtcnt 
>= 0) { 4911 if (*fmt == ')') 4912 --pcount; 4913 else if (*fmt == '(') 4914 ++pcount; 4915 fmt++; 4916 } 4917 keylen = fmt - keystart - 1; 4918 if (fmtcnt < 0 || pcount > 0) { 4919 PyErr_SetString(PyExc_ValueError, 4920 "incomplete format key"); 4921 goto onError; 4922 } 4923 /* keys are converted to strings using UTF-8 and 4924 then looked up since Python uses strings to hold 4925 variables names etc. in its namespaces and we 4926 wouldn't want to break common idioms. */ 4927 key = PyUnicode_EncodeUTF8(keystart, 4928 keylen, 4929 NULL); 4930 if (key == NULL) 4931 goto onError; 4932 if (args_owned) { 4933 Py_DECREF(args); 4934 args_owned = 0; 4935 } 4936 args = PyObject_GetItem(dict, key); 4937 Py_DECREF(key); 4938 if (args == NULL) { 4939 goto onError; 4940 } 4941 args_owned = 1; 4942 arglen = -1; 4943 argidx = -2; 4944 } 4945 while (--fmtcnt >= 0) { 4946 switch (c = *fmt++) { 4947 case '-': flags |= F_LJUST; continue; 4948 case '+': flags |= F_SIGN; continue; 4949 case ' ': flags |= F_BLANK; continue; 4950 case '#': flags |= F_ALT; continue; 4951 case '0': flags |= F_ZERO; continue; 4952 } 4953 break; 4954 } 4955 if (c == '*') { 4956 v = getnextarg(args, arglen, &argidx); 4957 if (v == NULL) 4958 goto onError; 4959 if (!PyInt_Check(v)) { 4960 PyErr_SetString(PyExc_TypeError, 4961 "* wants int"); 4962 goto onError; 4963 } 4964 width = PyInt_AsLong(v); 4965 if (width < 0) { 4966 flags |= F_LJUST; 4967 width = -width; 4968 } 4969 if (--fmtcnt >= 0) 4970 c = *fmt++; 4971 } 4972 else if (c >= '0' && c <= '9') { 4973 width = c - '0'; 4974 while (--fmtcnt >= 0) { 4975 c = *fmt++; 4976 if (c < '0' || c > '9') 4977 break; 4978 if ((width*10) / 10 != width) { 4979 PyErr_SetString(PyExc_ValueError, 4980 "width too big"); 4981 goto onError; 4982 } 4983 width = width*10 + (c - '0'); 4984 } 4985 } 4986 if (c == '.') { 4987 prec = 0; 4988 if (--fmtcnt >= 0) 4989 c = *fmt++; 4990 if (c == '*') { 4991 v = getnextarg(args, arglen, &argidx); 4992 if (v == NULL) 4993 goto onError; 
4994 if (!PyInt_Check(v)) { 4995 PyErr_SetString(PyExc_TypeError, 4996 "* wants int"); 4997 goto onError; 4998 } 4999 prec = PyInt_AsLong(v); 5000 if (prec < 0) 5001 prec = 0; 5002 if (--fmtcnt >= 0) 5003 c = *fmt++; 5004 } 5005 else if (c >= '0' && c <= '9') { 5006 prec = c - '0'; 5007 while (--fmtcnt >= 0) { 5008 c = Py_CHARMASK(*fmt++); 5009 if (c < '0' || c > '9') 5010 break; 5011 if ((prec*10) / 10 != prec) { 5012 PyErr_SetString(PyExc_ValueError, 5013 "prec too big"); 5014 goto onError; 5015 } 5016 prec = prec*10 + (c - '0'); 5017 } 5018 } 5019 } /* prec */ 5020 if (fmtcnt >= 0) { 5021 if (c == 'h' || c == 'l' || c == 'L') { 5022 if (--fmtcnt >= 0) 5023 c = *fmt++; 5024 } 5025 } 5026 if (fmtcnt < 0) { 5027 PyErr_SetString(PyExc_ValueError, 5028 "incomplete format"); 5029 goto onError; 5030 } 5031 if (c != '%') { 5032 v = getnextarg(args, arglen, &argidx); 5033 if (v == NULL) 5034 goto onError; 5035 } 5036 sign = 0; 5037 fill = ' '; 5038 switch (c) { 5039 5040 case '%': 5041 pbuf = formatbuf; 5042 /* presume that buffer length is at least 1 */ 5043 pbuf[0] = '%'; 5044 len = 1; 5045 break; 5046 5047 case 's': 5048 case 'r': 5049 if (PyUnicode_Check(v) && c == 's') { 5050 temp = v; 5051 Py_INCREF(temp); 5052 } 5053 else { 5054 PyObject *unicode; 5055 if (c == 's') 5056 temp = PyObject_Str(v); 5057 else 5058 temp = PyObject_Repr(v); 5059 if (temp == NULL) 5060 goto onError; 5061 if (!PyString_Check(temp)) { 5062 /* XXX Note: this should never happen, since 5063 PyObject_Repr() and PyObject_Str() assure 5064 this */ 5065 Py_DECREF(temp); 5066 PyErr_SetString(PyExc_TypeError, 5067 "%s argument has non-string str()"); 5068 goto onError; 5069 } 5070 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5071 PyString_GET_SIZE(temp), 5072 NULL, 5073 "strict"); 5074 Py_DECREF(temp); 5075 temp = unicode; 5076 if (temp == NULL) 5077 goto onError; 5078 } 5079 pbuf = PyUnicode_AS_UNICODE(temp); 5080 len = PyUnicode_GET_SIZE(temp); 5081 if (prec >= 0 && len > prec) 5082 len = 
prec; 5083 break; 5084 5085 case 'i': 5086 case 'd': 5087 case 'u': 5088 case 'o': 5089 case 'x': 5090 case 'X': 5091 if (c == 'i') 5092 c = 'd'; 5093 if (PyLong_Check(v)) { 5094 temp = formatlong(v, flags, prec, c); 5095 if (!temp) 5096 goto onError; 5097 pbuf = PyUnicode_AS_UNICODE(temp); 5098 len = PyUnicode_GET_SIZE(temp); 5099 /* unbounded ints can always produce 5100 a sign character! */ 5101 sign = 1; 5102 } 5103 else { 5104 pbuf = formatbuf; 5105 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5106 flags, prec, c, v); 5107 if (len < 0) 5108 goto onError; 5109 /* only d conversion is signed */ 5110 sign = c == 'd'; 5111 } 5112 if (flags & F_ZERO) 5113 fill = '0'; 5114 break; 5115 5116 case 'e': 5117 case 'E': 5118 case 'f': 5119 case 'g': 5120 case 'G': 5121 pbuf = formatbuf; 5122 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5123 flags, prec, c, v); 5124 if (len < 0) 5125 goto onError; 5126 sign = 1; 5127 if (flags & F_ZERO) 5128 fill = '0'; 5129 break; 5130 5131 case 'c': 5132 pbuf = formatbuf; 5133 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5134 if (len < 0) 5135 goto onError; 5136 break; 5137 5138 default: 5139 PyErr_Format(PyExc_ValueError, 5140 "unsupported format character '%c' (0x%x) " 5141 "at index %i", 5142 (31<=c && c<=126) ? 
c : '?', 5143 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5144 goto onError; 5145 } 5146 if (sign) { 5147 if (*pbuf == '-' || *pbuf == '+') { 5148 sign = *pbuf++; 5149 len--; 5150 } 5151 else if (flags & F_SIGN) 5152 sign = '+'; 5153 else if (flags & F_BLANK) 5154 sign = ' '; 5155 else 5156 sign = 0; 5157 } 5158 if (width < len) 5159 width = len; 5160 if (rescnt < width + (sign != 0)) { 5161 reslen -= rescnt; 5162 rescnt = width + fmtcnt + 100; 5163 reslen += rescnt; 5164 if (_PyUnicode_Resize(&result, reslen) < 0) 5165 return NULL; 5166 res = PyUnicode_AS_UNICODE(result) 5167 + reslen - rescnt; 5168 } 5169 if (sign) { 5170 if (fill != ' ') 5171 *res++ = sign; 5172 rescnt--; 5173 if (width > len) 5174 width--; 5175 } 5176 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5177 assert(pbuf[0] == '0'); 5178 assert(pbuf[1] == c); 5179 if (fill != ' ') { 5180 *res++ = *pbuf++; 5181 *res++ = *pbuf++; 5182 } 5183 rescnt -= 2; 5184 width -= 2; 5185 if (width < 0) 5186 width = 0; 5187 len -= 2; 5188 } 5189 if (width > len && !(flags & F_LJUST)) { 5190 do { 5191 --rescnt; 5192 *res++ = fill; 5193 } while (--width > len); 5194 } 5195 if (fill == ' ') { 5196 if (sign) 5197 *res++ = sign; 5198 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5199 assert(pbuf[0] == '0'); 5200 assert(pbuf[1] == c); 5201 *res++ = *pbuf++; 5202 *res++ = *pbuf++; 5203 } 5204 } 5205 Py_UNICODE_COPY(res, pbuf, len); 5206 res += len; 5207 rescnt -= len; 5208 while (--width >= len) { 5209 --rescnt; 5210 *res++ = ' '; 5211 } 5212 if (dict && (argidx < arglen) && c != '%') { 5213 PyErr_SetString(PyExc_TypeError, 5214 "not all arguments converted"); 5215 goto onError; 5216 } 5217 Py_XDECREF(temp); 5218 } /* '%' */ 5219 } /* until end */ 5220 if (argidx < arglen && !dict) { 5221 PyErr_SetString(PyExc_TypeError, 5222 "not all arguments converted"); 5223 goto onError; 5224 } 5225 5226 if (args_owned) { 5227 Py_DECREF(args); 5228 } 5229 Py_DECREF(uformat); 5230 if (_PyUnicode_Resize(&result, reslen - rescnt)) 
5231 goto onError; 5232 return (PyObject *)result; 5233 5234 onError: 5235 Py_XDECREF(result); 5236 Py_DECREF(uformat); 5237 if (args_owned) { 5238 Py_DECREF(args); 5239 } 5240 return NULL; 5241} 5242 5243static PyBufferProcs unicode_as_buffer = { 5244 (getreadbufferproc) unicode_buffer_getreadbuf, 5245 (getwritebufferproc) unicode_buffer_getwritebuf, 5246 (getsegcountproc) unicode_buffer_getsegcount, 5247 (getcharbufferproc) unicode_buffer_getcharbuf, 5248}; 5249 5250PyTypeObject PyUnicode_Type = { 5251 PyObject_HEAD_INIT(&PyType_Type) 5252 0, /* ob_size */ 5253 "unicode", /* tp_name */ 5254 sizeof(PyUnicodeObject), /* tp_size */ 5255 0, /* tp_itemsize */ 5256 /* Slots */ 5257 (destructor)_PyUnicode_Free, /* tp_dealloc */ 5258 0, /* tp_print */ 5259 (getattrfunc)unicode_getattr, /* tp_getattr */ 5260 0, /* tp_setattr */ 5261 (cmpfunc) unicode_compare, /* tp_compare */ 5262 (reprfunc) unicode_repr, /* tp_repr */ 5263 0, /* tp_as_number */ 5264 &unicode_as_sequence, /* tp_as_sequence */ 5265 0, /* tp_as_mapping */ 5266 (hashfunc) unicode_hash, /* tp_hash*/ 5267 0, /* tp_call*/ 5268 (reprfunc) unicode_str, /* tp_str */ 5269 (getattrofunc) NULL, /* tp_getattro */ 5270 (setattrofunc) NULL, /* tp_setattro */ 5271 &unicode_as_buffer, /* tp_as_buffer */ 5272 Py_TPFLAGS_DEFAULT, /* tp_flags */ 5273}; 5274 5275/* Initialize the Unicode implementation */ 5276 5277void _PyUnicode_Init(void) 5278{ 5279 int i; 5280 5281 /* Doublecheck the configuration... 
*/ 5282 if (sizeof(Py_UNICODE) != 2) 5283 Py_FatalError("Unicode configuration error: " 5284 "sizeof(Py_UNICODE) != 2 bytes"); 5285 5286 /* Init the implementation */ 5287 unicode_freelist = NULL; 5288 unicode_freelist_size = 0; 5289 unicode_empty = _PyUnicode_New(0); 5290 strcpy(unicode_default_encoding, "ascii"); 5291 for (i = 0; i < 256; i++) 5292 unicode_latin1[i] = NULL; 5293} 5294 5295/* Finalize the Unicode implementation */ 5296 5297void 5298_PyUnicode_Fini(void) 5299{ 5300 PyUnicodeObject *u; 5301 int i; 5302 5303 Py_XDECREF(unicode_empty); 5304 unicode_empty = NULL; 5305 5306 for (i = 0; i < 256; i++) { 5307 if (unicode_latin1[i]) { 5308 Py_DECREF(unicode_latin1[i]); 5309 unicode_latin1[i] = NULL; 5310 } 5311 } 5312 5313 for (u = unicode_freelist; u != NULL;) { 5314 PyUnicodeObject *v = u; 5315 u = *(PyUnicodeObject **)u; 5316 if (v->str) 5317 PyMem_DEL(v->str); 5318 Py_XDECREF(v->defenc); 5319 PyObject_DEL(v); 5320 } 5321 unicode_freelist = NULL; 5322 unicode_freelist_size = 0; 5323} 5324