unicodeobject.c revision 2cfe36828342e16cd274b968736a01aed5c49557
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WIN32 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106/* --- Unicode Object ----------------------------------------------------- */ 107 108static 109int unicode_resize(register PyUnicodeObject *unicode, 110 int length) 111{ 112 void *oldstr; 113 114 /* Shortcut if there's nothing much to do. */ 115 if (unicode->length == length) 116 goto reset; 117 118 /* Resizing shared object (unicode_empty or single character 119 objects) in-place is not allowed. Use PyUnicode_Resize() 120 instead ! */ 121 if (unicode == unicode_empty || 122 (unicode->length == 1 && 123 unicode->str[0] < 256 && 124 unicode_latin1[unicode->str[0]] == unicode)) { 125 PyErr_SetString(PyExc_SystemError, 126 "can't resize shared unicode objects"); 127 return -1; 128 } 129 130 /* We allocate one more byte to make sure the string is 131 Ux0000 terminated -- XXX is this needed ? */ 132 oldstr = unicode->str; 133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 134 if (!unicode->str) { 135 unicode->str = oldstr; 136 PyErr_NoMemory(); 137 return -1; 138 } 139 unicode->str[length] = 0; 140 unicode->length = length; 141 142 reset: 143 /* Reset the object caches */ 144 if (unicode->defenc) { 145 Py_DECREF(unicode->defenc); 146 unicode->defenc = NULL; 147 } 148 unicode->hash = -1; 149 150 return 0; 151} 152 153/* We allocate one more byte to make sure the string is 154 Ux0000 terminated -- XXX is this needed ? 155 156 XXX This allocator could further be enhanced by assuring that the 157 free list never reduces its size below 1. 158 159*/ 160 161static 162PyUnicodeObject *_PyUnicode_New(int length) 163{ 164 register PyUnicodeObject *unicode; 165 166 /* Optimization for empty strings */ 167 if (length == 0 && unicode_empty != NULL) { 168 Py_INCREF(unicode_empty); 169 return unicode_empty; 170 } 171 172 /* Unicode freelist & memory allocation */ 173 if (unicode_freelist) { 174 unicode = unicode_freelist; 175 unicode_freelist = *(PyUnicodeObject **)unicode; 176 unicode_freelist_size--; 177 if (unicode->str) { 178 /* Keep-Alive optimization: we only upsize the buffer, 179 never downsize it. */ 180 if ((unicode->length < length) && 181 unicode_resize(unicode, length)) { 182 PyMem_DEL(unicode->str); 183 goto onError; 184 } 185 } 186 else { 187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 188 } 189 PyObject_INIT(unicode, &PyUnicode_Type); 190 } 191 else { 192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type); 193 if (unicode == NULL) 194 return NULL; 195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 196 } 197 198 if (!unicode->str) { 199 PyErr_NoMemory(); 200 goto onError; 201 } 202 unicode->str[length] = 0; 203 unicode->length = length; 204 unicode->hash = -1; 205 unicode->defenc = NULL; 206 return unicode; 207 208 onError: 209 _Py_ForgetReference((PyObject *)unicode); 210 PyObject_DEL(unicode); 211 return NULL; 212} 213 214static 215void _PyUnicode_Free(register PyUnicodeObject *unicode) 216{ 217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 218 /* Keep-Alive optimization */ 219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 220 PyMem_DEL(unicode->str); 221 unicode->str = NULL; 222 unicode->length = 0; 223 } 224 if (unicode->defenc) { 225 Py_DECREF(unicode->defenc); 226 unicode->defenc = NULL; 227 } 228 /* Add to free list */ 229 *(PyUnicodeObject **)unicode = unicode_freelist; 230 unicode_freelist = unicode; 231 unicode_freelist_size++; 232 } 233 else { 234 PyMem_DEL(unicode->str); 235 Py_XDECREF(unicode->defenc); 236 PyObject_DEL(unicode); 237 } 238} 239 240int PyUnicode_Resize(PyObject **unicode, 241 int length) 242{ 243 register PyUnicodeObject *v; 244 245 /* Argument checks */ 246 if (unicode == NULL) { 247 PyErr_BadInternalCall(); 248 return -1; 249 } 250 v = (PyUnicodeObject *)*unicode; 251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 252 PyErr_BadInternalCall(); 253 return -1; 254 } 255 256 /* Resizing unicode_empty and single character objects is not 257 possible since these are being shared. We simply return a fresh 258 copy with the same Unicode content. */ 259 if (v->length != length && 260 (v == unicode_empty || v->length == 1)) { 261 PyUnicodeObject *w = _PyUnicode_New(length); 262 if (w == NULL) 263 return -1; 264 Py_UNICODE_COPY(w->str, v->str, 265 length < v->length ? length : v->length); 266 *unicode = (PyObject *)w; 267 return 0; 268 } 269 270 /* Note that we don't have to modify *unicode for unshared Unicode 271 objects, since we can modify them in-place. */ 272 return unicode_resize(v, length); 273} 274 275/* Internal API for use in unicodeobject.c only ! */ 276#define _PyUnicode_Resize(unicodevar, length) \ 277 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 278 279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 280 int size) 281{ 282 PyUnicodeObject *unicode; 283 284 /* If the Unicode data is known at construction time, we can apply 285 some optimizations which share commonly used objects. */ 286 if (u != NULL) { 287 288 /* Optimization for empty strings */ 289 if (size == 0 && unicode_empty != NULL) { 290 Py_INCREF(unicode_empty); 291 return (PyObject *)unicode_empty; 292 } 293 294 /* Single character Unicode objects in the Latin-1 range are 295 shared when using this constructor */ 296 if (size == 1 && *u < 256) { 297 unicode = unicode_latin1[*u]; 298 if (!unicode) { 299 unicode = _PyUnicode_New(1); 300 unicode->str[0] = *u; 301 if (!unicode) 302 return NULL; 303 unicode_latin1[*u] = unicode; 304 } 305 Py_INCREF(unicode); 306 return (PyObject *)unicode; 307 } 308 } 309 310 unicode = _PyUnicode_New(size); 311 if (!unicode) 312 return NULL; 313 314 /* Copy the Unicode data into the new object */ 315 if (u != NULL) 316 Py_UNICODE_COPY(unicode->str, u, size); 317 318 return (PyObject *)unicode; 319} 320 321#ifdef HAVE_WCHAR_H 322 323PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 324 int size) 325{ 326 PyUnicodeObject *unicode; 327 328 if (w == NULL) { 329 PyErr_BadInternalCall(); 330 return NULL; 331 } 332 333 unicode = _PyUnicode_New(size); 334 if (!unicode) 335 return NULL; 336 337 /* Copy the wchar_t data into the new object */ 338#ifdef HAVE_USABLE_WCHAR_T 339 memcpy(unicode->str, w, size * sizeof(wchar_t)); 340#else 341 { 342 register Py_UNICODE *u; 343 register int i; 344 u = PyUnicode_AS_UNICODE(unicode); 345 for (i = size; i >= 0; i--) 346 *u++ = *w++; 347 } 348#endif 349 350 return (PyObject *)unicode; 351} 352 353int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 354 register wchar_t *w, 355 int size) 356{ 357 if (unicode == NULL) { 358 PyErr_BadInternalCall(); 359 return -1; 360 } 361 if (size > PyUnicode_GET_SIZE(unicode)) 362 size = PyUnicode_GET_SIZE(unicode); 363#ifdef HAVE_USABLE_WCHAR_T 364 memcpy(w, unicode->str, size * sizeof(wchar_t)); 365#else 366 { 367 register Py_UNICODE *u; 368 register int i; 369 u = PyUnicode_AS_UNICODE(unicode); 370 for (i = size; i >= 0; i--) 371 *w++ = *u++; 372 } 373#endif 374 375 return size; 376} 377 378#endif 379 380PyObject *PyUnicode_FromObject(register PyObject *obj) 381{ 382 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 383} 384 385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 386 const char *encoding, 387 const char *errors) 388{ 389 const char *s; 390 int len; 391 int owned = 0; 392 PyObject *v; 393 394 if (obj == NULL) { 395 PyErr_BadInternalCall(); 396 return NULL; 397 } 398 399 /* Coerce object */ 400 if (PyInstance_Check(obj)) { 401 PyObject *func; 402 func = PyObject_GetAttrString(obj, "__str__"); 403 if (func == NULL) { 404 PyErr_SetString(PyExc_TypeError, 405 "coercing to Unicode: instance doesn't define __str__"); 406 return NULL; 407 } 408 obj = PyEval_CallObject(func, NULL); 409 Py_DECREF(func); 410 if (obj == NULL) 411 return NULL; 412 owned = 1; 413 } 414 if (PyUnicode_Check(obj)) { 415 Py_INCREF(obj); 416 v = obj; 417 if (encoding) { 418 PyErr_SetString(PyExc_TypeError, 419 "decoding Unicode is not supported"); 420 return NULL; 421 } 422 goto done; 423 } 424 else if (PyString_Check(obj)) { 425 s = PyString_AS_STRING(obj); 426 len = PyString_GET_SIZE(obj); 427 } 428 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 429 /* Overwrite the error message with something more useful in 430 case of a TypeError. */ 431 if (PyErr_ExceptionMatches(PyExc_TypeError)) 432 PyErr_Format(PyExc_TypeError, 433 "coercing to Unicode: need string or buffer, " 434 "%.80s found", 435 obj->ob_type->tp_name); 436 goto onError; 437 } 438 439 /* Convert to Unicode */ 440 if (len == 0) { 441 Py_INCREF(unicode_empty); 442 v = (PyObject *)unicode_empty; 443 } 444 else 445 v = PyUnicode_Decode(s, len, encoding, errors); 446 447 done: 448 if (owned) { 449 Py_DECREF(obj); 450 } 451 return v; 452 453 onError: 454 if (owned) { 455 Py_DECREF(obj); 456 } 457 return NULL; 458} 459 460PyObject *PyUnicode_Decode(const char *s, 461 int size, 462 const char *encoding, 463 const char *errors) 464{ 465 PyObject *buffer = NULL, *unicode; 466 467 if (encoding == NULL) 468 encoding = PyUnicode_GetDefaultEncoding(); 469 470 /* Shortcuts for common default encodings */ 471 if (strcmp(encoding, "utf-8") == 0) 472 return PyUnicode_DecodeUTF8(s, size, errors); 473 else if (strcmp(encoding, "latin-1") == 0) 474 return PyUnicode_DecodeLatin1(s, size, errors); 475 else if (strcmp(encoding, "ascii") == 0) 476 return PyUnicode_DecodeASCII(s, size, errors); 477 478 /* Decode via the codec registry */ 479 buffer = PyBuffer_FromMemory((void *)s, size); 480 if (buffer == NULL) 481 goto onError; 482 unicode = PyCodec_Decode(buffer, encoding, errors); 483 if (unicode == NULL) 484 goto onError; 485 if (!PyUnicode_Check(unicode)) { 486 PyErr_Format(PyExc_TypeError, 487 "decoder did not return an unicode object (type=%.400s)", 488 unicode->ob_type->tp_name); 489 Py_DECREF(unicode); 490 goto onError; 491 } 492 Py_DECREF(buffer); 493 return unicode; 494 495 onError: 496 Py_XDECREF(buffer); 497 return NULL; 498} 499 500PyObject *PyUnicode_Encode(const Py_UNICODE *s, 501 int size, 502 const char *encoding, 503 const char *errors) 504{ 505 PyObject *v, *unicode; 506 507 unicode = PyUnicode_FromUnicode(s, size); 508 if (unicode == NULL) 509 return NULL; 510 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 511 Py_DECREF(unicode); 512 return v; 513} 514 515PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 516 const char *encoding, 517 const char *errors) 518{ 519 PyObject *v; 520 521 if (!PyUnicode_Check(unicode)) { 522 PyErr_BadArgument(); 523 goto onError; 524 } 525 526 if (encoding == NULL) 527 encoding = PyUnicode_GetDefaultEncoding(); 528 529 /* Shortcuts for common default encodings */ 530 if (errors == NULL) { 531 if (strcmp(encoding, "utf-8") == 0) 532 return PyUnicode_AsUTF8String(unicode); 533 else if (strcmp(encoding, "latin-1") == 0) 534 return PyUnicode_AsLatin1String(unicode); 535 else if (strcmp(encoding, "ascii") == 0) 536 return PyUnicode_AsASCIIString(unicode); 537 } 538 539 /* Encode via the codec registry */ 540 v = PyCodec_Encode(unicode, encoding, errors); 541 if (v == NULL) 542 goto onError; 543 /* XXX Should we really enforce this ? */ 544 if (!PyString_Check(v)) { 545 PyErr_Format(PyExc_TypeError, 546 "encoder did not return a string object (type=%.400s)", 547 v->ob_type->tp_name); 548 Py_DECREF(v); 549 goto onError; 550 } 551 return v; 552 553 onError: 554 return NULL; 555} 556 557/* Return a Python string holding the default encoded value of the 558 Unicode object. 559 560 The resulting string is cached in the Unicode object for subsequent 561 usage by this function. The cached version is needed to implement 562 the character buffer interface and will live (at least) as long as 563 the Unicode object itself. 564 565 The refcount of the string is *not* incremented. 566 567 *** Exported for internal use by the interpreter only !!! *** 568 569*/ 570 571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 572 const char *errors) 573{ 574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 575 576 if (v) 577 return v; 578 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 579 if (v && errors == NULL) 580 ((PyUnicodeObject *)unicode)->defenc = v; 581 return v; 582} 583 584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 585{ 586 if (!PyUnicode_Check(unicode)) { 587 PyErr_BadArgument(); 588 goto onError; 589 } 590 return PyUnicode_AS_UNICODE(unicode); 591 592 onError: 593 return NULL; 594} 595 596int PyUnicode_GetSize(PyObject *unicode) 597{ 598 if (!PyUnicode_Check(unicode)) { 599 PyErr_BadArgument(); 600 goto onError; 601 } 602 return PyUnicode_GET_SIZE(unicode); 603 604 onError: 605 return -1; 606} 607 608const char *PyUnicode_GetDefaultEncoding(void) 609{ 610 return unicode_default_encoding; 611} 612 613int PyUnicode_SetDefaultEncoding(const char *encoding) 614{ 615 PyObject *v; 616 617 /* Make sure the encoding is valid. As side effect, this also 618 loads the encoding into the codec registry cache. */ 619 v = _PyCodec_Lookup(encoding); 620 if (v == NULL) 621 goto onError; 622 Py_DECREF(v); 623 strncpy(unicode_default_encoding, 624 encoding, 625 sizeof(unicode_default_encoding)); 626 return 0; 627 628 onError: 629 return -1; 630} 631 632/* --- UTF-8 Codec -------------------------------------------------------- */ 633 634static 635char utf8_code_length[256] = { 636 /* Map UTF-8 encoded prefix byte to sequence length. zero means 637 illegal prefix. see RFC 2279 for details */ 638 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 639 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 640 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 641 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 642 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 643 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 644 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 645 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 649 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 650 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 651 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 652 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 653 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 654}; 655 656static 657int utf8_decoding_error(const char **source, 658 Py_UNICODE **dest, 659 const char *errors, 660 const char *details) 661{ 662 if ((errors == NULL) || 663 (strcmp(errors,"strict") == 0)) { 664 PyErr_Format(PyExc_UnicodeError, 665 "UTF-8 decoding error: %.400s", 666 details); 667 return -1; 668 } 669 else if (strcmp(errors,"ignore") == 0) { 670 (*source)++; 671 return 0; 672 } 673 else if (strcmp(errors,"replace") == 0) { 674 (*source)++; 675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 676 (*dest)++; 677 return 0; 678 } 679 else { 680 PyErr_Format(PyExc_ValueError, 681 "UTF-8 decoding error; unknown error handling code: %.400s", 682 errors); 683 return -1; 684 } 685} 686 687PyObject *PyUnicode_DecodeUTF8(const char *s, 688 int size, 689 const char *errors) 690{ 691 int n; 692 const char *e; 693 PyUnicodeObject *unicode; 694 Py_UNICODE *p; 695 const char *errmsg = ""; 696 697 /* Note: size will always be longer than the resulting Unicode 698 character count */ 699 unicode = _PyUnicode_New(size); 700 if (!unicode) 701 return NULL; 702 if (size == 0) 703 return (PyObject *)unicode; 704 705 /* Unpack UTF-8 encoded data */ 706 p = unicode->str; 707 e = s + size; 708 709 while (s < e) { 710 Py_UCS4 ch = (unsigned char)*s; 711 712 if (ch < 0x80) { 713 *p++ = (Py_UNICODE)ch; 714 s++; 715 continue; 716 } 717 718 n = utf8_code_length[ch]; 719 720 if (s + n > e) { 721 errmsg = "unexpected end of data"; 722 goto utf8Error; 723 } 724 725 switch (n) { 726 727 case 0: 728 errmsg = "unexpected code byte"; 729 goto utf8Error; 730 731 case 1: 732 errmsg = "internal error"; 733 goto utf8Error; 734 735 case 2: 736 if ((s[1] & 0xc0) != 0x80) { 737 errmsg = "invalid data"; 738 goto utf8Error; 739 } 740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 741 if (ch < 0x80) { 742 errmsg = "illegal encoding"; 743 goto utf8Error; 744 } 745 else 746 *p++ = (Py_UNICODE)ch; 747 break; 748 749 case 3: 750 if ((s[1] & 0xc0) != 0x80 || 751 (s[2] & 0xc0) != 0x80) { 752 errmsg = "invalid data"; 753 goto utf8Error; 754 } 755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 757 errmsg = "illegal encoding"; 758 goto utf8Error; 759 } 760 else 761 *p++ = (Py_UNICODE)ch; 762 break; 763 764 case 4: 765 if ((s[1] & 0xc0) != 0x80 || 766 (s[2] & 0xc0) != 0x80 || 767 (s[3] & 0xc0) != 0x80) { 768 errmsg = "invalid data"; 769 goto utf8Error; 770 } 771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 773 /* validate and convert to UTF-16 */ 774 if ((ch < 0x10000) || /* minimum value allowed for 4 775 byte encoding */ 776 (ch > 0x10ffff)) { /* maximum value allowed for 777 UTF-16 */ 778 errmsg = "illegal encoding"; 779 goto utf8Error; 780 } 781 /* compute and append the two surrogates: */ 782 783 /* translate from 10000..10FFFF to 0..FFFF */ 784 ch -= 0x10000; 785 786 /* high surrogate = top 10 bits added to D800 */ 787 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 788 789 /* low surrogate = bottom 10 bits added to DC00 */ 790 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00)); 791 break; 792 793 default: 794 /* Other sizes are only needed for UCS-4 */ 795 errmsg = "unsupported Unicode code range"; 796 goto utf8Error; 797 } 798 s += n; 799 continue; 800 801 utf8Error: 802 if (utf8_decoding_error(&s, &p, errors, errmsg)) 803 goto onError; 804 } 805 806 /* Adjust length */ 807 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 808 goto onError; 809 810 return (PyObject *)unicode; 811 812onError: 813 Py_DECREF(unicode); 814 return NULL; 815} 816 817/* Not used anymore, now that the encoder supports UTF-16 818 surrogates. */ 819#if 0 820static 821int utf8_encoding_error(const Py_UNICODE **source, 822 char **dest, 823 const char *errors, 824 const char *details) 825{ 826 if ((errors == NULL) || 827 (strcmp(errors,"strict") == 0)) { 828 PyErr_Format(PyExc_UnicodeError, 829 "UTF-8 encoding error: %.400s", 830 details); 831 return -1; 832 } 833 else if (strcmp(errors,"ignore") == 0) { 834 return 0; 835 } 836 else if (strcmp(errors,"replace") == 0) { 837 **dest = '?'; 838 (*dest)++; 839 return 0; 840 } 841 else { 842 PyErr_Format(PyExc_ValueError, 843 "UTF-8 encoding error; " 844 "unknown error handling code: %.400s", 845 errors); 846 return -1; 847 } 848} 849#endif 850 851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 852 int size, 853 const char *errors) 854{ 855 PyObject *v; 856 char *p; 857 char *q; 858 Py_UCS4 ch2; 859 unsigned int cbAllocated = 3 * size; 860 unsigned int cbWritten = 0; 861 int i = 0; 862 863 v = PyString_FromStringAndSize(NULL, cbAllocated); 864 if (v == NULL) 865 return NULL; 866 if (size == 0) 867 return v; 868 869 p = q = PyString_AS_STRING(v); 870 while (i < size) { 871 Py_UCS4 ch = s[i++]; 872 if (ch < 0x80) { 873 *p++ = (char) ch; 874 cbWritten++; 875 } 876 else if (ch < 0x0800) { 877 *p++ = 0xc0 | (ch >> 6); 878 *p++ = 0x80 | (ch & 0x3f); 879 cbWritten += 2; 880 } 881 else { 882 /* Check for high surrogate */ 883 if (0xD800 <= ch && ch <= 0xDBFF) { 884 if (i != size) { 885 ch2 = s[i]; 886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 887 888 if (cbWritten >= (cbAllocated - 4)) { 889 /* Provide enough room for some more 890 surrogates */ 891 cbAllocated += 4*10; 892 if (_PyString_Resize(&v, cbAllocated)) 893 goto onError; 894 } 895 896 /* combine the two values */ 897 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 898 899 *p++ = (char)((ch >> 18) | 0xf0); 900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 901 i++; 902 cbWritten += 4; 903 } 904 } 905 } 906 else { 907 *p++ = (char)(0xe0 | (ch >> 12)); 908 cbWritten += 3; 909 } 910 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 911 *p++ = (char)(0x80 | (ch & 0x3f)); 912 } 913 } 914 *p = '\0'; 915 if (_PyString_Resize(&v, p - q)) 916 goto onError; 917 return v; 918 919 onError: 920 Py_DECREF(v); 921 return NULL; 922} 923 924PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 925{ 926 if (!PyUnicode_Check(unicode)) { 927 PyErr_BadArgument(); 928 return NULL; 929 } 930 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 931 PyUnicode_GET_SIZE(unicode), 932 NULL); 933} 934 935/* --- UTF-16 Codec ------------------------------------------------------- */ 936 937static 938int utf16_decoding_error(const Py_UNICODE **source, 939 Py_UNICODE **dest, 940 const char *errors, 941 const char *details) 942{ 943 if ((errors == NULL) || 944 (strcmp(errors,"strict") == 0)) { 945 PyErr_Format(PyExc_UnicodeError, 946 "UTF-16 decoding error: %.400s", 947 details); 948 return -1; 949 } 950 else if (strcmp(errors,"ignore") == 0) { 951 return 0; 952 } 953 else if (strcmp(errors,"replace") == 0) { 954 if (dest) { 955 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 956 (*dest)++; 957 } 958 return 0; 959 } 960 else { 961 PyErr_Format(PyExc_ValueError, 962 "UTF-16 decoding error; " 963 "unknown error handling code: %.400s", 964 errors); 965 return -1; 966 } 967} 968 969PyObject *PyUnicode_DecodeUTF16(const char *s, 970 int size, 971 const char *errors, 972 int *byteorder) 973{ 974 PyUnicodeObject *unicode; 975 Py_UNICODE *p; 976 const Py_UNICODE *q, *e; 977 int bo = 0; 978 const char *errmsg = ""; 979 980 /* size should be an even number */ 981 if (size % sizeof(Py_UNICODE) != 0) { 982 if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) 983 return NULL; 984 /* The remaining input chars are ignored if we fall through 985 here... */ 986 } 987 988 /* Note: size will always be longer than the resulting Unicode 989 character count */ 990 unicode = _PyUnicode_New(size); 991 if (!unicode) 992 return NULL; 993 if (size == 0) 994 return (PyObject *)unicode; 995 996 /* Unpack UTF-16 encoded data */ 997 p = unicode->str; 998 q = (Py_UNICODE *)s; 999 e = q + (size / sizeof(Py_UNICODE)); 1000 1001 if (byteorder) 1002 bo = *byteorder; 1003 1004 while (q < e) { 1005 register Py_UNICODE ch = *q++; 1006 1007 /* Check for BOM marks (U+FEFF) in the input and adjust 1008 current byte order setting accordingly. Swap input 1009 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 1010 !) */ 1011#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1012 if (ch == 0xFEFF) { 1013 bo = -1; 1014 continue; 1015 } else if (ch == 0xFFFE) { 1016 bo = 1; 1017 continue; 1018 } 1019 if (bo == 1) 1020 ch = (ch >> 8) | (ch << 8); 1021#else 1022 if (ch == 0xFEFF) { 1023 bo = 1; 1024 continue; 1025 } else if (ch == 0xFFFE) { 1026 bo = -1; 1027 continue; 1028 } 1029 if (bo == -1) 1030 ch = (ch >> 8) | (ch << 8); 1031#endif 1032 if (ch < 0xD800 || ch > 0xDFFF) { 1033 *p++ = ch; 1034 continue; 1035 } 1036 1037 /* UTF-16 code pair: */ 1038 if (q >= e) { 1039 errmsg = "unexpected end of data"; 1040 goto utf16Error; 1041 } 1042 if (0xDC00 <= *q && *q <= 0xDFFF) { 1043 q++; 1044 if (0xD800 <= *q && *q <= 0xDBFF) { 1045 /* This is valid data (a UTF-16 surrogate pair), but 1046 we are not able to store this information since our 1047 Py_UNICODE type only has 16 bits... this might 1048 change someday, even though it's unlikely. */ 1049 errmsg = "code pairs are not supported"; 1050 goto utf16Error; 1051 } 1052 else 1053 continue; 1054 } 1055 errmsg = "illegal encoding"; 1056 /* Fall through to report the error */ 1057 1058 utf16Error: 1059 if (utf16_decoding_error(&q, &p, errors, errmsg)) 1060 goto onError; 1061 } 1062 1063 if (byteorder) 1064 *byteorder = bo; 1065 1066 /* Adjust length */ 1067 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1068 goto onError; 1069 1070 return (PyObject *)unicode; 1071 1072onError: 1073 Py_DECREF(unicode); 1074 return NULL; 1075} 1076 1077#undef UTF16_ERROR 1078 1079PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1080 int size, 1081 const char *errors, 1082 int byteorder) 1083{ 1084 PyObject *v; 1085 Py_UNICODE *p; 1086 char *q; 1087 1088 /* We don't create UTF-16 pairs... */ 1089 v = PyString_FromStringAndSize(NULL, 1090 sizeof(Py_UNICODE) * (size + (byteorder == 0))); 1091 if (v == NULL) 1092 return NULL; 1093 1094 q = PyString_AS_STRING(v); 1095 p = (Py_UNICODE *)q; 1096 if (byteorder == 0) 1097 *p++ = 0xFEFF; 1098 if (size == 0) 1099 return v; 1100 if (byteorder == 0 || 1101#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1102 byteorder == -1 1103#else 1104 byteorder == 1 1105#endif 1106 ) 1107 Py_UNICODE_COPY(p, s, size); 1108 else 1109 while (size-- > 0) { 1110 Py_UNICODE ch = *s++; 1111 *p++ = (ch >> 8) | (ch << 8); 1112 } 1113 return v; 1114} 1115 1116PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1117{ 1118 if (!PyUnicode_Check(unicode)) { 1119 PyErr_BadArgument(); 1120 return NULL; 1121 } 1122 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1123 PyUnicode_GET_SIZE(unicode), 1124 NULL, 1125 0); 1126} 1127 1128/* --- Unicode Escape Codec ----------------------------------------------- */ 1129 1130static 1131int unicodeescape_decoding_error(const char **source, 1132 Py_UNICODE *x, 1133 const char *errors, 1134 const char *details) 1135{ 1136 if ((errors == NULL) || 1137 (strcmp(errors,"strict") == 0)) { 1138 PyErr_Format(PyExc_UnicodeError, 1139 "Unicode-Escape decoding error: %.400s", 1140 details); 1141 return -1; 1142 } 1143 else if (strcmp(errors,"ignore") == 0) { 1144 return 0; 1145 } 1146 else if (strcmp(errors,"replace") == 0) { 1147 *x = Py_UNICODE_REPLACEMENT_CHARACTER; 1148 return 0; 1149 } 1150 else { 1151 PyErr_Format(PyExc_ValueError, 1152 "Unicode-Escape decoding error; " 1153 "unknown error handling code: %.400s", 1154 errors); 1155 return -1; 1156 } 1157} 1158 1159static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1160 1161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1162 int size, 1163 const char *errors) 1164{ 1165 PyUnicodeObject *v; 1166 Py_UNICODE *p, *buf; 1167 const char *end; 1168 char* message; 1169 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1170 1171 /* Escaped strings will always be longer than the resulting 1172 Unicode string, so we start with size here and then reduce the 1173 length after conversion to the true value. */ 1174 v = _PyUnicode_New(size); 1175 if (v == NULL) 1176 goto onError; 1177 if (size == 0) 1178 return (PyObject *)v; 1179 1180 p = buf = PyUnicode_AS_UNICODE(v); 1181 end = s + size; 1182 1183 while (s < end) { 1184 unsigned char c; 1185 Py_UNICODE x; 1186 int i, digits; 1187 1188 /* Non-escape characters are interpreted as Unicode ordinals */ 1189 if (*s != '\\') { 1190 *p++ = (unsigned char) *s++; 1191 continue; 1192 } 1193 1194 /* \ - Escapes */ 1195 s++; 1196 switch (*s++) { 1197 1198 /* \x escapes */ 1199 case '\n': break; 1200 case '\\': *p++ = '\\'; break; 1201 case '\'': *p++ = '\''; break; 1202 case '\"': *p++ = '\"'; break; 1203 case 'b': *p++ = '\b'; break; 1204 case 'f': *p++ = '\014'; break; /* FF */ 1205 case 't': *p++ = '\t'; break; 1206 case 'n': *p++ = '\n'; break; 1207 case 'r': *p++ = '\r'; break; 1208 case 'v': *p++ = '\013'; break; /* VT */ 1209 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1210 1211 /* \OOO (octal) escapes */ 1212 case '0': case '1': case '2': case '3': 1213 case '4': case '5': case '6': case '7': 1214 x = s[-1] - '0'; 1215 if ('0' <= *s && *s <= '7') { 1216 x = (x<<3) + *s++ - '0'; 1217 if ('0' <= *s && *s <= '7') 1218 x = (x<<3) + *s++ - '0'; 1219 } 1220 *p++ = x; 1221 break; 1222 1223 /* hex escapes */ 1224 /* \xXX */ 1225 case 'x': 1226 digits = 2; 1227 message = "truncated \\xXX escape"; 1228 goto hexescape; 1229 1230 /* \uXXXX */ 1231 case 'u': 1232 digits = 4; 1233 message = "truncated \\uXXXX escape"; 1234 goto hexescape; 1235 1236 /* \UXXXXXXXX */ 1237 case 'U': 1238 digits = 8; 1239 message = "truncated \\UXXXXXXXX escape"; 1240 hexescape: 1241 chr = 0; 1242 for (i = 0; i < digits; i++) { 1243 c = (unsigned char) s[i]; 1244 if (!isxdigit(c)) { 1245 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1246 goto onError; 1247 chr = x; 1248 i++; 1249 break; 1250 } 1251 chr = (chr<<4) & ~0xF; 1252 if (c >= '0' && c <= '9') 1253 chr += c - '0'; 1254 else if (c >= 'a' && c <= 'f') 1255 chr += 10 + c - 'a'; 1256 else 1257 chr += 10 + c - 'A'; 1258 } 1259 s += i; 1260 store: 1261 /* when we get here, chr is a 32-bit unicode character */ 1262 if (chr <= 0xffff) 1263 /* UCS-2 character */ 1264 *p++ = (Py_UNICODE) chr; 1265 else if (chr <= 0x10ffff) { 1266 /* UCS-4 character. store as two surrogate characters */ 1267 chr -= 0x10000L; 1268 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1269 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00); 1270 } else { 1271 if (unicodeescape_decoding_error( 1272 &s, &x, errors, 1273 "illegal Unicode character") 1274 ) 1275 goto onError; 1276 *p++ = x; /* store replacement character */ 1277 } 1278 break; 1279 1280 /* \N{name} */ 1281 case 'N': 1282 message = "malformed \\N character escape"; 1283 if (ucnhash_CAPI == NULL) { 1284 /* load the unicode data module */ 1285 PyObject *m, *v; 1286 m = PyImport_ImportModule("unicodedata"); 1287 if (m == NULL) 1288 goto ucnhashError; 1289 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1290 Py_DECREF(m); 1291 if (v == NULL) 1292 goto ucnhashError; 1293 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1294 Py_DECREF(v); 1295 if (ucnhash_CAPI == NULL) 1296 goto ucnhashError; 1297 } 1298 if (*s == '{') { 1299 const char *start = s+1; 1300 /* look for the closing brace */ 1301 while (*s != '}' && s < end) 1302 s++; 1303 if (s > start && s < end && *s == '}') { 1304 /* found a name. look it up in the unicode database */ 1305 message = "unknown Unicode character name"; 1306 s++; 1307 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1308 goto store; 1309 } 1310 } 1311 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1312 goto onError; 1313 *p++ = x; 1314 break; 1315 1316 default: 1317 *p++ = '\\'; 1318 *p++ = (unsigned char)s[-1]; 1319 break; 1320 } 1321 } 1322 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1323 goto onError; 1324 return (PyObject *)v; 1325 1326ucnhashError: 1327 PyErr_SetString( 1328 PyExc_UnicodeError, 1329 "\\N escapes not supported (can't load unicodedata module)" 1330 ); 1331 return NULL; 1332 1333onError: 1334 Py_XDECREF(v); 1335 return NULL; 1336} 1337 1338/* Return a Unicode-Escape string version of the Unicode object. 1339 1340 If quotes is true, the string is enclosed in u"" or u'' quotes as 1341 appropriate. 1342 1343*/ 1344 1345static const Py_UNICODE *findchar(const Py_UNICODE *s, 1346 int size, 1347 Py_UNICODE ch); 1348 1349static 1350PyObject *unicodeescape_string(const Py_UNICODE *s, 1351 int size, 1352 int quotes) 1353{ 1354 PyObject *repr; 1355 char *p; 1356 char *q; 1357 1358 static const char *hexdigit = "0123456789abcdef"; 1359 1360 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1361 if (repr == NULL) 1362 return NULL; 1363 1364 p = q = PyString_AS_STRING(repr); 1365 1366 if (quotes) { 1367 *p++ = 'u'; 1368 *p++ = (findchar(s, size, '\'') && 1369 !findchar(s, size, '"')) ? '"' : '\''; 1370 } 1371 while (size-- > 0) { 1372 Py_UNICODE ch = *s++; 1373 /* Escape quotes */ 1374 if (quotes && (ch == q[1] || ch == '\\')) { 1375 *p++ = '\\'; 1376 *p++ = (char) ch; 1377 } 1378 /* Map 16-bit characters to '\uxxxx' */ 1379 else if (ch >= 256) { 1380 *p++ = '\\'; 1381 *p++ = 'u'; 1382 *p++ = hexdigit[(ch >> 12) & 0xf]; 1383 *p++ = hexdigit[(ch >> 8) & 0xf]; 1384 *p++ = hexdigit[(ch >> 4) & 0xf]; 1385 *p++ = hexdigit[ch & 15]; 1386 } 1387 /* Map special whitespace to '\t', \n', '\r' */ 1388 else if (ch == '\t') { 1389 *p++ = '\\'; 1390 *p++ = 't'; 1391 } 1392 else if (ch == '\n') { 1393 *p++ = '\\'; 1394 *p++ = 'n'; 1395 } 1396 else if (ch == '\r') { 1397 *p++ = '\\'; 1398 *p++ = 'r'; 1399 } 1400 /* Map non-printable US ASCII to '\xhh' */ 1401 else if (ch < ' ' || ch >= 128) { 1402 *p++ = '\\'; 1403 *p++ = 'x'; 1404 *p++ = hexdigit[(ch >> 4) & 0xf]; 1405 *p++ = hexdigit[ch & 15]; 1406 } 1407 /* Copy everything else as-is */ 1408 else 1409 *p++ = (char) ch; 1410 } 1411 if (quotes) 1412 *p++ = q[1]; 1413 1414 *p = '\0'; 1415 if (_PyString_Resize(&repr, p - q)) 1416 goto onError; 1417 1418 return repr; 1419 1420 onError: 1421 Py_DECREF(repr); 1422 return NULL; 1423} 1424 1425PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1426 int size) 1427{ 1428 return unicodeescape_string(s, size, 0); 1429} 1430 1431PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1432{ 1433 if (!PyUnicode_Check(unicode)) { 1434 PyErr_BadArgument(); 1435 return NULL; 1436 } 1437 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1438 PyUnicode_GET_SIZE(unicode)); 1439} 1440 1441/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1442 1443PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1444 int size, 1445 const char *errors) 1446{ 1447 PyUnicodeObject *v; 1448 Py_UNICODE *p, *buf; 1449 const char *end; 1450 const char *bs; 1451 1452 /* Escaped strings will always be longer than the resulting 1453 Unicode string, so we start with size here and then reduce the 1454 length after conversion to the true value. */ 1455 v = _PyUnicode_New(size); 1456 if (v == NULL) 1457 goto onError; 1458 if (size == 0) 1459 return (PyObject *)v; 1460 p = buf = PyUnicode_AS_UNICODE(v); 1461 end = s + size; 1462 while (s < end) { 1463 unsigned char c; 1464 Py_UNICODE x; 1465 int i; 1466 1467 /* Non-escape characters are interpreted as Unicode ordinals */ 1468 if (*s != '\\') { 1469 *p++ = (unsigned char)*s++; 1470 continue; 1471 } 1472 1473 /* \u-escapes are only interpreted iff the number of leading 1474 backslashes if odd */ 1475 bs = s; 1476 for (;s < end;) { 1477 if (*s != '\\') 1478 break; 1479 *p++ = (unsigned char)*s++; 1480 } 1481 if (((s - bs) & 1) == 0 || 1482 s >= end || 1483 *s != 'u') { 1484 continue; 1485 } 1486 p--; 1487 s++; 1488 1489 /* \uXXXX with 4 hex digits */ 1490 for (x = 0, i = 0; i < 4; i++) { 1491 c = (unsigned char)s[i]; 1492 if (!isxdigit(c)) { 1493 if (unicodeescape_decoding_error(&s, &x, errors, 1494 "truncated \\uXXXX")) 1495 goto onError; 1496 i++; 1497 break; 1498 } 1499 x = (x<<4) & ~0xF; 1500 if (c >= '0' && c <= '9') 1501 x += c - '0'; 1502 else if (c >= 'a' && c <= 'f') 1503 x += 10 + c - 'a'; 1504 else 1505 x += 10 + c - 'A'; 1506 } 1507 s += i; 1508 *p++ = x; 1509 } 1510 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1511 goto onError; 1512 return (PyObject *)v; 1513 1514 onError: 1515 Py_XDECREF(v); 1516 return NULL; 1517} 1518 1519PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1520 int size) 1521{ 1522 PyObject *repr; 1523 char *p; 1524 char *q; 1525 1526 static const char *hexdigit = "0123456789abcdef"; 1527 1528 repr = PyString_FromStringAndSize(NULL, 6 * size); 1529 if (repr == NULL) 1530 return NULL; 1531 if (size == 0) 1532 return repr; 1533 1534 p = q = PyString_AS_STRING(repr); 1535 while (size-- > 0) { 1536 Py_UNICODE ch = *s++; 1537 /* Map 16-bit characters to '\uxxxx' */ 1538 if (ch >= 256) { 1539 *p++ = '\\'; 1540 *p++ = 'u'; 1541 *p++ = hexdigit[(ch >> 12) & 0xf]; 1542 *p++ = hexdigit[(ch >> 8) & 0xf]; 1543 *p++ = hexdigit[(ch >> 4) & 0xf]; 1544 *p++ = hexdigit[ch & 15]; 1545 } 1546 /* Copy everything else as-is */ 1547 else 1548 *p++ = (char) ch; 1549 } 1550 *p = '\0'; 1551 if (_PyString_Resize(&repr, p - q)) 1552 goto onError; 1553 1554 return repr; 1555 1556 onError: 1557 Py_DECREF(repr); 1558 return NULL; 1559} 1560 1561PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1562{ 1563 if (!PyUnicode_Check(unicode)) { 1564 PyErr_BadArgument(); 1565 return NULL; 1566 } 1567 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1568 PyUnicode_GET_SIZE(unicode)); 1569} 1570 1571/* --- Latin-1 Codec ------------------------------------------------------ */ 1572 1573PyObject *PyUnicode_DecodeLatin1(const char *s, 1574 int size, 1575 const char *errors) 1576{ 1577 PyUnicodeObject *v; 1578 Py_UNICODE *p; 1579 1580 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 1581 if (size == 1 && *(unsigned char*)s < 256) { 1582 Py_UNICODE r = *(unsigned char*)s; 1583 return PyUnicode_FromUnicode(&r, 1); 1584 } 1585 1586 v = _PyUnicode_New(size); 1587 if (v == NULL) 1588 goto onError; 1589 if (size == 0) 1590 return (PyObject *)v; 1591 p = PyUnicode_AS_UNICODE(v); 1592 while (size-- > 0) 1593 *p++ = (unsigned char)*s++; 1594 return (PyObject *)v; 1595 1596 onError: 1597 Py_XDECREF(v); 1598 return NULL; 1599} 1600 1601static 1602int latin1_encoding_error(const Py_UNICODE **source, 1603 char **dest, 1604 const char *errors, 1605 const char *details) 1606{ 1607 if ((errors == NULL) || 1608 (strcmp(errors,"strict") == 0)) { 1609 PyErr_Format(PyExc_UnicodeError, 1610 "Latin-1 encoding error: %.400s", 1611 details); 1612 return -1; 1613 } 1614 else if (strcmp(errors,"ignore") == 0) { 1615 return 0; 1616 } 1617 else if (strcmp(errors,"replace") == 0) { 1618 **dest = '?'; 1619 (*dest)++; 1620 return 0; 1621 } 1622 else { 1623 PyErr_Format(PyExc_ValueError, 1624 "Latin-1 encoding error; " 1625 "unknown error handling code: %.400s", 1626 errors); 1627 return -1; 1628 } 1629} 1630 1631PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 1632 int size, 1633 const char *errors) 1634{ 1635 PyObject *repr; 1636 char *s, *start; 1637 1638 repr = PyString_FromStringAndSize(NULL, size); 1639 if (repr == NULL) 1640 return NULL; 1641 if (size == 0) 1642 return repr; 1643 1644 s = PyString_AS_STRING(repr); 1645 start = s; 1646 while (size-- > 0) { 1647 Py_UNICODE ch = *p++; 1648 if (ch >= 256) { 1649 if (latin1_encoding_error(&p, &s, errors, 1650 "ordinal not in range(256)")) 1651 goto onError; 1652 } 1653 else 1654 *s++ = (char)ch; 1655 } 1656 /* Resize if error handling skipped some characters */ 1657 if (s - start < PyString_GET_SIZE(repr)) 1658 if (_PyString_Resize(&repr, s - start)) 1659 goto onError; 1660 return repr; 1661 1662 onError: 1663 Py_DECREF(repr); 1664 return NULL; 1665} 1666 1667PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 1668{ 1669 if (!PyUnicode_Check(unicode)) { 1670 PyErr_BadArgument(); 1671 return NULL; 1672 } 1673 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1674 PyUnicode_GET_SIZE(unicode), 1675 NULL); 1676} 1677 1678/* --- 7-bit ASCII Codec -------------------------------------------------- */ 1679 1680static 1681int ascii_decoding_error(const char **source, 1682 Py_UNICODE **dest, 1683 const char *errors, 1684 const char *details) 1685{ 1686 if ((errors == NULL) || 1687 (strcmp(errors,"strict") == 0)) { 1688 PyErr_Format(PyExc_UnicodeError, 1689 "ASCII decoding error: %.400s", 1690 details); 1691 return -1; 1692 } 1693 else if (strcmp(errors,"ignore") == 0) { 1694 return 0; 1695 } 1696 else if (strcmp(errors,"replace") == 0) { 1697 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1698 (*dest)++; 1699 return 0; 1700 } 1701 else { 1702 PyErr_Format(PyExc_ValueError, 1703 "ASCII decoding error; " 1704 "unknown error handling code: %.400s", 1705 errors); 1706 return -1; 1707 } 1708} 1709 1710PyObject *PyUnicode_DecodeASCII(const char *s, 1711 int size, 1712 const char *errors) 1713{ 1714 PyUnicodeObject *v; 1715 Py_UNICODE *p; 1716 1717 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 1718 if (size == 1 && *(unsigned char*)s < 128) { 1719 Py_UNICODE r = *(unsigned char*)s; 1720 return PyUnicode_FromUnicode(&r, 1); 1721 } 1722 1723 v = _PyUnicode_New(size); 1724 if (v == NULL) 1725 goto onError; 1726 if (size == 0) 1727 return (PyObject *)v; 1728 p = PyUnicode_AS_UNICODE(v); 1729 while (size-- > 0) { 1730 register unsigned char c; 1731 1732 c = (unsigned char)*s++; 1733 if (c < 128) 1734 *p++ = c; 1735 else if (ascii_decoding_error(&s, &p, errors, 1736 "ordinal not in range(128)")) 1737 goto onError; 1738 } 1739 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 1740 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1741 goto onError; 1742 return (PyObject *)v; 1743 1744 onError: 1745 Py_XDECREF(v); 1746 return NULL; 1747} 1748 1749static 1750int ascii_encoding_error(const Py_UNICODE **source, 1751 char **dest, 1752 const char *errors, 1753 const char *details) 1754{ 1755 if ((errors == NULL) || 1756 (strcmp(errors,"strict") == 0)) { 1757 PyErr_Format(PyExc_UnicodeError, 1758 "ASCII encoding error: %.400s", 1759 details); 1760 return -1; 1761 } 1762 else if (strcmp(errors,"ignore") == 0) { 1763 return 0; 1764 } 1765 else if (strcmp(errors,"replace") == 0) { 1766 **dest = '?'; 1767 (*dest)++; 1768 return 0; 1769 } 1770 else { 1771 PyErr_Format(PyExc_ValueError, 1772 "ASCII encoding error; " 1773 "unknown error handling code: %.400s", 1774 errors); 1775 return -1; 1776 } 1777} 1778 1779PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 1780 int size, 1781 const char *errors) 1782{ 1783 PyObject *repr; 1784 char *s, *start; 1785 1786 repr = PyString_FromStringAndSize(NULL, size); 1787 if (repr == NULL) 1788 return NULL; 1789 if (size == 0) 1790 return repr; 1791 1792 s = PyString_AS_STRING(repr); 1793 start = s; 1794 while (size-- > 0) { 1795 Py_UNICODE ch = *p++; 1796 if (ch >= 128) { 1797 if (ascii_encoding_error(&p, &s, errors, 1798 "ordinal not in range(128)")) 1799 goto onError; 1800 } 1801 else 1802 *s++ = (char)ch; 1803 } 1804 /* Resize if error handling skipped some characters */ 1805 if (s - start < PyString_GET_SIZE(repr)) 1806 if (_PyString_Resize(&repr, s - start)) 1807 goto onError; 1808 return repr; 1809 1810 onError: 1811 Py_DECREF(repr); 1812 return NULL; 1813} 1814 1815PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 1816{ 1817 if (!PyUnicode_Check(unicode)) { 1818 PyErr_BadArgument(); 1819 return NULL; 1820 } 1821 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1822 PyUnicode_GET_SIZE(unicode), 1823 NULL); 1824} 1825 1826#ifdef MS_WIN32 1827 1828/* --- MBCS codecs for Windows -------------------------------------------- */ 1829 1830PyObject *PyUnicode_DecodeMBCS(const char *s, 1831 int size, 1832 const char *errors) 1833{ 1834 PyUnicodeObject *v; 1835 Py_UNICODE *p; 1836 1837 /* First get the size of the result */ 1838 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 1839 if (size > 0 && usize==0) 1840 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1841 1842 v = _PyUnicode_New(usize); 1843 if (v == NULL) 1844 return NULL; 1845 if (usize == 0) 1846 return (PyObject *)v; 1847 p = PyUnicode_AS_UNICODE(v); 1848 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 1849 Py_DECREF(v); 1850 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1851 } 1852 1853 return (PyObject *)v; 1854} 1855 1856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 1857 int size, 1858 const char *errors) 1859{ 1860 PyObject *repr; 1861 char *s; 1862 DWORD mbcssize; 1863 1864 /* If there are no characters, bail now! */ 1865 if (size==0) 1866 return PyString_FromString(""); 1867 1868 /* First get the size of the result */ 1869 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 1870 if (mbcssize==0) 1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1872 1873 repr = PyString_FromStringAndSize(NULL, mbcssize); 1874 if (repr == NULL) 1875 return NULL; 1876 if (mbcssize == 0) 1877 return repr; 1878 1879 /* Do the conversion */ 1880 s = PyString_AS_STRING(repr); 1881 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 1882 Py_DECREF(repr); 1883 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1884 } 1885 return repr; 1886} 1887 1888#endif /* MS_WIN32 */ 1889 1890/* --- Character Mapping Codec -------------------------------------------- */ 1891 1892static 1893int charmap_decoding_error(const char **source, 1894 Py_UNICODE **dest, 1895 const char *errors, 1896 const char *details) 1897{ 1898 if ((errors == NULL) || 1899 (strcmp(errors,"strict") == 0)) { 1900 PyErr_Format(PyExc_UnicodeError, 1901 "charmap decoding error: %.400s", 1902 details); 1903 return -1; 1904 } 1905 else if (strcmp(errors,"ignore") == 0) { 1906 return 0; 1907 } 1908 else if (strcmp(errors,"replace") == 0) { 1909 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1910 (*dest)++; 1911 return 0; 1912 } 1913 else { 1914 PyErr_Format(PyExc_ValueError, 1915 "charmap decoding error; " 1916 "unknown error handling code: %.400s", 1917 errors); 1918 return -1; 1919 } 1920} 1921 1922PyObject *PyUnicode_DecodeCharmap(const char *s, 1923 int size, 1924 PyObject *mapping, 1925 const char *errors) 1926{ 1927 PyUnicodeObject *v; 1928 Py_UNICODE *p; 1929 int extrachars = 0; 1930 1931 /* Default to Latin-1 */ 1932 if (mapping == NULL) 1933 return PyUnicode_DecodeLatin1(s, size, errors); 1934 1935 v = _PyUnicode_New(size); 1936 if (v == NULL) 1937 goto onError; 1938 if (size == 0) 1939 return (PyObject *)v; 1940 p = PyUnicode_AS_UNICODE(v); 1941 while (size-- > 0) { 1942 unsigned char ch = *s++; 1943 PyObject *w, *x; 1944 1945 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 1946 w = PyInt_FromLong((long)ch); 1947 if (w == NULL) 1948 goto onError; 1949 x = PyObject_GetItem(mapping, w); 1950 Py_DECREF(w); 1951 if (x == NULL) { 1952 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 1953 /* No mapping found means: mapping is undefined. */ 1954 PyErr_Clear(); 1955 x = Py_None; 1956 Py_INCREF(x); 1957 } else 1958 goto onError; 1959 } 1960 1961 /* Apply mapping */ 1962 if (PyInt_Check(x)) { 1963 long value = PyInt_AS_LONG(x); 1964 if (value < 0 || value > 65535) { 1965 PyErr_SetString(PyExc_TypeError, 1966 "character mapping must be in range(65536)"); 1967 Py_DECREF(x); 1968 goto onError; 1969 } 1970 *p++ = (Py_UNICODE)value; 1971 } 1972 else if (x == Py_None) { 1973 /* undefined mapping */ 1974 if (charmap_decoding_error(&s, &p, errors, 1975 "character maps to <undefined>")) { 1976 Py_DECREF(x); 1977 goto onError; 1978 } 1979 } 1980 else if (PyUnicode_Check(x)) { 1981 int targetsize = PyUnicode_GET_SIZE(x); 1982 1983 if (targetsize == 1) 1984 /* 1-1 mapping */ 1985 *p++ = *PyUnicode_AS_UNICODE(x); 1986 1987 else if (targetsize > 1) { 1988 /* 1-n mapping */ 1989 if (targetsize > extrachars) { 1990 /* resize first */ 1991 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 1992 int needed = (targetsize - extrachars) + \ 1993 (targetsize << 2); 1994 extrachars += needed; 1995 if (_PyUnicode_Resize(&v, 1996 PyUnicode_GET_SIZE(v) + needed)) { 1997 Py_DECREF(x); 1998 goto onError; 1999 } 2000 p = PyUnicode_AS_UNICODE(v) + oldpos; 2001 } 2002 Py_UNICODE_COPY(p, 2003 PyUnicode_AS_UNICODE(x), 2004 targetsize); 2005 p += targetsize; 2006 extrachars -= targetsize; 2007 } 2008 /* 1-0 mapping: skip the character */ 2009 } 2010 else { 2011 /* wrong return value */ 2012 PyErr_SetString(PyExc_TypeError, 2013 "character mapping must return integer, None or unicode"); 2014 Py_DECREF(x); 2015 goto onError; 2016 } 2017 Py_DECREF(x); 2018 } 2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2020 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2021 goto onError; 2022 return (PyObject *)v; 2023 2024 onError: 2025 Py_XDECREF(v); 2026 return NULL; 2027} 2028 2029static 2030int charmap_encoding_error(const Py_UNICODE **source, 2031 char **dest, 2032 const char *errors, 2033 const char *details) 2034{ 2035 if ((errors == NULL) || 2036 (strcmp(errors,"strict") == 0)) { 2037 PyErr_Format(PyExc_UnicodeError, 2038 "charmap encoding error: %.400s", 2039 details); 2040 return -1; 2041 } 2042 else if (strcmp(errors,"ignore") == 0) { 2043 return 0; 2044 } 2045 else if (strcmp(errors,"replace") == 0) { 2046 **dest = '?'; 2047 (*dest)++; 2048 return 0; 2049 } 2050 else { 2051 PyErr_Format(PyExc_ValueError, 2052 "charmap encoding error; " 2053 "unknown error handling code: %.400s", 2054 errors); 2055 return -1; 2056 } 2057} 2058 2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2060 int size, 2061 PyObject *mapping, 2062 const char *errors) 2063{ 2064 PyObject *v; 2065 char *s; 2066 int extrachars = 0; 2067 2068 /* Default to Latin-1 */ 2069 if (mapping == NULL) 2070 return PyUnicode_EncodeLatin1(p, size, errors); 2071 2072 v = PyString_FromStringAndSize(NULL, size); 2073 if (v == NULL) 2074 return NULL; 2075 if (size == 0) 2076 return v; 2077 s = PyString_AS_STRING(v); 2078 while (size-- > 0) { 2079 Py_UNICODE ch = *p++; 2080 PyObject *w, *x; 2081 2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2083 w = PyInt_FromLong((long)ch); 2084 if (w == NULL) 2085 goto onError; 2086 x = PyObject_GetItem(mapping, w); 2087 Py_DECREF(w); 2088 if (x == NULL) { 2089 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2090 /* No mapping found means: mapping is undefined. */ 2091 PyErr_Clear(); 2092 x = Py_None; 2093 Py_INCREF(x); 2094 } else 2095 goto onError; 2096 } 2097 2098 /* Apply mapping */ 2099 if (PyInt_Check(x)) { 2100 long value = PyInt_AS_LONG(x); 2101 if (value < 0 || value > 255) { 2102 PyErr_SetString(PyExc_TypeError, 2103 "character mapping must be in range(256)"); 2104 Py_DECREF(x); 2105 goto onError; 2106 } 2107 *s++ = (char)value; 2108 } 2109 else if (x == Py_None) { 2110 /* undefined mapping */ 2111 if (charmap_encoding_error(&p, &s, errors, 2112 "character maps to <undefined>")) { 2113 Py_DECREF(x); 2114 goto onError; 2115 } 2116 } 2117 else if (PyString_Check(x)) { 2118 int targetsize = PyString_GET_SIZE(x); 2119 2120 if (targetsize == 1) 2121 /* 1-1 mapping */ 2122 *s++ = *PyString_AS_STRING(x); 2123 2124 else if (targetsize > 1) { 2125 /* 1-n mapping */ 2126 if (targetsize > extrachars) { 2127 /* resize first */ 2128 int oldpos = (int)(s - PyString_AS_STRING(v)); 2129 int needed = (targetsize - extrachars) + \ 2130 (targetsize << 2); 2131 extrachars += needed; 2132 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2133 Py_DECREF(x); 2134 goto onError; 2135 } 2136 s = PyString_AS_STRING(v) + oldpos; 2137 } 2138 memcpy(s, PyString_AS_STRING(x), targetsize); 2139 s += targetsize; 2140 extrachars -= targetsize; 2141 } 2142 /* 1-0 mapping: skip the character */ 2143 } 2144 else { 2145 /* wrong return value */ 2146 PyErr_SetString(PyExc_TypeError, 2147 "character mapping must return integer, None or unicode"); 2148 Py_DECREF(x); 2149 goto onError; 2150 } 2151 Py_DECREF(x); 2152 } 2153 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2154 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2155 goto onError; 2156 return v; 2157 2158 onError: 2159 Py_DECREF(v); 2160 return NULL; 2161} 2162 2163PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2164 PyObject *mapping) 2165{ 2166 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2167 PyErr_BadArgument(); 2168 return NULL; 2169 } 2170 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2171 PyUnicode_GET_SIZE(unicode), 2172 mapping, 2173 NULL); 2174} 2175 2176static 2177int translate_error(const Py_UNICODE **source, 2178 Py_UNICODE **dest, 2179 const char *errors, 2180 const char *details) 2181{ 2182 if ((errors == NULL) || 2183 (strcmp(errors,"strict") == 0)) { 2184 PyErr_Format(PyExc_UnicodeError, 2185 "translate error: %.400s", 2186 details); 2187 return -1; 2188 } 2189 else if (strcmp(errors,"ignore") == 0) { 2190 return 0; 2191 } 2192 else if (strcmp(errors,"replace") == 0) { 2193 **dest = '?'; 2194 (*dest)++; 2195 return 0; 2196 } 2197 else { 2198 PyErr_Format(PyExc_ValueError, 2199 "translate error; " 2200 "unknown error handling code: %.400s", 2201 errors); 2202 return -1; 2203 } 2204} 2205 2206PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2207 int size, 2208 PyObject *mapping, 2209 const char *errors) 2210{ 2211 PyUnicodeObject *v; 2212 Py_UNICODE *p; 2213 2214 if (mapping == NULL) { 2215 PyErr_BadArgument(); 2216 return NULL; 2217 } 2218 2219 /* Output will never be longer than input */ 2220 v = _PyUnicode_New(size); 2221 if (v == NULL) 2222 goto onError; 2223 if (size == 0) 2224 goto done; 2225 p = PyUnicode_AS_UNICODE(v); 2226 while (size-- > 0) { 2227 Py_UNICODE ch = *s++; 2228 PyObject *w, *x; 2229 2230 /* Get mapping */ 2231 w = PyInt_FromLong(ch); 2232 if (w == NULL) 2233 goto onError; 2234 x = PyObject_GetItem(mapping, w); 2235 Py_DECREF(w); 2236 if (x == NULL) { 2237 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2238 /* No mapping found: default to 1-1 mapping */ 2239 PyErr_Clear(); 2240 *p++ = ch; 2241 continue; 2242 } 2243 goto onError; 2244 } 2245 2246 /* Apply mapping */ 2247 if (PyInt_Check(x)) 2248 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2249 else if (x == Py_None) { 2250 /* undefined mapping */ 2251 if (translate_error(&s, &p, errors, 2252 "character maps to <undefined>")) { 2253 Py_DECREF(x); 2254 goto onError; 2255 } 2256 } 2257 else if (PyUnicode_Check(x)) { 2258 if (PyUnicode_GET_SIZE(x) != 1) { 2259 /* 1-n mapping */ 2260 PyErr_SetString(PyExc_NotImplementedError, 2261 "1-n mappings are currently not implemented"); 2262 Py_DECREF(x); 2263 goto onError; 2264 } 2265 *p++ = *PyUnicode_AS_UNICODE(x); 2266 } 2267 else { 2268 /* wrong return value */ 2269 PyErr_SetString(PyExc_TypeError, 2270 "translate mapping must return integer, None or unicode"); 2271 Py_DECREF(x); 2272 goto onError; 2273 } 2274 Py_DECREF(x); 2275 } 2276 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2277 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2278 goto onError; 2279 2280 done: 2281 return (PyObject *)v; 2282 2283 onError: 2284 Py_XDECREF(v); 2285 return NULL; 2286} 2287 2288PyObject *PyUnicode_Translate(PyObject *str, 2289 PyObject *mapping, 2290 const char *errors) 2291{ 2292 PyObject *result; 2293 2294 str = PyUnicode_FromObject(str); 2295 if (str == NULL) 2296 goto onError; 2297 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2298 PyUnicode_GET_SIZE(str), 2299 mapping, 2300 errors); 2301 Py_DECREF(str); 2302 return result; 2303 2304 onError: 2305 Py_XDECREF(str); 2306 return NULL; 2307} 2308 2309/* --- Decimal Encoder ---------------------------------------------------- */ 2310 2311int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2312 int length, 2313 char *output, 2314 const char *errors) 2315{ 2316 Py_UNICODE *p, *end; 2317 2318 if (output == NULL) { 2319 PyErr_BadArgument(); 2320 return -1; 2321 } 2322 2323 p = s; 2324 end = s + length; 2325 while (p < end) { 2326 register Py_UNICODE ch = *p++; 2327 int decimal; 2328 2329 if (Py_UNICODE_ISSPACE(ch)) { 2330 *output++ = ' '; 2331 continue; 2332 } 2333 decimal = Py_UNICODE_TODECIMAL(ch); 2334 if (decimal >= 0) { 2335 *output++ = '0' + decimal; 2336 continue; 2337 } 2338 if (0 < ch && ch < 256) { 2339 *output++ = (char)ch; 2340 continue; 2341 } 2342 /* All other characters are considered invalid */ 2343 if (errors == NULL || strcmp(errors, "strict") == 0) { 2344 PyErr_SetString(PyExc_ValueError, 2345 "invalid decimal Unicode string"); 2346 goto onError; 2347 } 2348 else if (strcmp(errors, "ignore") == 0) 2349 continue; 2350 else if (strcmp(errors, "replace") == 0) { 2351 *output++ = '?'; 2352 continue; 2353 } 2354 } 2355 /* 0-terminate the output string */ 2356 *output++ = '\0'; 2357 return 0; 2358 2359 onError: 2360 return -1; 2361} 2362 2363/* --- Helpers ------------------------------------------------------------ */ 2364 2365static 2366int count(PyUnicodeObject *self, 2367 int start, 2368 int end, 2369 PyUnicodeObject *substring) 2370{ 2371 int count = 0; 2372 2373 if (start < 0) 2374 start += self->length; 2375 if (start < 0) 2376 start = 0; 2377 if (end > self->length) 2378 end = self->length; 2379 if (end < 0) 2380 end += self->length; 2381 if (end < 0) 2382 end = 0; 2383 2384 if (substring->length == 0) 2385 return (end - start + 1); 2386 2387 end -= substring->length; 2388 2389 while (start <= end) 2390 if (Py_UNICODE_MATCH(self, start, substring)) { 2391 count++; 2392 start += substring->length; 2393 } else 2394 start++; 2395 2396 return count; 2397} 2398 2399int PyUnicode_Count(PyObject *str, 2400 PyObject *substr, 2401 int start, 2402 int end) 2403{ 2404 int result; 2405 2406 str = PyUnicode_FromObject(str); 2407 if (str == NULL) 2408 return -1; 2409 substr = PyUnicode_FromObject(substr); 2410 if (substr == NULL) { 2411 Py_DECREF(str); 2412 return -1; 2413 } 2414 2415 result = count((PyUnicodeObject *)str, 2416 start, end, 2417 (PyUnicodeObject *)substr); 2418 2419 Py_DECREF(str); 2420 Py_DECREF(substr); 2421 return result; 2422} 2423 2424static 2425int findstring(PyUnicodeObject *self, 2426 PyUnicodeObject *substring, 2427 int start, 2428 int end, 2429 int direction) 2430{ 2431 if (start < 0) 2432 start += self->length; 2433 if (start < 0) 2434 start = 0; 2435 2436 if (substring->length == 0) 2437 return start; 2438 2439 if (end > self->length) 2440 end = self->length; 2441 if (end < 0) 2442 end += self->length; 2443 if (end < 0) 2444 end = 0; 2445 2446 end -= substring->length; 2447 2448 if (direction < 0) { 2449 for (; end >= start; end--) 2450 if (Py_UNICODE_MATCH(self, end, substring)) 2451 return end; 2452 } else { 2453 for (; start <= end; start++) 2454 if (Py_UNICODE_MATCH(self, start, substring)) 2455 return start; 2456 } 2457 2458 return -1; 2459} 2460 2461int PyUnicode_Find(PyObject *str, 2462 PyObject *substr, 2463 int start, 2464 int end, 2465 int direction) 2466{ 2467 int result; 2468 2469 str = PyUnicode_FromObject(str); 2470 if (str == NULL) 2471 return -1; 2472 substr = PyUnicode_FromObject(substr); 2473 if (substr == NULL) { 2474 Py_DECREF(substr); 2475 return -1; 2476 } 2477 2478 result = findstring((PyUnicodeObject *)str, 2479 (PyUnicodeObject *)substr, 2480 start, end, direction); 2481 Py_DECREF(str); 2482 Py_DECREF(substr); 2483 return result; 2484} 2485 2486static 2487int tailmatch(PyUnicodeObject *self, 2488 PyUnicodeObject *substring, 2489 int start, 2490 int end, 2491 int direction) 2492{ 2493 if (start < 0) 2494 start += self->length; 2495 if (start < 0) 2496 start = 0; 2497 2498 if (substring->length == 0) 2499 return 1; 2500 2501 if (end > self->length) 2502 end = self->length; 2503 if (end < 0) 2504 end += self->length; 2505 if (end < 0) 2506 end = 0; 2507 2508 end -= substring->length; 2509 if (end < start) 2510 return 0; 2511 2512 if (direction > 0) { 2513 if (Py_UNICODE_MATCH(self, end, substring)) 2514 return 1; 2515 } else { 2516 if (Py_UNICODE_MATCH(self, start, substring)) 2517 return 1; 2518 } 2519 2520 return 0; 2521} 2522 2523int PyUnicode_Tailmatch(PyObject *str, 2524 PyObject *substr, 2525 int start, 2526 int end, 2527 int direction) 2528{ 2529 int result; 2530 2531 str = PyUnicode_FromObject(str); 2532 if (str == NULL) 2533 return -1; 2534 substr = PyUnicode_FromObject(substr); 2535 if (substr == NULL) { 2536 Py_DECREF(substr); 2537 return -1; 2538 } 2539 2540 result = tailmatch((PyUnicodeObject *)str, 2541 (PyUnicodeObject *)substr, 2542 start, end, direction); 2543 Py_DECREF(str); 2544 Py_DECREF(substr); 2545 return result; 2546} 2547 2548static 2549const Py_UNICODE *findchar(const Py_UNICODE *s, 2550 int size, 2551 Py_UNICODE ch) 2552{ 2553 /* like wcschr, but doesn't stop at NULL characters */ 2554 2555 while (size-- > 0) { 2556 if (*s == ch) 2557 return s; 2558 s++; 2559 } 2560 2561 return NULL; 2562} 2563 2564/* Apply fixfct filter to the Unicode object self and return a 2565 reference to the modified object */ 2566 2567static 2568PyObject *fixup(PyUnicodeObject *self, 2569 int (*fixfct)(PyUnicodeObject *s)) 2570{ 2571 2572 PyUnicodeObject *u; 2573 2574 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 2575 if (u == NULL) 2576 return NULL; 2577 2578 Py_UNICODE_COPY(u->str, self->str, self->length); 2579 2580 if (!fixfct(u)) { 2581 /* fixfct should return TRUE if it modified the buffer. If 2582 FALSE, return a reference to the original buffer instead 2583 (to save space, not time) */ 2584 Py_INCREF(self); 2585 Py_DECREF(u); 2586 return (PyObject*) self; 2587 } 2588 return (PyObject*) u; 2589} 2590 2591static 2592int fixupper(PyUnicodeObject *self) 2593{ 2594 int len = self->length; 2595 Py_UNICODE *s = self->str; 2596 int status = 0; 2597 2598 while (len-- > 0) { 2599 register Py_UNICODE ch; 2600 2601 ch = Py_UNICODE_TOUPPER(*s); 2602 if (ch != *s) { 2603 status = 1; 2604 *s = ch; 2605 } 2606 s++; 2607 } 2608 2609 return status; 2610} 2611 2612static 2613int fixlower(PyUnicodeObject *self) 2614{ 2615 int len = self->length; 2616 Py_UNICODE *s = self->str; 2617 int status = 0; 2618 2619 while (len-- > 0) { 2620 register Py_UNICODE ch; 2621 2622 ch = Py_UNICODE_TOLOWER(*s); 2623 if (ch != *s) { 2624 status = 1; 2625 *s = ch; 2626 } 2627 s++; 2628 } 2629 2630 return status; 2631} 2632 2633static 2634int fixswapcase(PyUnicodeObject *self) 2635{ 2636 int len = self->length; 2637 Py_UNICODE *s = self->str; 2638 int status = 0; 2639 2640 while (len-- > 0) { 2641 if (Py_UNICODE_ISUPPER(*s)) { 2642 *s = Py_UNICODE_TOLOWER(*s); 2643 status = 1; 2644 } else if (Py_UNICODE_ISLOWER(*s)) { 2645 *s = Py_UNICODE_TOUPPER(*s); 2646 status = 1; 2647 } 2648 s++; 2649 } 2650 2651 return status; 2652} 2653 2654static 2655int fixcapitalize(PyUnicodeObject *self) 2656{ 2657 int len = self->length; 2658 Py_UNICODE *s = self->str; 2659 int status = 0; 2660 2661 if (len == 0) 2662 return 0; 2663 if (Py_UNICODE_ISLOWER(*s)) { 2664 *s = Py_UNICODE_TOUPPER(*s); 2665 status = 1; 2666 } 2667 s++; 2668 while (--len > 0) { 2669 if (Py_UNICODE_ISUPPER(*s)) { 2670 *s = Py_UNICODE_TOLOWER(*s); 2671 status = 1; 2672 } 2673 s++; 2674 } 2675 return status; 2676} 2677 2678static 2679int fixtitle(PyUnicodeObject *self) 2680{ 2681 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 2682 register Py_UNICODE *e; 2683 int previous_is_cased; 2684 2685 /* Shortcut for single character strings */ 2686 if (PyUnicode_GET_SIZE(self) == 1) { 2687 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 2688 if (*p != ch) { 2689 *p = ch; 2690 return 1; 2691 } 2692 else 2693 return 0; 2694 } 2695 2696 e = p + PyUnicode_GET_SIZE(self); 2697 previous_is_cased = 0; 2698 for (; p < e; p++) { 2699 register const Py_UNICODE ch = *p; 2700 2701 if (previous_is_cased) 2702 *p = Py_UNICODE_TOLOWER(ch); 2703 else 2704 *p = Py_UNICODE_TOTITLE(ch); 2705 2706 if (Py_UNICODE_ISLOWER(ch) || 2707 Py_UNICODE_ISUPPER(ch) || 2708 Py_UNICODE_ISTITLE(ch)) 2709 previous_is_cased = 1; 2710 else 2711 previous_is_cased = 0; 2712 } 2713 return 1; 2714} 2715 2716PyObject *PyUnicode_Join(PyObject *separator, 2717 PyObject *seq) 2718{ 2719 Py_UNICODE *sep; 2720 int seplen; 2721 PyUnicodeObject *res = NULL; 2722 int reslen = 0; 2723 Py_UNICODE *p; 2724 int seqlen = 0; 2725 int sz = 100; 2726 int i; 2727 PyObject *it; 2728 2729 it = PyObject_GetIter(seq); 2730 if (it == NULL) 2731 return NULL; 2732 2733 if (separator == NULL) { 2734 Py_UNICODE blank = ' '; 2735 sep = ␣ 2736 seplen = 1; 2737 } 2738 else { 2739 separator = PyUnicode_FromObject(separator); 2740 if (separator == NULL) 2741 goto onError; 2742 sep = PyUnicode_AS_UNICODE(separator); 2743 seplen = PyUnicode_GET_SIZE(separator); 2744 } 2745 2746 res = _PyUnicode_New(sz); 2747 if (res == NULL) 2748 goto onError; 2749 p = PyUnicode_AS_UNICODE(res); 2750 reslen = 0; 2751 2752 for (i = 0; ; ++i) { 2753 int itemlen; 2754 PyObject *item = PyIter_Next(it); 2755 if (item == NULL) { 2756 if (PyErr_Occurred()) 2757 goto onError; 2758 break; 2759 } 2760 if (!PyUnicode_Check(item)) { 2761 PyObject *v; 2762 v = PyUnicode_FromObject(item); 2763 Py_DECREF(item); 2764 item = v; 2765 if (item == NULL) 2766 goto onError; 2767 } 2768 itemlen = PyUnicode_GET_SIZE(item); 2769 while (reslen + itemlen + seplen >= sz) { 2770 if (_PyUnicode_Resize(&res, sz*2)) 2771 goto onError; 2772 sz *= 2; 2773 p = PyUnicode_AS_UNICODE(res) + reslen; 2774 } 2775 if (i > 0) { 2776 Py_UNICODE_COPY(p, sep, seplen); 2777 p += seplen; 2778 reslen += seplen; 2779 } 2780 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 2781 p += itemlen; 2782 reslen += itemlen; 2783 Py_DECREF(item); 2784 } 2785 if (_PyUnicode_Resize(&res, reslen)) 2786 goto onError; 2787 2788 Py_XDECREF(separator); 2789 Py_DECREF(it); 2790 return (PyObject *)res; 2791 2792 onError: 2793 Py_XDECREF(separator); 2794 Py_XDECREF(res); 2795 Py_DECREF(it); 2796 return NULL; 2797} 2798 2799static 2800PyUnicodeObject *pad(PyUnicodeObject *self, 2801 int left, 2802 int right, 2803 Py_UNICODE fill) 2804{ 2805 PyUnicodeObject *u; 2806 2807 if (left < 0) 2808 left = 0; 2809 if (right < 0) 2810 right = 0; 2811 2812 if (left == 0 && right == 0) { 2813 Py_INCREF(self); 2814 return self; 2815 } 2816 2817 u = _PyUnicode_New(left + self->length + right); 2818 if (u) { 2819 if (left) 2820 Py_UNICODE_FILL(u->str, fill, left); 2821 Py_UNICODE_COPY(u->str + left, self->str, self->length); 2822 if (right) 2823 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 2824 } 2825 2826 return u; 2827} 2828 2829#define SPLIT_APPEND(data, left, right) \ 2830 str = PyUnicode_FromUnicode(data + left, right - left); \ 2831 if (!str) \ 2832 goto onError; \ 2833 if (PyList_Append(list, str)) { \ 2834 Py_DECREF(str); \ 2835 goto onError; \ 2836 } \ 2837 else \ 2838 Py_DECREF(str); 2839 2840static 2841PyObject *split_whitespace(PyUnicodeObject *self, 2842 PyObject *list, 2843 int maxcount) 2844{ 2845 register int i; 2846 register int j; 2847 int len = self->length; 2848 PyObject *str; 2849 2850 for (i = j = 0; i < len; ) { 2851 /* find a token */ 2852 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2853 i++; 2854 j = i; 2855 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 2856 i++; 2857 if (j < i) { 2858 if (maxcount-- <= 0) 2859 break; 2860 SPLIT_APPEND(self->str, j, i); 2861 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2862 i++; 2863 j = i; 2864 } 2865 } 2866 if (j < len) { 2867 SPLIT_APPEND(self->str, j, len); 2868 } 2869 return list; 2870 2871 onError: 2872 Py_DECREF(list); 2873 return NULL; 2874} 2875 2876PyObject *PyUnicode_Splitlines(PyObject *string, 2877 int keepends) 2878{ 2879 register int i; 2880 register int j; 2881 int len; 2882 PyObject *list; 2883 PyObject *str; 2884 Py_UNICODE *data; 2885 2886 string = PyUnicode_FromObject(string); 2887 if (string == NULL) 2888 return NULL; 2889 data = PyUnicode_AS_UNICODE(string); 2890 len = PyUnicode_GET_SIZE(string); 2891 2892 list = PyList_New(0); 2893 if (!list) 2894 goto onError; 2895 2896 for (i = j = 0; i < len; ) { 2897 int eol; 2898 2899 /* Find a line and append it */ 2900 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 2901 i++; 2902 2903 /* Skip the line break reading CRLF as one line break */ 2904 eol = i; 2905 if (i < len) { 2906 if (data[i] == '\r' && i + 1 < len && 2907 data[i+1] == '\n') 2908 i += 2; 2909 else 2910 i++; 2911 if (keepends) 2912 eol = i; 2913 } 2914 SPLIT_APPEND(data, j, eol); 2915 j = i; 2916 } 2917 if (j < len) { 2918 SPLIT_APPEND(data, j, len); 2919 } 2920 2921 Py_DECREF(string); 2922 return list; 2923 2924 onError: 2925 Py_DECREF(list); 2926 Py_DECREF(string); 2927 return NULL; 2928} 2929 2930static 2931PyObject *split_char(PyUnicodeObject *self, 2932 PyObject *list, 2933 Py_UNICODE ch, 2934 int maxcount) 2935{ 2936 register int i; 2937 register int j; 2938 int len = self->length; 2939 PyObject *str; 2940 2941 for (i = j = 0; i < len; ) { 2942 if (self->str[i] == ch) { 2943 if (maxcount-- <= 0) 2944 break; 2945 SPLIT_APPEND(self->str, j, i); 2946 i = j = i + 1; 2947 } else 2948 i++; 2949 } 2950 if (j <= len) { 2951 SPLIT_APPEND(self->str, j, len); 2952 } 2953 return list; 2954 2955 onError: 2956 Py_DECREF(list); 2957 return NULL; 2958} 2959 2960static 2961PyObject *split_substring(PyUnicodeObject *self, 2962 PyObject *list, 2963 PyUnicodeObject *substring, 2964 int maxcount) 2965{ 2966 register int i; 2967 register int j; 2968 int len = self->length; 2969 int sublen = substring->length; 2970 PyObject *str; 2971 2972 for (i = j = 0; i <= len - sublen; ) { 2973 if (Py_UNICODE_MATCH(self, i, substring)) { 2974 if (maxcount-- <= 0) 2975 break; 2976 SPLIT_APPEND(self->str, j, i); 2977 i = j = i + sublen; 2978 } else 2979 i++; 2980 } 2981 if (j <= len) { 2982 SPLIT_APPEND(self->str, j, len); 2983 } 2984 return list; 2985 2986 onError: 2987 Py_DECREF(list); 2988 return NULL; 2989} 2990 2991#undef SPLIT_APPEND 2992 2993static 2994PyObject *split(PyUnicodeObject *self, 2995 PyUnicodeObject *substring, 2996 int maxcount) 2997{ 2998 PyObject *list; 2999 3000 if (maxcount < 0) 3001 maxcount = INT_MAX; 3002 3003 list = PyList_New(0); 3004 if (!list) 3005 return NULL; 3006 3007 if (substring == NULL) 3008 return split_whitespace(self,list,maxcount); 3009 3010 else if (substring->length == 1) 3011 return split_char(self,list,substring->str[0],maxcount); 3012 3013 else if (substring->length == 0) { 3014 Py_DECREF(list); 3015 PyErr_SetString(PyExc_ValueError, "empty separator"); 3016 return NULL; 3017 } 3018 else 3019 return split_substring(self,list,substring,maxcount); 3020} 3021 3022static 3023PyObject *strip(PyUnicodeObject *self, 3024 int left, 3025 int right) 3026{ 3027 Py_UNICODE *p = self->str; 3028 int start = 0; 3029 int end = self->length; 3030 3031 if (left) 3032 while (start < end && Py_UNICODE_ISSPACE(p[start])) 3033 start++; 3034 3035 if (right) 3036 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 3037 end--; 3038 3039 if (start == 0 && end == self->length) { 3040 /* couldn't strip anything off, return original string */ 3041 Py_INCREF(self); 3042 return (PyObject*) self; 3043 } 3044 3045 return (PyObject*) PyUnicode_FromUnicode( 3046 self->str + start, 3047 end - start 3048 ); 3049} 3050 3051static 3052PyObject *replace(PyUnicodeObject *self, 3053 PyUnicodeObject *str1, 3054 PyUnicodeObject *str2, 3055 int maxcount) 3056{ 3057 PyUnicodeObject *u; 3058 3059 if (maxcount < 0) 3060 maxcount = INT_MAX; 3061 3062 if (str1->length == 1 && str2->length == 1) { 3063 int i; 3064 3065 /* replace characters */ 3066 if (!findchar(self->str, self->length, str1->str[0])) { 3067 /* nothing to replace, return original string */ 3068 Py_INCREF(self); 3069 u = self; 3070 } else { 3071 Py_UNICODE u1 = str1->str[0]; 3072 Py_UNICODE u2 = str2->str[0]; 3073 3074 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3075 NULL, 3076 self->length 3077 ); 3078 if (u != NULL) { 3079 Py_UNICODE_COPY(u->str, self->str, 3080 self->length); 3081 for (i = 0; i < u->length; i++) 3082 if (u->str[i] == u1) { 3083 if (--maxcount < 0) 3084 break; 3085 u->str[i] = u2; 3086 } 3087 } 3088 } 3089 3090 } else { 3091 int n, i; 3092 Py_UNICODE *p; 3093 3094 /* replace strings */ 3095 n = count(self, 0, self->length, str1); 3096 if (n > maxcount) 3097 n = maxcount; 3098 if (n == 0) { 3099 /* nothing to replace, return original string */ 3100 Py_INCREF(self); 3101 u = self; 3102 } else { 3103 u = _PyUnicode_New( 3104 self->length + n * (str2->length - str1->length)); 3105 if (u) { 3106 i = 0; 3107 p = u->str; 3108 while (i <= self->length - str1->length) 3109 if (Py_UNICODE_MATCH(self, i, str1)) { 3110 /* replace string segment */ 3111 Py_UNICODE_COPY(p, str2->str, str2->length); 3112 p += str2->length; 3113 i += str1->length; 3114 if (--n <= 0) { 3115 /* copy remaining part */ 3116 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3117 break; 3118 } 3119 } else 3120 *p++ = self->str[i++]; 3121 } 3122 } 3123 } 3124 3125 return (PyObject *) u; 3126} 3127 3128/* --- Unicode Object Methods --------------------------------------------- */ 3129 3130static char title__doc__[] = 3131"S.title() -> unicode\n\ 3132\n\ 3133Return a titlecased version of S, i.e. words start with title case\n\ 3134characters, all remaining cased characters have lower case."; 3135 3136static PyObject* 3137unicode_title(PyUnicodeObject *self, PyObject *args) 3138{ 3139 if (!PyArg_NoArgs(args)) 3140 return NULL; 3141 return fixup(self, fixtitle); 3142} 3143 3144static char capitalize__doc__[] = 3145"S.capitalize() -> unicode\n\ 3146\n\ 3147Return a capitalized version of S, i.e. make the first character\n\ 3148have upper case."; 3149 3150static PyObject* 3151unicode_capitalize(PyUnicodeObject *self, PyObject *args) 3152{ 3153 if (!PyArg_NoArgs(args)) 3154 return NULL; 3155 return fixup(self, fixcapitalize); 3156} 3157 3158#if 0 3159static char capwords__doc__[] = 3160"S.capwords() -> unicode\n\ 3161\n\ 3162Apply .capitalize() to all words in S and return the result with\n\ 3163normalized whitespace (all whitespace strings are replaced by ' ')."; 3164 3165static PyObject* 3166unicode_capwords(PyUnicodeObject *self, PyObject *args) 3167{ 3168 PyObject *list; 3169 PyObject *item; 3170 int i; 3171 3172 if (!PyArg_NoArgs(args)) 3173 return NULL; 3174 3175 /* Split into words */ 3176 list = split(self, NULL, -1); 3177 if (!list) 3178 return NULL; 3179 3180 /* Capitalize each word */ 3181 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3182 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3183 fixcapitalize); 3184 if (item == NULL) 3185 goto onError; 3186 Py_DECREF(PyList_GET_ITEM(list, i)); 3187 PyList_SET_ITEM(list, i, item); 3188 } 3189 3190 /* Join the words to form a new string */ 3191 item = PyUnicode_Join(NULL, list); 3192 3193onError: 3194 Py_DECREF(list); 3195 return (PyObject *)item; 3196} 3197#endif 3198 3199static char center__doc__[] = 3200"S.center(width) -> unicode\n\ 3201\n\ 3202Return S centered in a Unicode string of length width. Padding is done\n\ 3203using spaces."; 3204 3205static PyObject * 3206unicode_center(PyUnicodeObject *self, PyObject *args) 3207{ 3208 int marg, left; 3209 int width; 3210 3211 if (!PyArg_ParseTuple(args, "i:center", &width)) 3212 return NULL; 3213 3214 if (self->length >= width) { 3215 Py_INCREF(self); 3216 return (PyObject*) self; 3217 } 3218 3219 marg = width - self->length; 3220 left = marg / 2 + (marg & width & 1); 3221 3222 return (PyObject*) pad(self, left, marg - left, ' '); 3223} 3224 3225#if 0 3226 3227/* This code should go into some future Unicode collation support 3228 module. The basic comparison should compare ordinals on a naive 3229 basis (this is what Java does and thus JPython too). */ 3230 3231/* speedy UTF-16 code point order comparison */ 3232/* gleaned from: */ 3233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3234 3235static short utf16Fixup[32] = 3236{ 3237 0, 0, 0, 0, 0, 0, 0, 0, 3238 0, 0, 0, 0, 0, 0, 0, 0, 3239 0, 0, 0, 0, 0, 0, 0, 0, 3240 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3241}; 3242 3243static int 3244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3245{ 3246 int len1, len2; 3247 3248 Py_UNICODE *s1 = str1->str; 3249 Py_UNICODE *s2 = str2->str; 3250 3251 len1 = str1->length; 3252 len2 = str2->length; 3253 3254 while (len1 > 0 && len2 > 0) { 3255 Py_UNICODE c1, c2; 3256 long diff; 3257 3258 c1 = *s1++; 3259 c2 = *s2++; 3260 if (c1 > (1<<11) * 26) 3261 c1 += utf16Fixup[c1>>11]; 3262 if (c2 > (1<<11) * 26) 3263 c2 += utf16Fixup[c2>>11]; 3264 3265 /* now c1 and c2 are in UTF-32-compatible order */ 3266 diff = (long)c1 - (long)c2; 3267 if (diff) 3268 return (diff < 0) ? -1 : (diff != 0); 3269 len1--; len2--; 3270 } 3271 3272 return (len1 < len2) ? -1 : (len1 != len2); 3273} 3274 3275#else 3276 3277static int 3278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3279{ 3280 register int len1, len2; 3281 3282 Py_UNICODE *s1 = str1->str; 3283 Py_UNICODE *s2 = str2->str; 3284 3285 len1 = str1->length; 3286 len2 = str2->length; 3287 3288 while (len1 > 0 && len2 > 0) { 3289 register long diff; 3290 3291 diff = (long)*s1++ - (long)*s2++; 3292 if (diff) 3293 return (diff < 0) ? -1 : (diff != 0); 3294 len1--; len2--; 3295 } 3296 3297 return (len1 < len2) ? -1 : (len1 != len2); 3298} 3299 3300#endif 3301 3302int PyUnicode_Compare(PyObject *left, 3303 PyObject *right) 3304{ 3305 PyUnicodeObject *u = NULL, *v = NULL; 3306 int result; 3307 3308 /* Coerce the two arguments */ 3309 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3310 if (u == NULL) 3311 goto onError; 3312 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3313 if (v == NULL) 3314 goto onError; 3315 3316 /* Shortcut for empty or interned objects */ 3317 if (v == u) { 3318 Py_DECREF(u); 3319 Py_DECREF(v); 3320 return 0; 3321 } 3322 3323 result = unicode_compare(u, v); 3324 3325 Py_DECREF(u); 3326 Py_DECREF(v); 3327 return result; 3328 3329onError: 3330 Py_XDECREF(u); 3331 Py_XDECREF(v); 3332 return -1; 3333} 3334 3335int PyUnicode_Contains(PyObject *container, 3336 PyObject *element) 3337{ 3338 PyUnicodeObject *u = NULL, *v = NULL; 3339 int result; 3340 register const Py_UNICODE *p, *e; 3341 register Py_UNICODE ch; 3342 3343 /* Coerce the two arguments */ 3344 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3345 if (v == NULL) { 3346 PyErr_SetString(PyExc_TypeError, 3347 "'in <string>' requires character as left operand"); 3348 goto onError; 3349 } 3350 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3351 if (u == NULL) { 3352 Py_DECREF(v); 3353 goto onError; 3354 } 3355 3356 /* Check v in u */ 3357 if (PyUnicode_GET_SIZE(v) != 1) { 3358 PyErr_SetString(PyExc_TypeError, 3359 "'in <string>' requires character as left operand"); 3360 goto onError; 3361 } 3362 ch = *PyUnicode_AS_UNICODE(v); 3363 p = PyUnicode_AS_UNICODE(u); 3364 e = p + PyUnicode_GET_SIZE(u); 3365 result = 0; 3366 while (p < e) { 3367 if (*p++ == ch) { 3368 result = 1; 3369 break; 3370 } 3371 } 3372 3373 Py_DECREF(u); 3374 Py_DECREF(v); 3375 return result; 3376 3377onError: 3378 Py_XDECREF(u); 3379 Py_XDECREF(v); 3380 return -1; 3381} 3382 3383/* Concat to string or Unicode object giving a new Unicode object. */ 3384 3385PyObject *PyUnicode_Concat(PyObject *left, 3386 PyObject *right) 3387{ 3388 PyUnicodeObject *u = NULL, *v = NULL, *w; 3389 3390 /* Coerce the two arguments */ 3391 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3392 if (u == NULL) 3393 goto onError; 3394 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3395 if (v == NULL) 3396 goto onError; 3397 3398 /* Shortcuts */ 3399 if (v == unicode_empty) { 3400 Py_DECREF(v); 3401 return (PyObject *)u; 3402 } 3403 if (u == unicode_empty) { 3404 Py_DECREF(u); 3405 return (PyObject *)v; 3406 } 3407 3408 /* Concat the two Unicode strings */ 3409 w = _PyUnicode_New(u->length + v->length); 3410 if (w == NULL) 3411 goto onError; 3412 Py_UNICODE_COPY(w->str, u->str, u->length); 3413 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3414 3415 Py_DECREF(u); 3416 Py_DECREF(v); 3417 return (PyObject *)w; 3418 3419onError: 3420 Py_XDECREF(u); 3421 Py_XDECREF(v); 3422 return NULL; 3423} 3424 3425static char count__doc__[] = 3426"S.count(sub[, start[, end]]) -> int\n\ 3427\n\ 3428Return the number of occurrences of substring sub in Unicode string\n\ 3429S[start:end]. Optional arguments start and end are\n\ 3430interpreted as in slice notation."; 3431 3432static PyObject * 3433unicode_count(PyUnicodeObject *self, PyObject *args) 3434{ 3435 PyUnicodeObject *substring; 3436 int start = 0; 3437 int end = INT_MAX; 3438 PyObject *result; 3439 3440 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3441 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3442 return NULL; 3443 3444 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3445 (PyObject *)substring); 3446 if (substring == NULL) 3447 return NULL; 3448 3449 if (start < 0) 3450 start += self->length; 3451 if (start < 0) 3452 start = 0; 3453 if (end > self->length) 3454 end = self->length; 3455 if (end < 0) 3456 end += self->length; 3457 if (end < 0) 3458 end = 0; 3459 3460 result = PyInt_FromLong((long) count(self, start, end, substring)); 3461 3462 Py_DECREF(substring); 3463 return result; 3464} 3465 3466static char encode__doc__[] = 3467"S.encode([encoding[,errors]]) -> string\n\ 3468\n\ 3469Return an encoded string version of S. Default encoding is the current\n\ 3470default string encoding. errors may be given to set a different error\n\ 3471handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3472a ValueError. Other possible values are 'ignore' and 'replace'."; 3473 3474static PyObject * 3475unicode_encode(PyUnicodeObject *self, PyObject *args) 3476{ 3477 char *encoding = NULL; 3478 char *errors = NULL; 3479 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3480 return NULL; 3481 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3482} 3483 3484static char expandtabs__doc__[] = 3485"S.expandtabs([tabsize]) -> unicode\n\ 3486\n\ 3487Return a copy of S where all tab characters are expanded using spaces.\n\ 3488If tabsize is not given, a tab size of 8 characters is assumed."; 3489 3490static PyObject* 3491unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3492{ 3493 Py_UNICODE *e; 3494 Py_UNICODE *p; 3495 Py_UNICODE *q; 3496 int i, j; 3497 PyUnicodeObject *u; 3498 int tabsize = 8; 3499 3500 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3501 return NULL; 3502 3503 /* First pass: determine size of output string */ 3504 i = j = 0; 3505 e = self->str + self->length; 3506 for (p = self->str; p < e; p++) 3507 if (*p == '\t') { 3508 if (tabsize > 0) 3509 j += tabsize - (j % tabsize); 3510 } 3511 else { 3512 j++; 3513 if (*p == '\n' || *p == '\r') { 3514 i += j; 3515 j = 0; 3516 } 3517 } 3518 3519 /* Second pass: create output string and fill it */ 3520 u = _PyUnicode_New(i + j); 3521 if (!u) 3522 return NULL; 3523 3524 j = 0; 3525 q = u->str; 3526 3527 for (p = self->str; p < e; p++) 3528 if (*p == '\t') { 3529 if (tabsize > 0) { 3530 i = tabsize - (j % tabsize); 3531 j += i; 3532 while (i--) 3533 *q++ = ' '; 3534 } 3535 } 3536 else { 3537 j++; 3538 *q++ = *p; 3539 if (*p == '\n' || *p == '\r') 3540 j = 0; 3541 } 3542 3543 return (PyObject*) u; 3544} 3545 3546static char find__doc__[] = 3547"S.find(sub [,start [,end]]) -> int\n\ 3548\n\ 3549Return the lowest index in S where substring sub is found,\n\ 3550such that sub is contained within s[start,end]. Optional\n\ 3551arguments start and end are interpreted as in slice notation.\n\ 3552\n\ 3553Return -1 on failure."; 3554 3555static PyObject * 3556unicode_find(PyUnicodeObject *self, PyObject *args) 3557{ 3558 PyUnicodeObject *substring; 3559 int start = 0; 3560 int end = INT_MAX; 3561 PyObject *result; 3562 3563 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 3564 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3565 return NULL; 3566 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3567 (PyObject *)substring); 3568 if (substring == NULL) 3569 return NULL; 3570 3571 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 3572 3573 Py_DECREF(substring); 3574 return result; 3575} 3576 3577static PyObject * 3578unicode_getitem(PyUnicodeObject *self, int index) 3579{ 3580 if (index < 0 || index >= self->length) { 3581 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3582 return NULL; 3583 } 3584 3585 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 3586} 3587 3588static long 3589unicode_hash(PyUnicodeObject *self) 3590{ 3591 /* Since Unicode objects compare equal to their ASCII string 3592 counterparts, they should use the individual character values 3593 as basis for their hash value. This is needed to assure that 3594 strings and Unicode objects behave in the same way as 3595 dictionary keys. */ 3596 3597 register int len; 3598 register Py_UNICODE *p; 3599 register long x; 3600 3601 if (self->hash != -1) 3602 return self->hash; 3603 len = PyUnicode_GET_SIZE(self); 3604 p = PyUnicode_AS_UNICODE(self); 3605 x = *p << 7; 3606 while (--len >= 0) 3607 x = (1000003*x) ^ *p++; 3608 x ^= PyUnicode_GET_SIZE(self); 3609 if (x == -1) 3610 x = -2; 3611 self->hash = x; 3612 return x; 3613} 3614 3615static char index__doc__[] = 3616"S.index(sub [,start [,end]]) -> int\n\ 3617\n\ 3618Like S.find() but raise ValueError when the substring is not found."; 3619 3620static PyObject * 3621unicode_index(PyUnicodeObject *self, PyObject *args) 3622{ 3623 int result; 3624 PyUnicodeObject *substring; 3625 int start = 0; 3626 int end = INT_MAX; 3627 3628 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 3629 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3630 return NULL; 3631 3632 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3633 (PyObject *)substring); 3634 if (substring == NULL) 3635 return NULL; 3636 3637 result = findstring(self, substring, start, end, 1); 3638 3639 Py_DECREF(substring); 3640 if (result < 0) { 3641 PyErr_SetString(PyExc_ValueError, "substring not found"); 3642 return NULL; 3643 } 3644 return PyInt_FromLong(result); 3645} 3646 3647static char islower__doc__[] = 3648"S.islower() -> int\n\ 3649\n\ 3650Return 1 if all cased characters in S are lowercase and there is\n\ 3651at least one cased character in S, 0 otherwise."; 3652 3653static PyObject* 3654unicode_islower(PyUnicodeObject *self, PyObject *args) 3655{ 3656 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3657 register const Py_UNICODE *e; 3658 int cased; 3659 3660 if (!PyArg_NoArgs(args)) 3661 return NULL; 3662 3663 /* Shortcut for single character strings */ 3664 if (PyUnicode_GET_SIZE(self) == 1) 3665 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 3666 3667 /* Special case for empty strings */ 3668 if (PyString_GET_SIZE(self) == 0) 3669 return PyInt_FromLong(0); 3670 3671 e = p + PyUnicode_GET_SIZE(self); 3672 cased = 0; 3673 for (; p < e; p++) { 3674 register const Py_UNICODE ch = *p; 3675 3676 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 3677 return PyInt_FromLong(0); 3678 else if (!cased && Py_UNICODE_ISLOWER(ch)) 3679 cased = 1; 3680 } 3681 return PyInt_FromLong(cased); 3682} 3683 3684static char isupper__doc__[] = 3685"S.isupper() -> int\n\ 3686\n\ 3687Return 1 if all cased characters in S are uppercase and there is\n\ 3688at least one cased character in S, 0 otherwise."; 3689 3690static PyObject* 3691unicode_isupper(PyUnicodeObject *self, PyObject *args) 3692{ 3693 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3694 register const Py_UNICODE *e; 3695 int cased; 3696 3697 if (!PyArg_NoArgs(args)) 3698 return NULL; 3699 3700 /* Shortcut for single character strings */ 3701 if (PyUnicode_GET_SIZE(self) == 1) 3702 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 3703 3704 /* Special case for empty strings */ 3705 if (PyString_GET_SIZE(self) == 0) 3706 return PyInt_FromLong(0); 3707 3708 e = p + PyUnicode_GET_SIZE(self); 3709 cased = 0; 3710 for (; p < e; p++) { 3711 register const Py_UNICODE ch = *p; 3712 3713 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 3714 return PyInt_FromLong(0); 3715 else if (!cased && Py_UNICODE_ISUPPER(ch)) 3716 cased = 1; 3717 } 3718 return PyInt_FromLong(cased); 3719} 3720 3721static char istitle__doc__[] = 3722"S.istitle() -> int\n\ 3723\n\ 3724Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 3725may only follow uncased characters and lowercase characters only cased\n\ 3726ones. Return 0 otherwise."; 3727 3728static PyObject* 3729unicode_istitle(PyUnicodeObject *self, PyObject *args) 3730{ 3731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3732 register const Py_UNICODE *e; 3733 int cased, previous_is_cased; 3734 3735 if (!PyArg_NoArgs(args)) 3736 return NULL; 3737 3738 /* Shortcut for single character strings */ 3739 if (PyUnicode_GET_SIZE(self) == 1) 3740 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 3741 (Py_UNICODE_ISUPPER(*p) != 0)); 3742 3743 /* Special case for empty strings */ 3744 if (PyString_GET_SIZE(self) == 0) 3745 return PyInt_FromLong(0); 3746 3747 e = p + PyUnicode_GET_SIZE(self); 3748 cased = 0; 3749 previous_is_cased = 0; 3750 for (; p < e; p++) { 3751 register const Py_UNICODE ch = *p; 3752 3753 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 3754 if (previous_is_cased) 3755 return PyInt_FromLong(0); 3756 previous_is_cased = 1; 3757 cased = 1; 3758 } 3759 else if (Py_UNICODE_ISLOWER(ch)) { 3760 if (!previous_is_cased) 3761 return PyInt_FromLong(0); 3762 previous_is_cased = 1; 3763 cased = 1; 3764 } 3765 else 3766 previous_is_cased = 0; 3767 } 3768 return PyInt_FromLong(cased); 3769} 3770 3771static char isspace__doc__[] = 3772"S.isspace() -> int\n\ 3773\n\ 3774Return 1 if there are only whitespace characters in S,\n\ 37750 otherwise."; 3776 3777static PyObject* 3778unicode_isspace(PyUnicodeObject *self, PyObject *args) 3779{ 3780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3781 register const Py_UNICODE *e; 3782 3783 if (!PyArg_NoArgs(args)) 3784 return NULL; 3785 3786 /* Shortcut for single character strings */ 3787 if (PyUnicode_GET_SIZE(self) == 1 && 3788 Py_UNICODE_ISSPACE(*p)) 3789 return PyInt_FromLong(1); 3790 3791 /* Special case for empty strings */ 3792 if (PyString_GET_SIZE(self) == 0) 3793 return PyInt_FromLong(0); 3794 3795 e = p + PyUnicode_GET_SIZE(self); 3796 for (; p < e; p++) { 3797 if (!Py_UNICODE_ISSPACE(*p)) 3798 return PyInt_FromLong(0); 3799 } 3800 return PyInt_FromLong(1); 3801} 3802 3803static char isalpha__doc__[] = 3804"S.isalpha() -> int\n\ 3805\n\ 3806Return 1 if all characters in S are alphabetic\n\ 3807and there is at least one character in S, 0 otherwise."; 3808 3809static PyObject* 3810unicode_isalpha(PyUnicodeObject *self, PyObject *args) 3811{ 3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3813 register const Py_UNICODE *e; 3814 3815 if (!PyArg_NoArgs(args)) 3816 return NULL; 3817 3818 /* Shortcut for single character strings */ 3819 if (PyUnicode_GET_SIZE(self) == 1 && 3820 Py_UNICODE_ISALPHA(*p)) 3821 return PyInt_FromLong(1); 3822 3823 /* Special case for empty strings */ 3824 if (PyString_GET_SIZE(self) == 0) 3825 return PyInt_FromLong(0); 3826 3827 e = p + PyUnicode_GET_SIZE(self); 3828 for (; p < e; p++) { 3829 if (!Py_UNICODE_ISALPHA(*p)) 3830 return PyInt_FromLong(0); 3831 } 3832 return PyInt_FromLong(1); 3833} 3834 3835static char isalnum__doc__[] = 3836"S.isalnum() -> int\n\ 3837\n\ 3838Return 1 if all characters in S are alphanumeric\n\ 3839and there is at least one character in S, 0 otherwise."; 3840 3841static PyObject* 3842unicode_isalnum(PyUnicodeObject *self, PyObject *args) 3843{ 3844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3845 register const Py_UNICODE *e; 3846 3847 if (!PyArg_NoArgs(args)) 3848 return NULL; 3849 3850 /* Shortcut for single character strings */ 3851 if (PyUnicode_GET_SIZE(self) == 1 && 3852 Py_UNICODE_ISALNUM(*p)) 3853 return PyInt_FromLong(1); 3854 3855 /* Special case for empty strings */ 3856 if (PyString_GET_SIZE(self) == 0) 3857 return PyInt_FromLong(0); 3858 3859 e = p + PyUnicode_GET_SIZE(self); 3860 for (; p < e; p++) { 3861 if (!Py_UNICODE_ISALNUM(*p)) 3862 return PyInt_FromLong(0); 3863 } 3864 return PyInt_FromLong(1); 3865} 3866 3867static char isdecimal__doc__[] = 3868"S.isdecimal() -> int\n\ 3869\n\ 3870Return 1 if there are only decimal characters in S,\n\ 38710 otherwise."; 3872 3873static PyObject* 3874unicode_isdecimal(PyUnicodeObject *self, PyObject *args) 3875{ 3876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3877 register const Py_UNICODE *e; 3878 3879 if (!PyArg_NoArgs(args)) 3880 return NULL; 3881 3882 /* Shortcut for single character strings */ 3883 if (PyUnicode_GET_SIZE(self) == 1 && 3884 Py_UNICODE_ISDECIMAL(*p)) 3885 return PyInt_FromLong(1); 3886 3887 /* Special case for empty strings */ 3888 if (PyString_GET_SIZE(self) == 0) 3889 return PyInt_FromLong(0); 3890 3891 e = p + PyUnicode_GET_SIZE(self); 3892 for (; p < e; p++) { 3893 if (!Py_UNICODE_ISDECIMAL(*p)) 3894 return PyInt_FromLong(0); 3895 } 3896 return PyInt_FromLong(1); 3897} 3898 3899static char isdigit__doc__[] = 3900"S.isdigit() -> int\n\ 3901\n\ 3902Return 1 if there are only digit characters in S,\n\ 39030 otherwise."; 3904 3905static PyObject* 3906unicode_isdigit(PyUnicodeObject *self, PyObject *args) 3907{ 3908 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3909 register const Py_UNICODE *e; 3910 3911 if (!PyArg_NoArgs(args)) 3912 return NULL; 3913 3914 /* Shortcut for single character strings */ 3915 if (PyUnicode_GET_SIZE(self) == 1 && 3916 Py_UNICODE_ISDIGIT(*p)) 3917 return PyInt_FromLong(1); 3918 3919 /* Special case for empty strings */ 3920 if (PyString_GET_SIZE(self) == 0) 3921 return PyInt_FromLong(0); 3922 3923 e = p + PyUnicode_GET_SIZE(self); 3924 for (; p < e; p++) { 3925 if (!Py_UNICODE_ISDIGIT(*p)) 3926 return PyInt_FromLong(0); 3927 } 3928 return PyInt_FromLong(1); 3929} 3930 3931static char isnumeric__doc__[] = 3932"S.isnumeric() -> int\n\ 3933\n\ 3934Return 1 if there are only numeric characters in S,\n\ 39350 otherwise."; 3936 3937static PyObject* 3938unicode_isnumeric(PyUnicodeObject *self, PyObject *args) 3939{ 3940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3941 register const Py_UNICODE *e; 3942 3943 if (!PyArg_NoArgs(args)) 3944 return NULL; 3945 3946 /* Shortcut for single character strings */ 3947 if (PyUnicode_GET_SIZE(self) == 1 && 3948 Py_UNICODE_ISNUMERIC(*p)) 3949 return PyInt_FromLong(1); 3950 3951 /* Special case for empty strings */ 3952 if (PyString_GET_SIZE(self) == 0) 3953 return PyInt_FromLong(0); 3954 3955 e = p + PyUnicode_GET_SIZE(self); 3956 for (; p < e; p++) { 3957 if (!Py_UNICODE_ISNUMERIC(*p)) 3958 return PyInt_FromLong(0); 3959 } 3960 return PyInt_FromLong(1); 3961} 3962 3963static char join__doc__[] = 3964"S.join(sequence) -> unicode\n\ 3965\n\ 3966Return a string which is the concatenation of the strings in the\n\ 3967sequence. The separator between elements is S."; 3968 3969static PyObject* 3970unicode_join(PyUnicodeObject *self, PyObject *args) 3971{ 3972 PyObject *data; 3973 if (!PyArg_ParseTuple(args, "O:join", &data)) 3974 return NULL; 3975 3976 return PyUnicode_Join((PyObject *)self, data); 3977} 3978 3979static int 3980unicode_length(PyUnicodeObject *self) 3981{ 3982 return self->length; 3983} 3984 3985static char ljust__doc__[] = 3986"S.ljust(width) -> unicode\n\ 3987\n\ 3988Return S left justified in a Unicode string of length width. Padding is\n\ 3989done using spaces."; 3990 3991static PyObject * 3992unicode_ljust(PyUnicodeObject *self, PyObject *args) 3993{ 3994 int width; 3995 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 3996 return NULL; 3997 3998 if (self->length >= width) { 3999 Py_INCREF(self); 4000 return (PyObject*) self; 4001 } 4002 4003 return (PyObject*) pad(self, 0, width - self->length, ' '); 4004} 4005 4006static char lower__doc__[] = 4007"S.lower() -> unicode\n\ 4008\n\ 4009Return a copy of the string S converted to lowercase."; 4010 4011static PyObject* 4012unicode_lower(PyUnicodeObject *self, PyObject *args) 4013{ 4014 if (!PyArg_NoArgs(args)) 4015 return NULL; 4016 return fixup(self, fixlower); 4017} 4018 4019static char lstrip__doc__[] = 4020"S.lstrip() -> unicode\n\ 4021\n\ 4022Return a copy of the string S with leading whitespace removed."; 4023 4024static PyObject * 4025unicode_lstrip(PyUnicodeObject *self, PyObject *args) 4026{ 4027 if (!PyArg_NoArgs(args)) 4028 return NULL; 4029 return strip(self, 1, 0); 4030} 4031 4032static PyObject* 4033unicode_repeat(PyUnicodeObject *str, int len) 4034{ 4035 PyUnicodeObject *u; 4036 Py_UNICODE *p; 4037 int nchars; 4038 size_t nbytes; 4039 4040 if (len < 0) 4041 len = 0; 4042 4043 if (len == 1) { 4044 /* no repeat, return original string */ 4045 Py_INCREF(str); 4046 return (PyObject*) str; 4047 } 4048 4049 /* ensure # of chars needed doesn't overflow int and # of bytes 4050 * needed doesn't overflow size_t 4051 */ 4052 nchars = len * str->length; 4053 if (len && nchars / len != str->length) { 4054 PyErr_SetString(PyExc_OverflowError, 4055 "repeated string is too long"); 4056 return NULL; 4057 } 4058 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4059 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4060 PyErr_SetString(PyExc_OverflowError, 4061 "repeated string is too long"); 4062 return NULL; 4063 } 4064 u = _PyUnicode_New(nchars); 4065 if (!u) 4066 return NULL; 4067 4068 p = u->str; 4069 4070 while (len-- > 0) { 4071 Py_UNICODE_COPY(p, str->str, str->length); 4072 p += str->length; 4073 } 4074 4075 return (PyObject*) u; 4076} 4077 4078PyObject *PyUnicode_Replace(PyObject *obj, 4079 PyObject *subobj, 4080 PyObject *replobj, 4081 int maxcount) 4082{ 4083 PyObject *self; 4084 PyObject *str1; 4085 PyObject *str2; 4086 PyObject *result; 4087 4088 self = PyUnicode_FromObject(obj); 4089 if (self == NULL) 4090 return NULL; 4091 str1 = PyUnicode_FromObject(subobj); 4092 if (str1 == NULL) { 4093 Py_DECREF(self); 4094 return NULL; 4095 } 4096 str2 = PyUnicode_FromObject(replobj); 4097 if (str2 == NULL) { 4098 Py_DECREF(self); 4099 Py_DECREF(str1); 4100 return NULL; 4101 } 4102 result = replace((PyUnicodeObject *)self, 4103 (PyUnicodeObject *)str1, 4104 (PyUnicodeObject *)str2, 4105 maxcount); 4106 Py_DECREF(self); 4107 Py_DECREF(str1); 4108 Py_DECREF(str2); 4109 return result; 4110} 4111 4112static char replace__doc__[] = 4113"S.replace (old, new[, maxsplit]) -> unicode\n\ 4114\n\ 4115Return a copy of S with all occurrences of substring\n\ 4116old replaced by new. If the optional argument maxsplit is\n\ 4117given, only the first maxsplit occurrences are replaced."; 4118 4119static PyObject* 4120unicode_replace(PyUnicodeObject *self, PyObject *args) 4121{ 4122 PyUnicodeObject *str1; 4123 PyUnicodeObject *str2; 4124 int maxcount = -1; 4125 PyObject *result; 4126 4127 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4128 return NULL; 4129 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4130 if (str1 == NULL) 4131 return NULL; 4132 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4133 if (str2 == NULL) 4134 return NULL; 4135 4136 result = replace(self, str1, str2, maxcount); 4137 4138 Py_DECREF(str1); 4139 Py_DECREF(str2); 4140 return result; 4141} 4142 4143static 4144PyObject *unicode_repr(PyObject *unicode) 4145{ 4146 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4147 PyUnicode_GET_SIZE(unicode), 4148 1); 4149} 4150 4151static char rfind__doc__[] = 4152"S.rfind(sub [,start [,end]]) -> int\n\ 4153\n\ 4154Return the highest index in S where substring sub is found,\n\ 4155such that sub is contained within s[start,end]. Optional\n\ 4156arguments start and end are interpreted as in slice notation.\n\ 4157\n\ 4158Return -1 on failure."; 4159 4160static PyObject * 4161unicode_rfind(PyUnicodeObject *self, PyObject *args) 4162{ 4163 PyUnicodeObject *substring; 4164 int start = 0; 4165 int end = INT_MAX; 4166 PyObject *result; 4167 4168 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4169 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4170 return NULL; 4171 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4172 (PyObject *)substring); 4173 if (substring == NULL) 4174 return NULL; 4175 4176 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4177 4178 Py_DECREF(substring); 4179 return result; 4180} 4181 4182static char rindex__doc__[] = 4183"S.rindex(sub [,start [,end]]) -> int\n\ 4184\n\ 4185Like S.rfind() but raise ValueError when the substring is not found."; 4186 4187static PyObject * 4188unicode_rindex(PyUnicodeObject *self, PyObject *args) 4189{ 4190 int result; 4191 PyUnicodeObject *substring; 4192 int start = 0; 4193 int end = INT_MAX; 4194 4195 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4196 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4197 return NULL; 4198 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4199 (PyObject *)substring); 4200 if (substring == NULL) 4201 return NULL; 4202 4203 result = findstring(self, substring, start, end, -1); 4204 4205 Py_DECREF(substring); 4206 if (result < 0) { 4207 PyErr_SetString(PyExc_ValueError, "substring not found"); 4208 return NULL; 4209 } 4210 return PyInt_FromLong(result); 4211} 4212 4213static char rjust__doc__[] = 4214"S.rjust(width) -> unicode\n\ 4215\n\ 4216Return S right justified in a Unicode string of length width. Padding is\n\ 4217done using spaces."; 4218 4219static PyObject * 4220unicode_rjust(PyUnicodeObject *self, PyObject *args) 4221{ 4222 int width; 4223 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4224 return NULL; 4225 4226 if (self->length >= width) { 4227 Py_INCREF(self); 4228 return (PyObject*) self; 4229 } 4230 4231 return (PyObject*) pad(self, width - self->length, 0, ' '); 4232} 4233 4234static char rstrip__doc__[] = 4235"S.rstrip() -> unicode\n\ 4236\n\ 4237Return a copy of the string S with trailing whitespace removed."; 4238 4239static PyObject * 4240unicode_rstrip(PyUnicodeObject *self, PyObject *args) 4241{ 4242 if (!PyArg_NoArgs(args)) 4243 return NULL; 4244 return strip(self, 0, 1); 4245} 4246 4247static PyObject* 4248unicode_slice(PyUnicodeObject *self, int start, int end) 4249{ 4250 /* standard clamping */ 4251 if (start < 0) 4252 start = 0; 4253 if (end < 0) 4254 end = 0; 4255 if (end > self->length) 4256 end = self->length; 4257 if (start == 0 && end == self->length) { 4258 /* full slice, return original string */ 4259 Py_INCREF(self); 4260 return (PyObject*) self; 4261 } 4262 if (start > end) 4263 start = end; 4264 /* copy slice */ 4265 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4266 end - start); 4267} 4268 4269PyObject *PyUnicode_Split(PyObject *s, 4270 PyObject *sep, 4271 int maxsplit) 4272{ 4273 PyObject *result; 4274 4275 s = PyUnicode_FromObject(s); 4276 if (s == NULL) 4277 return NULL; 4278 if (sep != NULL) { 4279 sep = PyUnicode_FromObject(sep); 4280 if (sep == NULL) { 4281 Py_DECREF(s); 4282 return NULL; 4283 } 4284 } 4285 4286 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4287 4288 Py_DECREF(s); 4289 Py_XDECREF(sep); 4290 return result; 4291} 4292 4293static char split__doc__[] = 4294"S.split([sep [,maxsplit]]) -> list of strings\n\ 4295\n\ 4296Return a list of the words in S, using sep as the\n\ 4297delimiter string. If maxsplit is given, at most maxsplit\n\ 4298splits are done. If sep is not specified, any whitespace string\n\ 4299is a separator."; 4300 4301static PyObject* 4302unicode_split(PyUnicodeObject *self, PyObject *args) 4303{ 4304 PyObject *substring = Py_None; 4305 int maxcount = -1; 4306 4307 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4308 return NULL; 4309 4310 if (substring == Py_None) 4311 return split(self, NULL, maxcount); 4312 else if (PyUnicode_Check(substring)) 4313 return split(self, (PyUnicodeObject *)substring, maxcount); 4314 else 4315 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4316} 4317 4318static char splitlines__doc__[] = 4319"S.splitlines([keepends]]) -> list of strings\n\ 4320\n\ 4321Return a list of the lines in S, breaking at line boundaries.\n\ 4322Line breaks are not included in the resulting list unless keepends\n\ 4323is given and true."; 4324 4325static PyObject* 4326unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4327{ 4328 int keepends = 0; 4329 4330 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4331 return NULL; 4332 4333 return PyUnicode_Splitlines((PyObject *)self, keepends); 4334} 4335 4336static 4337PyObject *unicode_str(PyUnicodeObject *self) 4338{ 4339 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4340} 4341 4342static char strip__doc__[] = 4343"S.strip() -> unicode\n\ 4344\n\ 4345Return a copy of S with leading and trailing whitespace removed."; 4346 4347static PyObject * 4348unicode_strip(PyUnicodeObject *self, PyObject *args) 4349{ 4350 if (!PyArg_NoArgs(args)) 4351 return NULL; 4352 return strip(self, 1, 1); 4353} 4354 4355static char swapcase__doc__[] = 4356"S.swapcase() -> unicode\n\ 4357\n\ 4358Return a copy of S with uppercase characters converted to lowercase\n\ 4359and vice versa."; 4360 4361static PyObject* 4362unicode_swapcase(PyUnicodeObject *self, PyObject *args) 4363{ 4364 if (!PyArg_NoArgs(args)) 4365 return NULL; 4366 return fixup(self, fixswapcase); 4367} 4368 4369static char translate__doc__[] = 4370"S.translate(table) -> unicode\n\ 4371\n\ 4372Return a copy of the string S, where all characters have been mapped\n\ 4373through the given translation table, which must be a mapping of\n\ 4374Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4375are left untouched. Characters mapped to None are deleted."; 4376 4377static PyObject* 4378unicode_translate(PyUnicodeObject *self, PyObject *args) 4379{ 4380 PyObject *table; 4381 4382 if (!PyArg_ParseTuple(args, "O:translate", &table)) 4383 return NULL; 4384 return PyUnicode_TranslateCharmap(self->str, 4385 self->length, 4386 table, 4387 "ignore"); 4388} 4389 4390static char upper__doc__[] = 4391"S.upper() -> unicode\n\ 4392\n\ 4393Return a copy of S converted to uppercase."; 4394 4395static PyObject* 4396unicode_upper(PyUnicodeObject *self, PyObject *args) 4397{ 4398 if (!PyArg_NoArgs(args)) 4399 return NULL; 4400 return fixup(self, fixupper); 4401} 4402 4403#if 0 4404static char zfill__doc__[] = 4405"S.zfill(width) -> unicode\n\ 4406\n\ 4407Pad a numeric string x with zeros on the left, to fill a field\n\ 4408of the specified width. The string x is never truncated."; 4409 4410static PyObject * 4411unicode_zfill(PyUnicodeObject *self, PyObject *args) 4412{ 4413 int fill; 4414 PyUnicodeObject *u; 4415 4416 int width; 4417 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4418 return NULL; 4419 4420 if (self->length >= width) { 4421 Py_INCREF(self); 4422 return (PyObject*) self; 4423 } 4424 4425 fill = width - self->length; 4426 4427 u = pad(self, fill, 0, '0'); 4428 4429 if (u->str[fill] == '+' || u->str[fill] == '-') { 4430 /* move sign to beginning of string */ 4431 u->str[0] = u->str[fill]; 4432 u->str[fill] = '0'; 4433 } 4434 4435 return (PyObject*) u; 4436} 4437#endif 4438 4439#if 0 4440static PyObject* 4441unicode_freelistsize(PyUnicodeObject *self, PyObject *args) 4442{ 4443 if (!PyArg_NoArgs(args)) 4444 return NULL; 4445 return PyInt_FromLong(unicode_freelist_size); 4446} 4447#endif 4448 4449static char startswith__doc__[] = 4450"S.startswith(prefix[, start[, end]]) -> int\n\ 4451\n\ 4452Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4453optional start, test S beginning at that position. With optional end, stop\n\ 4454comparing S at that position."; 4455 4456static PyObject * 4457unicode_startswith(PyUnicodeObject *self, 4458 PyObject *args) 4459{ 4460 PyUnicodeObject *substring; 4461 int start = 0; 4462 int end = INT_MAX; 4463 PyObject *result; 4464 4465 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4466 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4467 return NULL; 4468 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4469 (PyObject *)substring); 4470 if (substring == NULL) 4471 return NULL; 4472 4473 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4474 4475 Py_DECREF(substring); 4476 return result; 4477} 4478 4479 4480static char endswith__doc__[] = 4481"S.endswith(suffix[, start[, end]]) -> int\n\ 4482\n\ 4483Return 1 if S ends with the specified suffix, otherwise return 0. With\n\ 4484optional start, test S beginning at that position. With optional end, stop\n\ 4485comparing S at that position."; 4486 4487static PyObject * 4488unicode_endswith(PyUnicodeObject *self, 4489 PyObject *args) 4490{ 4491 PyUnicodeObject *substring; 4492 int start = 0; 4493 int end = INT_MAX; 4494 PyObject *result; 4495 4496 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4497 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4498 return NULL; 4499 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4500 (PyObject *)substring); 4501 if (substring == NULL) 4502 return NULL; 4503 4504 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1)); 4505 4506 Py_DECREF(substring); 4507 return result; 4508} 4509 4510 4511static PyMethodDef unicode_methods[] = { 4512 4513 /* Order is according to common usage: often used methods should 4514 appear first, since lookup is done sequentially. */ 4515 4516 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__}, 4517 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__}, 4518 {"split", (PyCFunction) unicode_split, 1, split__doc__}, 4519 {"join", (PyCFunction) unicode_join, 1, join__doc__}, 4520 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__}, 4521 {"title", (PyCFunction) unicode_title, 0, title__doc__}, 4522 {"center", (PyCFunction) unicode_center, 1, center__doc__}, 4523 {"count", (PyCFunction) unicode_count, 1, count__doc__}, 4524 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__}, 4525 {"find", (PyCFunction) unicode_find, 1, find__doc__}, 4526 {"index", (PyCFunction) unicode_index, 1, index__doc__}, 4527 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__}, 4528 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__}, 4529 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__}, 4530/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */ 4531 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__}, 4532 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__}, 4533 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__}, 4534 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__}, 4535 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__}, 4536 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__}, 4537 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__}, 4538 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__}, 4539 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__}, 4540 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__}, 4541 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__}, 4542 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__}, 4543 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__}, 4544 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__}, 4545 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__}, 4546 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__}, 4547 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__}, 4548 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__}, 4549 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__}, 4550 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__}, 4551#if 0 4552 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__}, 4553 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__}, 4554#endif 4555 4556#if 0 4557 /* This one is just used for debugging the implementation. */ 4558 {"freelistsize", (PyCFunction) unicode_freelistsize, 0}, 4559#endif 4560 4561 {NULL, NULL} 4562}; 4563 4564static PyObject * 4565unicode_getattr(PyUnicodeObject *self, char *name) 4566{ 4567 return Py_FindMethod(unicode_methods, (PyObject*) self, name); 4568} 4569 4570static PySequenceMethods unicode_as_sequence = { 4571 (inquiry) unicode_length, /* sq_length */ 4572 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 4573 (intargfunc) unicode_repeat, /* sq_repeat */ 4574 (intargfunc) unicode_getitem, /* sq_item */ 4575 (intintargfunc) unicode_slice, /* sq_slice */ 4576 0, /* sq_ass_item */ 4577 0, /* sq_ass_slice */ 4578 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 4579}; 4580 4581static int 4582unicode_buffer_getreadbuf(PyUnicodeObject *self, 4583 int index, 4584 const void **ptr) 4585{ 4586 if (index != 0) { 4587 PyErr_SetString(PyExc_SystemError, 4588 "accessing non-existent unicode segment"); 4589 return -1; 4590 } 4591 *ptr = (void *) self->str; 4592 return PyUnicode_GET_DATA_SIZE(self); 4593} 4594 4595static int 4596unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 4597 const void **ptr) 4598{ 4599 PyErr_SetString(PyExc_TypeError, 4600 "cannot use unicode as modifyable buffer"); 4601 return -1; 4602} 4603 4604static int 4605unicode_buffer_getsegcount(PyUnicodeObject *self, 4606 int *lenp) 4607{ 4608 if (lenp) 4609 *lenp = PyUnicode_GET_DATA_SIZE(self); 4610 return 1; 4611} 4612 4613static int 4614unicode_buffer_getcharbuf(PyUnicodeObject *self, 4615 int index, 4616 const void **ptr) 4617{ 4618 PyObject *str; 4619 4620 if (index != 0) { 4621 PyErr_SetString(PyExc_SystemError, 4622 "accessing non-existent unicode segment"); 4623 return -1; 4624 } 4625 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 4626 if (str == NULL) 4627 return -1; 4628 *ptr = (void *) PyString_AS_STRING(str); 4629 return PyString_GET_SIZE(str); 4630} 4631 4632/* Helpers for PyUnicode_Format() */ 4633 4634static PyObject * 4635getnextarg(PyObject *args, int arglen, int *p_argidx) 4636{ 4637 int argidx = *p_argidx; 4638 if (argidx < arglen) { 4639 (*p_argidx)++; 4640 if (arglen < 0) 4641 return args; 4642 else 4643 return PyTuple_GetItem(args, argidx); 4644 } 4645 PyErr_SetString(PyExc_TypeError, 4646 "not enough arguments for format string"); 4647 return NULL; 4648} 4649 4650#define F_LJUST (1<<0) 4651#define F_SIGN (1<<1) 4652#define F_BLANK (1<<2) 4653#define F_ALT (1<<3) 4654#define F_ZERO (1<<4) 4655 4656static 4657int usprintf(register Py_UNICODE *buffer, char *format, ...) 4658{ 4659 register int i; 4660 int len; 4661 va_list va; 4662 char *charbuffer; 4663 va_start(va, format); 4664 4665 /* First, format the string as char array, then expand to Py_UNICODE 4666 array. */ 4667 charbuffer = (char *)buffer; 4668 len = vsprintf(charbuffer, format, va); 4669 for (i = len - 1; i >= 0; i--) 4670 buffer[i] = (Py_UNICODE) charbuffer[i]; 4671 4672 va_end(va); 4673 return len; 4674} 4675 4676static int 4677formatfloat(Py_UNICODE *buf, 4678 size_t buflen, 4679 int flags, 4680 int prec, 4681 int type, 4682 PyObject *v) 4683{ 4684 /* fmt = '%#.' + `prec` + `type` 4685 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 4686 char fmt[20]; 4687 double x; 4688 4689 x = PyFloat_AsDouble(v); 4690 if (x == -1.0 && PyErr_Occurred()) 4691 return -1; 4692 if (prec < 0) 4693 prec = 6; 4694 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 4695 type = 'g'; 4696 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type); 4697 /* worst case length calc to ensure no buffer overrun: 4698 fmt = %#.<prec>g 4699 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 4700 for any double rep.) 4701 len = 1 + prec + 1 + 2 + 5 = 9 + prec 4702 If prec=0 the effective precision is 1 (the leading digit is 4703 always given), therefore increase by one to 10+prec. */ 4704 if (buflen <= (size_t)10 + (size_t)prec) { 4705 PyErr_SetString(PyExc_OverflowError, 4706 "formatted float is too long (precision too long?)"); 4707 return -1; 4708 } 4709 return usprintf(buf, fmt, x); 4710} 4711 4712static PyObject* 4713formatlong(PyObject *val, int flags, int prec, int type) 4714{ 4715 char *buf; 4716 int i, len; 4717 PyObject *str; /* temporary string object. */ 4718 PyUnicodeObject *result; 4719 4720 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 4721 if (!str) 4722 return NULL; 4723 result = _PyUnicode_New(len); 4724 for (i = 0; i < len; i++) 4725 result->str[i] = buf[i]; 4726 result->str[len] = 0; 4727 Py_DECREF(str); 4728 return (PyObject*)result; 4729} 4730 4731static int 4732formatint(Py_UNICODE *buf, 4733 size_t buflen, 4734 int flags, 4735 int prec, 4736 int type, 4737 PyObject *v) 4738{ 4739 /* fmt = '%#.' + `prec` + 'l' + `type` 4740 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 4741 + 1 + 1 = 24*/ 4742 char fmt[64]; /* plenty big enough! */ 4743 long x; 4744 int use_native_c_format = 1; 4745 4746 x = PyInt_AsLong(v); 4747 if (x == -1 && PyErr_Occurred()) 4748 return -1; 4749 if (prec < 0) 4750 prec = 1; 4751 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 4752 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */ 4753 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) { 4754 PyErr_SetString(PyExc_OverflowError, 4755 "formatted integer is too long (precision too long?)"); 4756 return -1; 4757 } 4758 /* When converting 0 under %#x or %#X, C leaves off the base marker, 4759 * but we want it (for consistency with other %#x conversions, and 4760 * for consistency with Python's hex() function). 4761 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks & 4762 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway. 4763 * So add it only if the platform doesn't already. 4764 */ 4765 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) { 4766 /* Only way to know what the platform does is to try it. */ 4767 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0); 4768 if (fmt[1] != (char)type) { 4769 /* Supply our own leading 0x/0X -- needed under std C */ 4770 use_native_c_format = 0; 4771 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type); 4772 } 4773 } 4774 if (use_native_c_format) 4775 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type); 4776 return usprintf(buf, fmt, x); 4777} 4778 4779static int 4780formatchar(Py_UNICODE *buf, 4781 size_t buflen, 4782 PyObject *v) 4783{ 4784 /* presume that the buffer is at least 2 characters long */ 4785 if (PyUnicode_Check(v)) { 4786 if (PyUnicode_GET_SIZE(v) != 1) 4787 goto onError; 4788 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 4789 } 4790 4791 else if (PyString_Check(v)) { 4792 if (PyString_GET_SIZE(v) != 1) 4793 goto onError; 4794 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 4795 } 4796 4797 else { 4798 /* Integer input truncated to a character */ 4799 long x; 4800 x = PyInt_AsLong(v); 4801 if (x == -1 && PyErr_Occurred()) 4802 goto onError; 4803 buf[0] = (char) x; 4804 } 4805 buf[1] = '\0'; 4806 return 1; 4807 4808 onError: 4809 PyErr_SetString(PyExc_TypeError, 4810 "%c requires int or char"); 4811 return -1; 4812} 4813 4814/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 4815 4816 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 4817 chars are formatted. XXX This is a magic number. Each formatting 4818 routine does bounds checking to ensure no overflow, but a better 4819 solution may be to malloc a buffer of appropriate size for each 4820 format. For now, the current solution is sufficient. 4821*/ 4822#define FORMATBUFLEN (size_t)120 4823 4824PyObject *PyUnicode_Format(PyObject *format, 4825 PyObject *args) 4826{ 4827 Py_UNICODE *fmt, *res; 4828 int fmtcnt, rescnt, reslen, arglen, argidx; 4829 int args_owned = 0; 4830 PyUnicodeObject *result = NULL; 4831 PyObject *dict = NULL; 4832 PyObject *uformat; 4833 4834 if (format == NULL || args == NULL) { 4835 PyErr_BadInternalCall(); 4836 return NULL; 4837 } 4838 uformat = PyUnicode_FromObject(format); 4839 if (uformat == NULL) 4840 return NULL; 4841 fmt = PyUnicode_AS_UNICODE(uformat); 4842 fmtcnt = PyUnicode_GET_SIZE(uformat); 4843 4844 reslen = rescnt = fmtcnt + 100; 4845 result = _PyUnicode_New(reslen); 4846 if (result == NULL) 4847 goto onError; 4848 res = PyUnicode_AS_UNICODE(result); 4849 4850 if (PyTuple_Check(args)) { 4851 arglen = PyTuple_Size(args); 4852 argidx = 0; 4853 } 4854 else { 4855 arglen = -1; 4856 argidx = -2; 4857 } 4858 if (args->ob_type->tp_as_mapping) 4859 dict = args; 4860 4861 while (--fmtcnt >= 0) { 4862 if (*fmt != '%') { 4863 if (--rescnt < 0) { 4864 rescnt = fmtcnt + 100; 4865 reslen += rescnt; 4866 if (_PyUnicode_Resize(&result, reslen) < 0) 4867 return NULL; 4868 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 4869 --rescnt; 4870 } 4871 *res++ = *fmt++; 4872 } 4873 else { 4874 /* Got a format specifier */ 4875 int flags = 0; 4876 int width = -1; 4877 int prec = -1; 4878 Py_UNICODE c = '\0'; 4879 Py_UNICODE fill; 4880 PyObject *v = NULL; 4881 PyObject *temp = NULL; 4882 Py_UNICODE *pbuf; 4883 Py_UNICODE sign; 4884 int len; 4885 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 4886 4887 fmt++; 4888 if (*fmt == '(') { 4889 Py_UNICODE *keystart; 4890 int keylen; 4891 PyObject *key; 4892 int pcount = 1; 4893 4894 if (dict == NULL) { 4895 PyErr_SetString(PyExc_TypeError, 4896 "format requires a mapping"); 4897 goto onError; 4898 } 4899 ++fmt; 4900 --fmtcnt; 4901 keystart = fmt; 4902 /* Skip over balanced parentheses */ 4903 while (pcount > 0 && --fmtcnt >= 0) { 4904 if (*fmt == ')') 4905 --pcount; 4906 else if (*fmt == '(') 4907 ++pcount; 4908 fmt++; 4909 } 4910 keylen = fmt - keystart - 1; 4911 if (fmtcnt < 0 || pcount > 0) { 4912 PyErr_SetString(PyExc_ValueError, 4913 "incomplete format key"); 4914 goto onError; 4915 } 4916 /* keys are converted to strings using UTF-8 and 4917 then looked up since Python uses strings to hold 4918 variables names etc. in its namespaces and we 4919 wouldn't want to break common idioms. */ 4920 key = PyUnicode_EncodeUTF8(keystart, 4921 keylen, 4922 NULL); 4923 if (key == NULL) 4924 goto onError; 4925 if (args_owned) { 4926 Py_DECREF(args); 4927 args_owned = 0; 4928 } 4929 args = PyObject_GetItem(dict, key); 4930 Py_DECREF(key); 4931 if (args == NULL) { 4932 goto onError; 4933 } 4934 args_owned = 1; 4935 arglen = -1; 4936 argidx = -2; 4937 } 4938 while (--fmtcnt >= 0) { 4939 switch (c = *fmt++) { 4940 case '-': flags |= F_LJUST; continue; 4941 case '+': flags |= F_SIGN; continue; 4942 case ' ': flags |= F_BLANK; continue; 4943 case '#': flags |= F_ALT; continue; 4944 case '0': flags |= F_ZERO; continue; 4945 } 4946 break; 4947 } 4948 if (c == '*') { 4949 v = getnextarg(args, arglen, &argidx); 4950 if (v == NULL) 4951 goto onError; 4952 if (!PyInt_Check(v)) { 4953 PyErr_SetString(PyExc_TypeError, 4954 "* wants int"); 4955 goto onError; 4956 } 4957 width = PyInt_AsLong(v); 4958 if (width < 0) { 4959 flags |= F_LJUST; 4960 width = -width; 4961 } 4962 if (--fmtcnt >= 0) 4963 c = *fmt++; 4964 } 4965 else if (c >= '0' && c <= '9') { 4966 width = c - '0'; 4967 while (--fmtcnt >= 0) { 4968 c = *fmt++; 4969 if (c < '0' || c > '9') 4970 break; 4971 if ((width*10) / 10 != width) { 4972 PyErr_SetString(PyExc_ValueError, 4973 "width too big"); 4974 goto onError; 4975 } 4976 width = width*10 + (c - '0'); 4977 } 4978 } 4979 if (c == '.') { 4980 prec = 0; 4981 if (--fmtcnt >= 0) 4982 c = *fmt++; 4983 if (c == '*') { 4984 v = getnextarg(args, arglen, &argidx); 4985 if (v == NULL) 4986 goto onError; 4987 if (!PyInt_Check(v)) { 4988 PyErr_SetString(PyExc_TypeError, 4989 "* wants int"); 4990 goto onError; 4991 } 4992 prec = PyInt_AsLong(v); 4993 if (prec < 0) 4994 prec = 0; 4995 if (--fmtcnt >= 0) 4996 c = *fmt++; 4997 } 4998 else if (c >= '0' && c <= '9') { 4999 prec = c - '0'; 5000 while (--fmtcnt >= 0) { 5001 c = Py_CHARMASK(*fmt++); 5002 if (c < '0' || c > '9') 5003 break; 5004 if ((prec*10) / 10 != prec) { 5005 PyErr_SetString(PyExc_ValueError, 5006 "prec too big"); 5007 goto onError; 5008 } 5009 prec = prec*10 + (c - '0'); 5010 } 5011 } 5012 } /* prec */ 5013 if (fmtcnt >= 0) { 5014 if (c == 'h' || c == 'l' || c == 'L') { 5015 if (--fmtcnt >= 0) 5016 c = *fmt++; 5017 } 5018 } 5019 if (fmtcnt < 0) { 5020 PyErr_SetString(PyExc_ValueError, 5021 "incomplete format"); 5022 goto onError; 5023 } 5024 if (c != '%') { 5025 v = getnextarg(args, arglen, &argidx); 5026 if (v == NULL) 5027 goto onError; 5028 } 5029 sign = 0; 5030 fill = ' '; 5031 switch (c) { 5032 5033 case '%': 5034 pbuf = formatbuf; 5035 /* presume that buffer length is at least 1 */ 5036 pbuf[0] = '%'; 5037 len = 1; 5038 break; 5039 5040 case 's': 5041 case 'r': 5042 if (PyUnicode_Check(v) && c == 's') { 5043 temp = v; 5044 Py_INCREF(temp); 5045 } 5046 else { 5047 PyObject *unicode; 5048 if (c == 's') 5049 temp = PyObject_Str(v); 5050 else 5051 temp = PyObject_Repr(v); 5052 if (temp == NULL) 5053 goto onError; 5054 if (!PyString_Check(temp)) { 5055 /* XXX Note: this should never happen, since 5056 PyObject_Repr() and PyObject_Str() assure 5057 this */ 5058 Py_DECREF(temp); 5059 PyErr_SetString(PyExc_TypeError, 5060 "%s argument has non-string str()"); 5061 goto onError; 5062 } 5063 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5064 PyString_GET_SIZE(temp), 5065 NULL, 5066 "strict"); 5067 Py_DECREF(temp); 5068 temp = unicode; 5069 if (temp == NULL) 5070 goto onError; 5071 } 5072 pbuf = PyUnicode_AS_UNICODE(temp); 5073 len = PyUnicode_GET_SIZE(temp); 5074 if (prec >= 0 && len > prec) 5075 len = prec; 5076 break; 5077 5078 case 'i': 5079 case 'd': 5080 case 'u': 5081 case 'o': 5082 case 'x': 5083 case 'X': 5084 if (c == 'i') 5085 c = 'd'; 5086 if (PyLong_Check(v)) { 5087 temp = formatlong(v, flags, prec, c); 5088 if (!temp) 5089 goto onError; 5090 pbuf = PyUnicode_AS_UNICODE(temp); 5091 len = PyUnicode_GET_SIZE(temp); 5092 /* unbounded ints can always produce 5093 a sign character! */ 5094 sign = 1; 5095 } 5096 else { 5097 pbuf = formatbuf; 5098 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5099 flags, prec, c, v); 5100 if (len < 0) 5101 goto onError; 5102 /* only d conversion is signed */ 5103 sign = c == 'd'; 5104 } 5105 if (flags & F_ZERO) 5106 fill = '0'; 5107 break; 5108 5109 case 'e': 5110 case 'E': 5111 case 'f': 5112 case 'g': 5113 case 'G': 5114 pbuf = formatbuf; 5115 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5116 flags, prec, c, v); 5117 if (len < 0) 5118 goto onError; 5119 sign = 1; 5120 if (flags & F_ZERO) 5121 fill = '0'; 5122 break; 5123 5124 case 'c': 5125 pbuf = formatbuf; 5126 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5127 if (len < 0) 5128 goto onError; 5129 break; 5130 5131 default: 5132 PyErr_Format(PyExc_ValueError, 5133 "unsupported format character '%c' (0x%x) " 5134 "at index %i", 5135 (31<=c && c<=126) ? c : '?', 5136 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5137 goto onError; 5138 } 5139 if (sign) { 5140 if (*pbuf == '-' || *pbuf == '+') { 5141 sign = *pbuf++; 5142 len--; 5143 } 5144 else if (flags & F_SIGN) 5145 sign = '+'; 5146 else if (flags & F_BLANK) 5147 sign = ' '; 5148 else 5149 sign = 0; 5150 } 5151 if (width < len) 5152 width = len; 5153 if (rescnt < width + (sign != 0)) { 5154 reslen -= rescnt; 5155 rescnt = width + fmtcnt + 100; 5156 reslen += rescnt; 5157 if (_PyUnicode_Resize(&result, reslen) < 0) 5158 return NULL; 5159 res = PyUnicode_AS_UNICODE(result) 5160 + reslen - rescnt; 5161 } 5162 if (sign) { 5163 if (fill != ' ') 5164 *res++ = sign; 5165 rescnt--; 5166 if (width > len) 5167 width--; 5168 } 5169 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5170 assert(pbuf[0] == '0'); 5171 assert(pbuf[1] == c); 5172 if (fill != ' ') { 5173 *res++ = *pbuf++; 5174 *res++ = *pbuf++; 5175 } 5176 rescnt -= 2; 5177 width -= 2; 5178 if (width < 0) 5179 width = 0; 5180 len -= 2; 5181 } 5182 if (width > len && !(flags & F_LJUST)) { 5183 do { 5184 --rescnt; 5185 *res++ = fill; 5186 } while (--width > len); 5187 } 5188 if (fill == ' ') { 5189 if (sign) 5190 *res++ = sign; 5191 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5192 assert(pbuf[0] == '0'); 5193 assert(pbuf[1] == c); 5194 *res++ = *pbuf++; 5195 *res++ = *pbuf++; 5196 } 5197 } 5198 Py_UNICODE_COPY(res, pbuf, len); 5199 res += len; 5200 rescnt -= len; 5201 while (--width >= len) { 5202 --rescnt; 5203 *res++ = ' '; 5204 } 5205 if (dict && (argidx < arglen) && c != '%') { 5206 PyErr_SetString(PyExc_TypeError, 5207 "not all arguments converted"); 5208 goto onError; 5209 } 5210 Py_XDECREF(temp); 5211 } /* '%' */ 5212 } /* until end */ 5213 if (argidx < arglen && !dict) { 5214 PyErr_SetString(PyExc_TypeError, 5215 "not all arguments converted"); 5216 goto onError; 5217 } 5218 5219 if (args_owned) { 5220 Py_DECREF(args); 5221 } 5222 Py_DECREF(uformat); 5223 if (_PyUnicode_Resize(&result, reslen - rescnt)) 5224 goto onError; 5225 return (PyObject *)result; 5226 5227 onError: 5228 Py_XDECREF(result); 5229 Py_DECREF(uformat); 5230 if (args_owned) { 5231 Py_DECREF(args); 5232 } 5233 return NULL; 5234} 5235 5236static PyBufferProcs unicode_as_buffer = { 5237 (getreadbufferproc) unicode_buffer_getreadbuf, 5238 (getwritebufferproc) unicode_buffer_getwritebuf, 5239 (getsegcountproc) unicode_buffer_getsegcount, 5240 (getcharbufferproc) unicode_buffer_getcharbuf, 5241}; 5242 5243PyTypeObject PyUnicode_Type = { 5244 PyObject_HEAD_INIT(&PyType_Type) 5245 0, /* ob_size */ 5246 "unicode", /* tp_name */ 5247 sizeof(PyUnicodeObject), /* tp_size */ 5248 0, /* tp_itemsize */ 5249 /* Slots */ 5250 (destructor)_PyUnicode_Free, /* tp_dealloc */ 5251 0, /* tp_print */ 5252 (getattrfunc)unicode_getattr, /* tp_getattr */ 5253 0, /* tp_setattr */ 5254 (cmpfunc) unicode_compare, /* tp_compare */ 5255 (reprfunc) unicode_repr, /* tp_repr */ 5256 0, /* tp_as_number */ 5257 &unicode_as_sequence, /* tp_as_sequence */ 5258 0, /* tp_as_mapping */ 5259 (hashfunc) unicode_hash, /* tp_hash*/ 5260 0, /* tp_call*/ 5261 (reprfunc) unicode_str, /* tp_str */ 5262 (getattrofunc) NULL, /* tp_getattro */ 5263 (setattrofunc) NULL, /* tp_setattro */ 5264 &unicode_as_buffer, /* tp_as_buffer */ 5265 Py_TPFLAGS_DEFAULT, /* tp_flags */ 5266}; 5267 5268/* Initialize the Unicode implementation */ 5269 5270void _PyUnicode_Init(void) 5271{ 5272 int i; 5273 5274 /* Doublecheck the configuration... */ 5275 if (sizeof(Py_UNICODE) != 2) 5276 Py_FatalError("Unicode configuration error: " 5277 "sizeof(Py_UNICODE) != 2 bytes"); 5278 5279 /* Init the implementation */ 5280 unicode_freelist = NULL; 5281 unicode_freelist_size = 0; 5282 unicode_empty = _PyUnicode_New(0); 5283 strcpy(unicode_default_encoding, "ascii"); 5284 for (i = 0; i < 256; i++) 5285 unicode_latin1[i] = NULL; 5286} 5287 5288/* Finalize the Unicode implementation */ 5289 5290void 5291_PyUnicode_Fini(void) 5292{ 5293 PyUnicodeObject *u; 5294 int i; 5295 5296 Py_XDECREF(unicode_empty); 5297 unicode_empty = NULL; 5298 5299 for (i = 0; i < 256; i++) { 5300 if (unicode_latin1[i]) { 5301 Py_DECREF(unicode_latin1[i]); 5302 unicode_latin1[i] = NULL; 5303 } 5304 } 5305 5306 for (u = unicode_freelist; u != NULL;) { 5307 PyUnicodeObject *v = u; 5308 u = *(PyUnicodeObject **)u; 5309 if (v->str) 5310 PyMem_DEL(v->str); 5311 Py_XDECREF(v->defenc); 5312 PyObject_DEL(v); 5313 } 5314 unicode_freelist = NULL; 5315 unicode_freelist_size = 0; 5316} 5317